From 79ea58f5ee737d6e372301b961159844b24a034e Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Wed, 7 Aug 2024 11:08:58 -0700
Subject: [PATCH 01/10] rfc: init rfcs branch

---
 CMakeLists.txt                                |  355 -
 CONTRIBUTING.md                               |  325 -
 README.md                                     |  643 +-
 SECURITY.md                                   |   65 -
 _clang-format                                 |  164 -
 cmake/CMakeLists.txt                          |   27 -
 cmake/FindCBLAS.cmake                         |   37 -
 cmake/FindCompiler.cmake                      |   72 -
 cmake/FindLAPACKE.cmake                       |   43 -
 cmake/FindNETLIB.cmake                        |   41 -
 cmake/FindSphinx.cmake                        |   31 -
 cmake/FindcuBLAS.cmake                        |   69 -
 cmake/FindcuRAND.cmake                        |  114 -
 cmake/FindcuSOLVER.cmake                      |   51 -
 cmake/WarningsUtils.cmake                     |   48 -
 cmake/mkl/MKLConfig.cmake                     | 1158 ---
 cmake/mkl/MKLConfigVersion.cmake              |   59 -
 cmake/oneMKLConfig.cmake                      |   30 -
 deps/googletest/CMakeLists.txt                |  325 -
 deps/googletest/CONTRIBUTORS                  |   37 -
 deps/googletest/LICENSE                       |   28 -
 deps/googletest/cmake/Config.cmake.in         |    9 -
 deps/googletest/cmake/gtest.pc.in             |   10 -
 deps/googletest/cmake/gtest_main.pc.in        |   11 -
 deps/googletest/cmake/internal_utils.cmake    |  360 -
 deps/googletest/cmake/libgtest.la.in          |   21 -
 .../include/gtest/gtest-death-test.h          |  343 -
 .../googletest/include/gtest/gtest-matchers.h |  748 --
 deps/googletest/include/gtest/gtest-message.h |  218 -
 .../include/gtest/gtest-param-test.h          |  503 --
 .../googletest/include/gtest/gtest-printers.h |  927 --
 deps/googletest/include/gtest/gtest-spi.h     |  238 -
 .../include/gtest/gtest-test-part.h           |  184 -
 .../include/gtest/gtest-typed-test.h          |  336 -
 deps/googletest/include/gtest/gtest.h         | 2453 -----
 .../include/gtest/gtest_pred_impl.h           |  359 -
 deps/googletest/include/gtest/gtest_prod.h    |   61 -
 .../include/gtest/internal/custom/README.md   |   56 -
 .../gtest/internal/custom/gtest-port.h        |   37 -
 .../gtest/internal/custom/gtest-printers.h    |   42 -
 .../include/gtest/internal/custom/gtest.h     |   37 -
 .../internal/gtest-death-test-internal.h      |  304 -
 .../include/gtest/internal/gtest-filepath.h   |  211 -
 .../include/gtest/internal/gtest-internal.h   | 1428 ---
 .../include/gtest/internal/gtest-param-util.h |  880 --
 .../include/gtest/internal/gtest-port-arch.h  |  107 -
 .../include/gtest/internal/gtest-port.h       | 2320 -----
 .../include/gtest/internal/gtest-string.h     |  170 -
 .../include/gtest/internal/gtest-type-util.h  | 3347 -------
 .../gtest/internal/gtest-type-util.h.pump     |  314 -
 deps/googletest/src/gtest-all.cc              |   48 -
 deps/googletest/src/gtest-death-test.cc       | 1643 ----
 deps/googletest/src/gtest-filepath.cc         |  379 -
 deps/googletest/src/gtest-internal-inl.h      | 1210 ---
 deps/googletest/src/gtest-matchers.cc         |   97 -
 deps/googletest/src/gtest-port.cc             | 1404 ---
 deps/googletest/src/gtest-printers.cc         |  441 -
 deps/googletest/src/gtest-test-part.cc        |  104 -
 deps/googletest/src/gtest-typed-test.cc       |  118 -
 deps/googletest/src/gtest.cc                  | 6124 -------------
 deps/googletest/src/gtest_main.cc             |   47 -
 docs/CMakeLists.txt                           |   51 -
 docs/README.md                                |   16 -
 docs/_static/favicons.png                     |  Bin 467 -> 0 bytes
 docs/_static/oneAPI-rgb-rev-100.png           |  Bin 7414 -> 0 bytes
 docs/_static/style.css                        |  141 -
 docs/_templates/layout.html                   |   18 -
 docs/building_and_running_tests.rst           |   51 -
 .../building_the_project_with_adaptivecpp.rst |  171 -
 docs/building_the_project_with_dpcpp.rst      |  475 -
 docs/conf.py.in                               |  198 -
 docs/create_new_backend.rst                   |  513 --
 docs/domains/blas/asum.rst                    |  158 -
 docs/domains/blas/axpby.rst                   |  180 -
 docs/domains/blas/axpy.rst                    |  184 -
 docs/domains/blas/axpy_batch.rst              |  350 -
 docs/domains/blas/blas-level-1-routines.rst   |   76 -
 docs/domains/blas/blas-level-2-routines.rst   |  105 -
 docs/domains/blas/blas-level-3-routines.rst   |   55 -
 docs/domains/blas/blas-like-extensions.rst    |   55 -
 docs/domains/blas/blas.rst                    |   17 -
 docs/domains/blas/copy.rst                    |  159 -
 docs/domains/blas/copy_batch.rst              |  328 -
 docs/domains/blas/dgmm_batch.rst              |  462 -
 docs/domains/blas/dot.rst                     |  182 -
 docs/domains/blas/dotc.rst                    |  170 -
 docs/domains/blas/dotu.rst                    |  170 -
 docs/domains/blas/gbmv.rst                    |  285 -
 docs/domains/blas/gemm.rst                    |  455 -
 docs/domains/blas/gemm_batch.rst              |  606 --
 docs/domains/blas/gemm_bias.rst               |  513 --
 docs/domains/blas/gemmt.rst                   |  418 -
 docs/domains/blas/gemv.rst                    |  261 -
 docs/domains/blas/gemv_batch.rst              |  472 -
 docs/domains/blas/ger.rst                     |  226 -
 docs/domains/blas/gerc.rst                    |  227 -
 docs/domains/blas/geru.rst                    |  227 -
 docs/domains/blas/hbmv.rst                    |  245 -
 docs/domains/blas/hemm.rst                    |  315 -
 docs/domains/blas/hemv.rst                    |  232 -
 docs/domains/blas/her.rst                     |  205 -
 docs/domains/blas/her2.rst                    |  231 -
 docs/domains/blas/her2k.rst                   |  397 -
 docs/domains/blas/herk.rst                    |  309 -
 docs/domains/blas/hpmv.rst                    |  228 -
 docs/domains/blas/hpr.rst                     |  201 -
 docs/domains/blas/hpr2.rst                    |  226 -
 docs/domains/blas/iamax.rst                   |  167 -
 docs/domains/blas/iamin.rst                   |  160 -
 docs/domains/blas/nrm2.rst                    |  158 -
 docs/domains/blas/rot.rst                     |  208 -
 docs/domains/blas/rotg.rst                    |  175 -
 docs/domains/blas/rotm.rst                    |  266 -
 docs/domains/blas/rotmg.rst                   |  257 -
 docs/domains/blas/sbmv.rst                    |  244 -
 docs/domains/blas/scal.rst                    |  162 -
 docs/domains/blas/sdsdot.rst                  |  172 -
 docs/domains/blas/spmv.rst                    |  220 -
 docs/domains/blas/spr.rst                     |  193 -
 docs/domains/blas/spr2.rst                    |  213 -
 docs/domains/blas/swap.rst                    |  184 -
 docs/domains/blas/symm.rst                    |  311 -
 docs/domains/blas/symv.rst                    |  226 -
 docs/domains/blas/syr.rst                     |  202 -
 docs/domains/blas/syr2.rst                    |  228 -
 docs/domains/blas/syr2k.rst                   |  397 -
 docs/domains/blas/syrk.rst                    |  296 -
 docs/domains/blas/syrk_batch.rst              |  484 -
 docs/domains/blas/tbmv.rst                    |  223 -
 docs/domains/blas/tbsv.rst                    |  225 -
 docs/domains/blas/tpmv.rst                    |  199 -
 docs/domains/blas/tpsv.rst                    |  207 -
 docs/domains/blas/trmm.rst                    |  288 -
 docs/domains/blas/trmv.rst                    |  210 -
 docs/domains/blas/trsm.rst                    |  286 -
 docs/domains/blas/trsm_batch.rst              |  497 --
 docs/domains/blas/trsv.rst                    |  215 -
 docs/domains/dense_linear_algebra.rst         |   19 -
 docs/domains/lapack/gebrd.rst                 |  230 -
 docs/domains/lapack/gebrd_scratchpad_size.rst |   61 -
 docs/domains/lapack/geqrf.rst                 |  157 -
 docs/domains/lapack/geqrf_batch.rst           |  239 -
 .../lapack/geqrf_batch_scratchpad_size.rst    |  111 -
 docs/domains/lapack/geqrf_scratchpad_size.rst |   64 -
 docs/domains/lapack/gerqf.rst                 |  148 -
 docs/domains/lapack/gerqf_scratchpad_size.rst |   68 -
 docs/domains/lapack/gesvd.rst                 |  344 -
 docs/domains/lapack/gesvd_scratchpad_size.rst |  111 -
 docs/domains/lapack/getrf.rst                 |  144 -
 docs/domains/lapack/getrf_batch.rst           |  226 -
 .../lapack/getrf_batch_scratchpad_size.rst    |  117 -
 docs/domains/lapack/getrf_scratchpad_size.rst |   67 -
 docs/domains/lapack/getri.rst                 |  138 -
 docs/domains/lapack/getri_batch.rst           |  229 -
 .../lapack/getri_batch_scratchpad_size.rst    |  111 -
 docs/domains/lapack/getri_scratchpad_size.rst |   66 -
 docs/domains/lapack/getrs.rst                 |  200 -
 docs/domains/lapack/getrs_batch.rst           |  286 -
 .../lapack/getrs_batch_scratchpad_size.rst    |  135 -
 docs/domains/lapack/getrs_scratchpad_size.rst |   85 -
 docs/domains/lapack/heevd.rst                 |  182 -
 docs/domains/lapack/heevd_scratchpad_size.rst |   81 -
 docs/domains/lapack/hegvd.rst                 |  249 -
 docs/domains/lapack/hegvd_scratchpad_size.rst |   95 -
 docs/domains/lapack/hetrd.rst                 |  206 -
 docs/domains/lapack/hetrd_scratchpad_size.rst |   74 -
 docs/domains/lapack/hetrf.rst                 |  164 -
 docs/domains/lapack/hetrf_scratchpad_size.rst |   74 -
 .../lapack/lapack-like-extensions.inc.rst     |   74 -
 .../lapack-linear-equation-routines.inc.rst   |  121 -
 ...singular-value-eigenvalue-routines.inc.rst |  105 -
 docs/domains/lapack/lapack.rst                |   43 -
 docs/domains/lapack/orgbr.rst                 |  226 -
 docs/domains/lapack/orgbr_scratchpad_size.rst |   90 -
 docs/domains/lapack/orgqr.rst                 |  183 -
 docs/domains/lapack/orgqr_batch.rst           |  262 -
 .../lapack/orgqr_batch_scratchpad_size.rst    |  121 -
 docs/domains/lapack/orgqr_scratchpad_size.rst |   70 -
 docs/domains/lapack/orgtr.rst                 |  148 -
 docs/domains/lapack/orgtr_scratchpad_size.rst |   67 -
 docs/domains/lapack/ormqr.rst                 |  207 -
 docs/domains/lapack/ormqr_scratchpad_size.rst |   87 -
 docs/domains/lapack/ormrq.rst                 |  208 -
 docs/domains/lapack/ormrq_scratchpad_size.rst |   81 -
 docs/domains/lapack/ormtr.rst                 |  230 -
 docs/domains/lapack/ormtr_scratchpad_size.rst |  105 -
 docs/domains/lapack/potrf.rst                 |  172 -
 docs/domains/lapack/potrf_batch.rst           |  239 -
 .../lapack/potrf_batch_scratchpad_size.rst    |  120 -
 docs/domains/lapack/potrf_scratchpad_size.rst |   77 -
 docs/domains/lapack/potri.rst                 |  144 -
 docs/domains/lapack/potri_scratchpad_size.rst |   71 -
 docs/domains/lapack/potrs.rst                 |  177 -
 docs/domains/lapack/potrs_batch.rst           |  276 -
 .../lapack/potrs_batch_scratchpad_size.rst    |  136 -
 docs/domains/lapack/potrs_scratchpad_size.rst |   77 -
 docs/domains/lapack/syevd.rst                 |  186 -
 docs/domains/lapack/syevd_scratchpad_size.rst |   81 -
 docs/domains/lapack/sygvd.rst                 |  249 -
 docs/domains/lapack/sygvd_scratchpad_size.rst |   92 -
 docs/domains/lapack/sytrd.rst                 |  205 -
 docs/domains/lapack/sytrd_scratchpad_size.rst |   72 -
 docs/domains/lapack/sytrf.rst                 |  166 -
 docs/domains/lapack/sytrf_scratchpad_size.rst |   77 -
 docs/domains/lapack/trtrs.rst                 |  197 -
 docs/domains/lapack/trtrs_scratchpad_size.rst |   94 -
 docs/domains/lapack/ungbr.rst                 |  231 -
 docs/domains/lapack/ungbr_scratchpad_size.rst |   90 -
 docs/domains/lapack/ungqr.rst                 |  181 -
 docs/domains/lapack/ungqr_batch.rst           |  274 -
 .../lapack/ungqr_batch_scratchpad_size.rst    |  123 -
 docs/domains/lapack/ungqr_scratchpad_size.rst |   70 -
 docs/domains/lapack/ungtr.rst                 |  153 -
 docs/domains/lapack/ungtr_scratchpad_size.rst |   67 -
 docs/domains/lapack/unmqr.rst                 |  207 -
 docs/domains/lapack/unmqr_scratchpad_size.rst |   87 -
 docs/domains/lapack/unmrq.rst                 |  207 -
 docs/domains/lapack/unmrq_scratchpad_size.rst |   79 -
 docs/domains/lapack/unmtr.rst                 |  250 -
 docs/domains/lapack/unmtr_scratchpad_size.rst |   96 -
 docs/domains/matrix-storage.rst               |  581 --
 docs/index.rst                                |   36 -
 docs/introduction.rst                         |    9 -
 docs/onemkl-datatypes.rst                     |  140 -
 docs/requirements.txt                         |   28 -
 docs/selecting_a_compiler.rst                 |   19 -
 docs/using_onemkl_with_cmake.rst              |  102 -
 examples/CMakeLists.txt                       |   24 -
 examples/README.md                            |  597 --
 examples/blas/CMakeLists.txt                  |   28 -
 .../compile_time_dispatching/CMakeLists.txt   |   20 -
 .../level3/CMakeLists.txt                     |   47 -
 .../level3/gemm_usm_mklcpu_cublas.cpp         |  294 -
 .../blas/run_time_dispatching/CMakeLists.txt  |   20 -
 .../level3/CMakeLists.txt                     |   87 -
 .../run_time_dispatching/level3/gemm_usm.cpp  |  247 -
 examples/dft/CMakeLists.txt                   |   25 -
 .../compile_time_dispatching/CMakeLists.txt   |   50 -
 .../complex_fwd_usm_mklcpu_cufft.cpp          |  180 -
 .../dft/run_time_dispatching/CMakeLists.txt   |   81 -
 .../dft/run_time_dispatching/real_fwd_usm.cpp |  142 -
 examples/include/example_helper.hpp           |  182 -
 examples/lapack/CMakeLists.txt                |   27 -
 .../compile_time_dispatching/CMakeLists.txt   |   49 -
 .../getrs_usm_mklcpu_cusolver.cpp             |  333 -
 .../run_time_dispatching/CMakeLists.txt       |   72 -
 .../lapack/run_time_dispatching/getrs_usm.cpp |  260 -
 examples/rng/CMakeLists.txt                   |   28 -
 .../compile_time_dispatching/CMakeLists.txt   |   50 -
 .../uniform_usm_mklcpu_curand.cpp             |  225 -
 examples/rng/device/CMakeLists.txt            |   74 -
 .../rng/device/include/rng_example_helper.hpp |   50 -
 examples/rng/device/uniform.cpp               |  213 -
 .../rng/run_time_dispatching/CMakeLists.txt   |   73 -
 .../rng/run_time_dispatching/uniform_usm.cpp  |  190 -
 examples/sparse_blas/CMakeLists.txt           |   25 -
 .../compile_time_dispatching/CMakeLists.txt   |   44 -
 .../sparse_blas_gemv_usm_mklcpu.cpp           |  256 -
 .../run_time_dispatching/CMakeLists.txt       |   68 -
 .../sparse_blas_gemv_usm.cpp                  |  264 -
 include/oneapi/mkl.hpp                        |   31 -
 include/oneapi/mkl/bfloat16.hpp               |  228 -
 include/oneapi/mkl/blas.hpp                   |   73 -
 include/oneapi/mkl/blas.hxx                   | 4406 ---------
 .../mkl/blas/detail/blas_ct_backends.hpp      |   85 -
 .../mkl/blas/detail/blas_ct_backends.hxx      | 2966 -------
 .../oneapi/mkl/blas/detail/blas_loader.hpp    |   57 -
 .../oneapi/mkl/blas/detail/blas_loader.hxx    | 2699 ------
 .../oneapi/mkl/blas/detail/cublas/blas_ct.hpp |   57 -
 .../oneapi/mkl/blas/detail/cublas/blas_ct.hxx | 4381 ---------
 .../blas/detail/cublas/onemkl_blas_cublas.hpp |   55 -
 .../blas/detail/cublas/onemkl_blas_cublas.hxx | 2370 -----
 .../oneapi/mkl/blas/detail/mklcpu/blas_ct.hpp |   58 -
 .../oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx | 4383 ---------
 .../blas/detail/mklcpu/onemkl_blas_mklcpu.hpp |   56 -
 .../oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp |   58 -
 .../oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx | 4383 ---------
 .../blas/detail/mklgpu/onemkl_blas_mklgpu.hpp |   54 -
 .../oneapi/mkl/blas/detail/netlib/blas_ct.hpp |   57 -
 .../oneapi/mkl/blas/detail/netlib/blas_ct.hxx | 4388 ---------
 .../blas/detail/netlib/onemkl_blas_netlib.hpp |   62 -
 .../mkl/blas/detail/onemkl_blas_backends.hxx  | 2946 ------
 .../mkl/blas/detail/portblas/blas_ct.hpp      |   57 -
 .../mkl/blas/detail/portblas/blas_ct.hxx      | 4296 ---------
 .../detail/portblas/onemkl_blas_portblas.hpp  |   61 -
 .../mkl/blas/detail/rocblas/blas_ct.hpp       |   60 -
 .../mkl/blas/detail/rocblas/blas_ct.hxx       | 4180 ---------
 .../detail/rocblas/onemkl_blas_rocblas.hpp    |   58 -
 .../detail/rocblas/onemkl_blas_rocblas.hxx    | 2160 -----
 .../oneapi/mkl/detail/backend_selector.hpp    |   47 -
 .../detail/backend_selector_predicates.hpp    |  150 -
 include/oneapi/mkl/detail/backends.hpp        |   61 -
 include/oneapi/mkl/detail/backends_table.hpp  |  203 -
 include/oneapi/mkl/detail/exceptions.hpp      |   59 -
 include/oneapi/mkl/detail/export.hpp          |   40 -
 include/oneapi/mkl/detail/get_device_id.hpp   |   74 -
 include/oneapi/mkl/dft.hpp                    |   39 -
 include/oneapi/mkl/dft/backward.hpp           |  164 -
 include/oneapi/mkl/dft/descriptor.hpp         |   38 -
 include/oneapi/mkl/dft/detail/commit_impl.hpp |  184 -
 .../mkl/dft/detail/cufft/onemkl_dft_cufft.hpp |   38 -
 .../oneapi/mkl/dft/detail/descriptor_impl.hpp |  124 -
 include/oneapi/mkl/dft/detail/dft_ct.hxx      |  138 -
 include/oneapi/mkl/dft/detail/dft_loader.hpp  |   52 -
 .../dft/detail/external_workspace_helper.hpp  |  194 -
 .../dft/detail/mklcpu/onemkl_dft_mklcpu.hpp   |   38 -
 .../dft/detail/mklgpu/onemkl_dft_mklgpu.hpp   |   38 -
 .../dft/detail/portfft/onemkl_dft_portfft.hpp |   39 -
 .../dft/detail/rocfft/onemkl_dft_rocfft.hpp   |   38 -
 include/oneapi/mkl/dft/detail/types_impl.hpp  |  234 -
 include/oneapi/mkl/dft/forward.hpp            |  161 -
 include/oneapi/mkl/dft/types.hpp              |   42 -
 include/oneapi/mkl/exceptions.hpp             |  130 -
 include/oneapi/mkl/lapack.hpp                 |   37 -
 .../mkl/lapack/detail/cusolver/lapack_ct.hpp  |   48 -
 .../mkl/lapack/detail/cusolver/lapack_ct.hxx  | 2627 ------
 .../cusolver/onemkl_lapack_cusolver.hpp       |   45 -
 .../cusolver/onemkl_lapack_cusolver.hxx       | 1830 ----
 .../mkl/lapack/detail/lapack_loader.hpp       | 2382 -----
 .../oneapi/mkl/lapack/detail/lapack_rt.hpp    | 2392 -----
 .../lapack/detail/mkl_common/lapack_ct.hxx    | 2694 ------
 .../mkl_common/onemkl_lapack_backends.hxx     | 2139 -----
 .../mkl/lapack/detail/mklcpu/lapack_ct.hpp    |   46 -
 .../detail/mklcpu/onemkl_lapack_mklcpu.hpp    |   45 -
 .../mkl/lapack/detail/mklgpu/lapack_ct.hpp    |   45 -
 .../detail/mklgpu/onemkl_lapack_mklgpu.hpp    |   45 -
 .../mkl/lapack/detail/rocsolver/lapack_ct.hpp |   50 -
 .../mkl/lapack/detail/rocsolver/lapack_ct.hxx | 2629 ------
 .../rocsolver/onemkl_lapack_rocsolver.hpp     |   49 -
 .../rocsolver/onemkl_lapack_rocsolver.hxx     | 1835 ----
 include/oneapi/mkl/lapack/exceptions.hpp      |   90 -
 include/oneapi/mkl/lapack/types.hpp           |   97 -
 include/oneapi/mkl/rng.hpp                    |   41 -
 .../rng/detail/curand/onemkl_rng_curand.hpp   |   94 -
 include/oneapi/mkl/rng/detail/engine_impl.hpp |  197 -
 .../rng/detail/mklcpu/onemkl_rng_mklcpu.hpp   |   55 -
 .../rng/detail/mklgpu/onemkl_rng_mklgpu.hpp   |   55 -
 include/oneapi/mkl/rng/detail/rng_loader.hpp  |   57 -
 .../rng/detail/rocrand/onemkl_rng_rocrand.hpp |   96 -
 include/oneapi/mkl/rng/device.hpp             |   28 -
 .../mkl/rng/device/detail/bernoulli_impl.hpp  |   89 -
 .../mkl/rng/device/detail/bits_impl.hpp       |   71 -
 .../rng/device/detail/distribution_base.hpp   |   73 -
 .../mkl/rng/device/detail/engine_base.hpp     |   43 -
 .../rng/device/detail/exponential_impl.hpp    |  112 -
 .../mkl/rng/device/detail/gaussian_impl.hpp   |  270 -
 .../mkl/rng/device/detail/lognormal_impl.hpp  |  105 -
 .../mkl/rng/device/detail/mcg31m1_impl.hpp    |  233 -
 .../mkl/rng/device/detail/mcg59_impl.hpp      |  275 -
 .../mkl/rng/device/detail/mrg32k3a_impl.hpp   |  384 -
 .../detail/mrg32k3a_skip_ahead_matrix.hpp     | 3668 --------
 .../rng/device/detail/philox4x32x10_impl.hpp  |  552 --
 .../mkl/rng/device/detail/poisson_impl.hpp    |  355 -
 .../rng/device/detail/uniform_bits_impl.hpp   |   51 -
 .../mkl/rng/device/detail/uniform_impl.hpp    |  131 -
 .../mkl/rng/device/detail/vm_wrappers.hpp     |   61 -
 .../oneapi/mkl/rng/device/distributions.hpp   |  480 -
 include/oneapi/mkl/rng/device/engines.hpp     |  170 -
 include/oneapi/mkl/rng/device/functions.hpp   |   52 -
 include/oneapi/mkl/rng/device/types.hpp       |   62 -
 include/oneapi/mkl/rng/distributions.hpp      |  373 -
 include/oneapi/mkl/rng/engines.hpp            |  243 -
 include/oneapi/mkl/rng/functions.hpp          |  120 -
 include/oneapi/mkl/rng/predicates.hpp         |   69 -
 include/oneapi/mkl/sparse_blas.hpp            |   40 -
 .../mkl/sparse_blas/detail/helper_types.hpp   |   52 -
 .../mklcpu/onemkl_sparse_blas_mklcpu.hpp      |   34 -
 .../detail/mklcpu/sparse_blas_ct.hpp          |   41 -
 .../mklgpu/onemkl_sparse_blas_mklgpu.hpp      |   34 -
 .../detail/mklgpu/sparse_blas_ct.hpp          |   41 -
 .../detail/onemkl_sparse_blas_backends.hxx    |   91 -
 .../mkl/sparse_blas/detail/sparse_blas_ct.hxx |  135 -
 .../mkl/sparse_blas/detail/sparse_blas_rt.hpp |  103 -
 include/oneapi/mkl/sparse_blas/types.hpp      |   44 -
 include/oneapi/mkl/types.hpp                  |  122 -
 legal_information.md                          |   36 -
 rfcs/template.md                              |   62 +
 scripts/blas_list.txt                         |  178 -
 scripts/func_parser.py                        |  190 -
 scripts/generate_backend_api.py               |  122 -
 scripts/generate_cmake.py                     |  129 -
 scripts/generate_ct_instant.py                |  134 -
 scripts/generate_ct_templates.py              |  122 -
 scripts/generate_wrappers.py                  |  172 -
 src/CMakeLists.txt                            |   94 -
 src/blas/CMakeLists.txt                       |   47 -
 src/blas/backends/CMakeLists.txt              |   45 -
 src/blas/backends/backend_wrappers.cxx        |  511 --
 src/blas/backends/cublas/CMakeLists.txt       |   72 -
 src/blas/backends/cublas/cublas_batch.cpp     | 1844 ----
 .../backends/cublas/cublas_extensions.cpp     |  756 --
 src/blas/backends/cublas/cublas_handle.hpp    |   61 -
 src/blas/backends/cublas/cublas_helper.hpp    |  308 -
 src/blas/backends/cublas/cublas_level1.cpp    | 1853 ----
 src/blas/backends/cublas/cublas_level2.cpp    | 2702 ------
 src/blas/backends/cublas/cublas_level3.cpp    | 1336 ---
 .../backends/cublas/cublas_scope_handle.cpp   |  143 -
 .../backends/cublas/cublas_scope_handle.hpp   |  113 -
 .../cublas/cublas_scope_handle_hipsycl.cpp    |   74 -
 .../cublas/cublas_scope_handle_hipsycl.hpp    |   84 -
 src/blas/backends/cublas/cublas_task.hpp      |   77 -
 src/blas/backends/cublas/cublas_wrappers.cpp  | 1006 ---
 src/blas/backends/mkl_common/mkl_batch.cxx    | 1072 ---
 .../backends/mkl_common/mkl_blas_backend.hpp  |   84 -
 .../backends/mkl_common/mkl_blas_backend.hxx  | 2494 ------
 .../backends/mkl_common/mkl_extensions.cxx    |  359 -
 src/blas/backends/mkl_common/mkl_level1.cxx   |  645 --
 src/blas/backends/mkl_common/mkl_level2.cxx   |  862 --
 src/blas/backends/mkl_common/mkl_level3.cxx   |  519 --
 src/blas/backends/mklcpu/CMakeLists.txt       |   75 -
 src/blas/backends/mklcpu/mklcpu_batch.cpp     |   50 -
 .../backends/mklcpu/mklcpu_extensions.cpp     |   50 -
 src/blas/backends/mklcpu/mklcpu_level1.cpp    |   49 -
 src/blas/backends/mklcpu/mklcpu_level2.cpp    |   49 -
 src/blas/backends/mklcpu/mklcpu_level3.cpp    |   49 -
 src/blas/backends/mklcpu/mklcpu_wrappers.cpp  |   35 -
 src/blas/backends/mklgpu/CMakeLists.txt       |   72 -
 src/blas/backends/mklgpu/mklgpu_batch.cpp     |   50 -
 .../backends/mklgpu/mklgpu_extensions.cpp     |   50 -
 src/blas/backends/mklgpu/mklgpu_level1.cpp    |   50 -
 src/blas/backends/mklgpu/mklgpu_level2.cpp    |   49 -
 src/blas/backends/mklgpu/mklgpu_level3.cpp    |   49 -
 src/blas/backends/mklgpu/mklgpu_wrappers.cpp  |   35 -
 src/blas/backends/netlib/CMakeLists.txt       |   76 -
 src/blas/backends/netlib/netlib_batch.cpp     |   51 -
 src/blas/backends/netlib/netlib_batch.cxx     | 1620 ----
 src/blas/backends/netlib/netlib_common.hpp    |  103 -
 .../backends/netlib/netlib_extensions.cpp     |   51 -
 .../backends/netlib/netlib_extensions.cxx     |  585 --
 src/blas/backends/netlib/netlib_level1.cpp    |  245 -
 src/blas/backends/netlib/netlib_level1.cxx    | 1525 ----
 src/blas/backends/netlib/netlib_level2.cpp    |   50 -
 src/blas/backends/netlib/netlib_level2.cxx    | 2138 -----
 src/blas/backends/netlib/netlib_level3.cpp    |   55 -
 src/blas/backends/netlib/netlib_level3.cxx    | 1148 ---
 src/blas/backends/netlib/netlib_wrappers.cpp  |   35 -
 src/blas/backends/portblas/CMakeLists.txt     |  222 -
 src/blas/backends/portblas/portblas_batch.cpp |   57 -
 src/blas/backends/portblas/portblas_batch.cxx | 1017 ---
 .../backends/portblas/portblas_common.hpp     |  239 -
 .../backends/portblas/portblas_gemm_bias.cxx  |   90 -
 .../backends/portblas/portblas_level1.cxx     |  410 -
 .../portblas/portblas_level1_double.cpp       |   62 -
 .../portblas/portblas_level1_float.cpp        |   60 -
 .../backends/portblas/portblas_level2.cxx     |  470 -
 .../portblas/portblas_level2_double.cpp       |   60 -
 .../portblas/portblas_level2_float.cpp        |   60 -
 .../backends/portblas/portblas_level3.cxx     |  451 -
 .../portblas/portblas_level3_bfloat16.cpp     |   78 -
 .../portblas/portblas_level3_double.cpp       |   60 -
 .../portblas/portblas_level3_float.cpp        |   62 -
 .../portblas/portblas_level3_half.cpp         |  103 -
 .../backends/portblas/portblas_wrappers.cpp   |   21 -
 src/blas/backends/rocblas/CMakeLists.txt      |   87 -
 src/blas/backends/rocblas/rocblas_batch.cpp   | 2433 -----
 .../backends/rocblas/rocblas_extensions.cpp   |  716 --
 src/blas/backends/rocblas/rocblas_handle.hpp  |   63 -
 src/blas/backends/rocblas/rocblas_helper.hpp  |  293 -
 src/blas/backends/rocblas/rocblas_level1.cpp  | 1782 ----
 src/blas/backends/rocblas/rocblas_level2.cpp  | 3575 --------
 src/blas/backends/rocblas/rocblas_level3.cpp  | 1482 ----
 .../backends/rocblas/rocblas_scope_handle.cpp |  158 -
 .../backends/rocblas/rocblas_scope_handle.hpp |   68 -
 .../rocblas/rocblas_scope_handle_hipsycl.cpp  |   94 -
 .../rocblas/rocblas_scope_handle_hipsycl.hpp  |   66 -
 src/blas/backends/rocblas/rocblas_task.hpp    |   73 -
 .../backends/rocblas/rocblas_wrappers.cpp     | 1008 ---
 src/blas/blas_loader.cpp                      | 7898 -----------------
 src/blas/function_table.hpp                   | 4974 -----------
 src/config.hpp.in                             |   44 -
 src/dft/CMakeLists.txt                        |   50 -
 src/dft/backends/CMakeLists.txt               |   41 -
 .../backend_backward_instantiations.cxx       |   58 -
 .../backends/backend_compute_signature.cxx    |  137 -
 .../backend_forward_instantiations.cxx        |   58 -
 src/dft/backends/backend_wrappers.cxx         |   46 -
 src/dft/backends/cufft/CMakeLists.txt         |   85 -
 src/dft/backends/cufft/backward.cpp           |  245 -
 src/dft/backends/cufft/commit.cpp             |  462 -
 src/dft/backends/cufft/descriptor.cpp         |   49 -
 src/dft/backends/cufft/execute_helper.hpp     |  148 -
 src/dft/backends/cufft/forward.cpp            |  247 -
 .../backends/cufft/mkl_dft_cufft_wrappers.cpp |   32 -
 src/dft/backends/descriptor.cpp               |   46 -
 src/dft/backends/mklcpu/CMakeLists.txt        |   93 -
 src/dft/backends/mklcpu/backward.cpp          |  330 -
 src/dft/backends/mklcpu/commit.cpp            |  212 -
 .../backends/mklcpu/commit_derived_impl.hpp   |   88 -
 src/dft/backends/mklcpu/descriptor.cpp        |   51 -
 src/dft/backends/mklcpu/forward.cpp           |  336 -
 .../backends/mklcpu/mkl_dft_cpu_wrappers.cpp  |   29 -
 src/dft/backends/mklcpu/mklcpu_helpers.hpp    |  178 -
 src/dft/backends/mklgpu/CMakeLists.txt        |   89 -
 src/dft/backends/mklgpu/backward.cpp          |  175 -
 src/dft/backends/mklgpu/commit.cpp            |  264 -
 src/dft/backends/mklgpu/descriptor.cpp        |   51 -
 src/dft/backends/mklgpu/forward.cpp           |  181 -
 .../backends/mklgpu/mkl_dft_gpu_wrappers.cpp  |   29 -
 src/dft/backends/mklgpu/mklgpu_helpers.hpp    |  177 -
 src/dft/backends/portfft/CMakeLists.txt       |  134 -
 src/dft/backends/portfft/commit.cpp           |  345 -
 src/dft/backends/portfft/descriptor.cpp       |   47 -
 .../portfft/mkl_dft_portfft_wrappers.cpp      |   32 -
 src/dft/backends/portfft/portfft_helper.hpp   |   62 -
 src/dft/backends/rocfft/CMakeLists.txt        |   95 -
 src/dft/backends/rocfft/backward.cpp          |  357 -
 src/dft/backends/rocfft/commit.cpp            |  640 --
 src/dft/backends/rocfft/descriptor.cpp        |   51 -
 src/dft/backends/rocfft/execute_helper.hpp    |   97 -
 src/dft/backends/rocfft/forward.cpp           |  358 -
 .../rocfft/mkl_dft_rocfft_wrappers.cpp        |   32 -
 src/dft/backends/rocfft/rocfft_handle.hpp     |   34 -
 src/dft/backends/stride_helper.hpp            |  151 -
 src/dft/descriptor.cxx                        |  297 -
 src/dft/descriptor_config_helper.hpp          |  262 -
 src/dft/dft_loader.cpp                        |   72 -
 src/dft/function_table.hpp                    |   62 -
 src/include/allocator_helper.hpp              |   50 -
 src/include/dtype_string.hpp                  |   56 -
 src/include/exceptions_helper.hpp             |   36 -
 src/include/function_table_initializer.hpp    |  128 -
 src/include/runtime_support_helper.hpp        |   53 -
 src/lapack/CMakeLists.txt                     |   43 -
 src/lapack/backends/CMakeLists.txt            |   37 -
 src/lapack/backends/cusolver/CMakeLists.txt   |   68 -
 .../backends/cusolver/cusolver_batch.cpp      | 1994 -----
 .../backends/cusolver/cusolver_handle.hpp     |   61 -
 .../backends/cusolver/cusolver_helper.hpp     |  326 -
 .../backends/cusolver/cusolver_lapack.cpp     | 3321 -------
 .../cusolver/cusolver_scope_handle.cpp        |  144 -
 .../cusolver/cusolver_scope_handle.hpp        |  118 -
 .../backends/cusolver/cusolver_task.hpp       |   61 -
 .../backends/cusolver/cusolver_wrappers.cpp   |  426 -
 .../backends/mkl_common/lapack_wrappers.cxx   |  331 -
 src/lapack/backends/mkl_common/mkl_lapack.cxx | 2793 ------
 .../mkl_common/mkl_lapack_backend.hpp         | 1263 ---
 src/lapack/backends/mklcpu/CMakeLists.txt     |   71 -
 .../backends/mklcpu/lapack_cpu_wrappers.cpp   |   30 -
 src/lapack/backends/mklcpu/mkl_lapack.cpp     |   41 -
 src/lapack/backends/mklgpu/CMakeLists.txt     |   71 -
 .../backends/mklgpu/lapack_gpu_wrappers.cpp   |   30 -
 src/lapack/backends/mklgpu/mkl_lapack.cpp     |   41 -
 src/lapack/backends/rocsolver/CMakeLists.txt  |   69 -
 .../backends/rocsolver/rocsolver_batch.cpp    | 1066 ---
 .../backends/rocsolver/rocsolver_handle.hpp   |   63 -
 .../backends/rocsolver/rocsolver_helper.hpp   |  277 -
 .../backends/rocsolver/rocsolver_lapack.cpp   | 2807 ------
 .../rocsolver/rocsolver_scope_handle.cpp      |  146 -
 .../rocsolver/rocsolver_scope_handle.hpp      |   69 -
 .../backends/rocsolver/rocsolver_task.hpp     |   63 -
 .../backends/rocsolver/rocsolver_wrappers.cpp |  428 -
 src/lapack/function_table.hpp                 | 1839 ----
 src/lapack/lapack_loader.cpp                  | 3004 -------
 src/rng/CMakeLists.txt                        |   47 -
 src/rng/backends/CMakeLists.txt               |   38 -
 src/rng/backends/curand/CMakeLists.txt        |  103 -
 src/rng/backends/curand/curand_helper.hpp     |  326 -
 src/rng/backends/curand/curand_task.hpp       |   74 -
 .../curand/mkl_rng_curand_wrappers.cpp        |   68 -
 src/rng/backends/curand/mrg32k3a.cpp          |  828 --
 src/rng/backends/curand/philox4x32x10.cpp     |  850 --
 src/rng/backends/mklcpu/CMakeLists.txt        |   71 -
 src/rng/backends/mklcpu/cpu_common.hpp        |   74 -
 .../backends/mklcpu/mkl_rng_cpu_wrappers.cpp  |   29 -
 src/rng/backends/mklcpu/mrg32k3a.cpp          |  585 --
 src/rng/backends/mklcpu/philox4x32x10.cpp     |  587 --
 src/rng/backends/mklgpu/CMakeLists.txt        |   73 -
 .../backends/mklgpu/mkl_internal_rng_gpu.hpp  |   81 -
 .../backends/mklgpu/mkl_rng_gpu_wrappers.cpp  |   29 -
 src/rng/backends/mklgpu/mrg32k3a.cpp          |  317 -
 src/rng/backends/mklgpu/philox4x32x10.cpp     |  318 -
 src/rng/backends/rocrand/CMakeLists.txt       |   94 -
 .../rocrand/mkl_rng_rocrand_wrappers.cpp      |   70 -
 src/rng/backends/rocrand/mrg32k3a.cpp         | 1026 ---
 src/rng/backends/rocrand/philox4x32x10.cpp    | 1048 ---
 src/rng/backends/rocrand/rocrand_helper.hpp   |  335 -
 src/rng/backends/rocrand/rocrand_task.hpp     |   80 -
 src/rng/function_table.hpp                    |   46 -
 src/rng/rng_loader.cpp                        |   54 -
 src/sparse_blas/CMakeLists.txt                |   48 -
 src/sparse_blas/backends/CMakeLists.txt       |   29 -
 src/sparse_blas/backends/backend_wrappers.cxx |   85 -
 .../backends/mkl_common/mkl_basic.cxx         |   62 -
 .../backends/mkl_common/mkl_helper.hpp        |   56 -
 .../backends/mkl_common/mkl_operations.cxx    |  170 -
 .../backends/mklcpu/CMakeLists.txt            |   82 -
 .../backends/mklcpu/mklcpu_basic.cpp          |   28 -
 .../backends/mklcpu/mklcpu_operations.cpp     |   28 -
 .../backends/mklcpu/mklcpu_wrappers.cpp       |   32 -
 .../backends/mklgpu/CMakeLists.txt            |   82 -
 .../backends/mklgpu/mklgpu_basic.cpp          |   28 -
 .../backends/mklgpu/mklgpu_operations.cpp     |   28 -
 .../backends/mklgpu/mklgpu_wrappers.cpp       |   32 -
 src/sparse_blas/function_table.hpp            |  109 -
 src/sparse_blas/macros.hpp                    |   39 -
 src/sparse_blas/sparse_blas_loader.cpp        |  162 -
 tests/CMakeLists.txt                          |   24 -
 tests/README.md                               |   11 -
 tests/unit_tests/CMakeLists.txt               |  228 -
 tests/unit_tests/blas/CMakeLists.txt          |   24 -
 tests/unit_tests/blas/batch/CMakeLists.txt    |   55 -
 .../blas/batch/axpy_batch_stride.cpp          |  215 -
 .../blas/batch/axpy_batch_stride_usm.cpp      |  221 -
 .../unit_tests/blas/batch/axpy_batch_usm.cpp  |  285 -
 .../blas/batch/copy_batch_stride.cpp          |  202 -
 .../blas/batch/copy_batch_stride_usm.cpp      |  214 -
 .../unit_tests/blas/batch/copy_batch_usm.cpp  |  279 -
 .../blas/batch/dgmm_batch_stride.cpp          |  252 -
 .../blas/batch/dgmm_batch_stride_usm.cpp      |  257 -
 .../unit_tests/blas/batch/dgmm_batch_usm.cpp  |  318 -
 .../blas/batch/gemm_batch_stride.cpp          |  289 -
 .../blas/batch/gemm_batch_stride_usm.cpp      |  324 -
 .../unit_tests/blas/batch/gemm_batch_usm.cpp  |  421 -
 .../blas/batch/gemv_batch_stride.cpp          |  233 -
 .../blas/batch/gemv_batch_stride_usm.cpp      |  238 -
 .../unit_tests/blas/batch/gemv_batch_usm.cpp  |  339 -
 .../blas/batch/imatcopy_batch_stride.cpp      |  204 -
 .../blas/batch/imatcopy_batch_stride_usm.cpp  |  227 -
 .../blas/batch/imatcopy_batch_usm.cpp         |  282 -
 .../blas/batch/omatadd_batch_stride.cpp       |  220 -
 .../blas/batch/omatadd_batch_stride_usm.cpp   |  253 -
 .../blas/batch/omatcopy_batch_stride.cpp      |  207 -
 .../blas/batch/omatcopy_batch_stride_usm.cpp  |  238 -
 .../blas/batch/omatcopy_batch_usm.cpp         |  288 -
 .../blas/batch/syrk_batch_stride.cpp          |  227 -
 .../blas/batch/syrk_batch_stride_usm.cpp      |  254 -
 .../unit_tests/blas/batch/syrk_batch_usm.cpp  |  334 -
 .../blas/batch/trsm_batch_stride.cpp          |  232 -
 .../blas/batch/trsm_batch_stride_usm.cpp      |  234 -
 .../unit_tests/blas/batch/trsm_batch_usm.cpp  |  352 -
 .../unit_tests/blas/extensions/CMakeLists.txt |   55 -
 .../unit_tests/blas/extensions/gemm_bias.cpp  |  385 -
 .../blas/extensions/gemm_bias_usm.cpp         |  391 -
 tests/unit_tests/blas/extensions/gemmt.cpp    |  387 -
 .../unit_tests/blas/extensions/gemmt_usm.cpp  |  387 -
 tests/unit_tests/blas/extensions/imatcopy.cpp |  193 -
 .../blas/extensions/imatcopy_usm.cpp          |  198 -
 tests/unit_tests/blas/extensions/omatadd.cpp  |  209 -
 .../blas/extensions/omatadd_usm.cpp           |  213 -
 tests/unit_tests/blas/extensions/omatcopy.cpp |  202 -
 .../unit_tests/blas/extensions/omatcopy2.cpp  |  201 -
 .../blas/extensions/omatcopy2_usm.cpp         |  210 -
 .../blas/extensions/omatcopy_usm.cpp          |  200 -
 .../blas/include/allocator_helper.hpp         |   78 -
 .../blas/include/onemkl_blas_helper.hpp       |   85 -
 .../blas/include/reference_blas_templates.hpp | 2183 -----
 .../blas/include/reference_blas_wrappers.hpp  | 2416 -----
 tests/unit_tests/blas/include/test_common.hpp |  711 --
 tests/unit_tests/blas/level1/CMakeLists.txt   |   55 -
 tests/unit_tests/blas/level1/asum.cpp         |  177 -
 tests/unit_tests/blas/level1/asum_usm.cpp     |  203 -
 tests/unit_tests/blas/level1/axpby.cpp        |  187 -
 tests/unit_tests/blas/level1/axpby_usm.cpp    |  190 -
 tests/unit_tests/blas/level1/axpy.cpp         |  183 -
 tests/unit_tests/blas/level1/axpy_usm.cpp     |  186 -
 tests/unit_tests/blas/level1/copy.cpp         |  171 -
 tests/unit_tests/blas/level1/copy_usm.cpp     |  175 -
 tests/unit_tests/blas/level1/dot.cpp          |  168 -
 tests/unit_tests/blas/level1/dot_usm.cpp      |  188 -
 tests/unit_tests/blas/level1/dotc.cpp         |  162 -
 tests/unit_tests/blas/level1/dotc_usm.cpp     |  167 -
 tests/unit_tests/blas/level1/dotu.cpp         |  162 -
 tests/unit_tests/blas/level1/dotu_usm.cpp     |  166 -
 tests/unit_tests/blas/level1/iamax.cpp        |  169 -
 tests/unit_tests/blas/level1/iamax_usm.cpp    |  192 -
 tests/unit_tests/blas/level1/iamin.cpp        |  169 -
 tests/unit_tests/blas/level1/iamin_usm.cpp    |  192 -
 tests/unit_tests/blas/level1/nrm2.cpp         |  175 -
 tests/unit_tests/blas/level1/nrm2_usm.cpp     |  199 -
 tests/unit_tests/blas/level1/rot.cpp          |  192 -
 tests/unit_tests/blas/level1/rot_usm.cpp      |  193 -
 tests/unit_tests/blas/level1/rotg.cpp         |  187 -
 tests/unit_tests/blas/level1/rotg_usm.cpp     |  208 -
 tests/unit_tests/blas/level1/rotm.cpp         |  211 -
 tests/unit_tests/blas/level1/rotm_usm.cpp     |  212 -
 tests/unit_tests/blas/level1/rotmg.cpp        |  208 -
 tests/unit_tests/blas/level1/rotmg_usm.cpp    |  237 -
 tests/unit_tests/blas/level1/scal.cpp         |  187 -
 tests/unit_tests/blas/level1/scal_usm.cpp     |  194 -
 tests/unit_tests/blas/level1/sdsdot.cpp       |  148 -
 tests/unit_tests/blas/level1/sdsdot_usm.cpp   |  153 -
 tests/unit_tests/blas/level1/swap.cpp         |  174 -
 tests/unit_tests/blas/level1/swap_usm.cpp     |  177 -
 tests/unit_tests/blas/level2/CMakeLists.txt   |   56 -
 tests/unit_tests/blas/level2/gbmv.cpp         |  265 -
 tests/unit_tests/blas/level2/gbmv_usm.cpp     |  267 -
 tests/unit_tests/blas/level2/gemv.cpp         |  256 -
 tests/unit_tests/blas/level2/gemv_usm.cpp     |  258 -
 tests/unit_tests/blas/level2/ger.cpp          |  168 -
 tests/unit_tests/blas/level2/ger_usm.cpp      |  173 -
 tests/unit_tests/blas/level2/gerc.cpp         |  168 -
 tests/unit_tests/blas/level2/gerc_usm.cpp     |  173 -
 tests/unit_tests/blas/level2/geru.cpp         |  168 -
 tests/unit_tests/blas/level2/geru_usm.cpp     |  173 -
 tests/unit_tests/blas/level2/hbmv.cpp         |  197 -
 tests/unit_tests/blas/level2/hbmv_usm.cpp     |  201 -
 tests/unit_tests/blas/level2/hemv.cpp         |  195 -
 tests/unit_tests/blas/level2/hemv_usm.cpp     |  200 -
 tests/unit_tests/blas/level2/her.cpp          |  188 -
 tests/unit_tests/blas/level2/her2.cpp         |  180 -
 tests/unit_tests/blas/level2/her2_usm.cpp     |  186 -
 tests/unit_tests/blas/level2/her_usm.cpp      |  193 -
 tests/unit_tests/blas/level2/hpmv.cpp         |  183 -
 tests/unit_tests/blas/level2/hpmv_usm.cpp     |  189 -
 tests/unit_tests/blas/level2/hpr.cpp          |  181 -
 tests/unit_tests/blas/level2/hpr2.cpp         |  179 -
 tests/unit_tests/blas/level2/hpr2_usm.cpp     |  185 -
 tests/unit_tests/blas/level2/hpr_usm.cpp      |  186 -
 tests/unit_tests/blas/level2/sbmv.cpp         |  183 -
 tests/unit_tests/blas/level2/sbmv_usm.cpp     |  188 -
 tests/unit_tests/blas/level2/spmv.cpp         |  181 -
 tests/unit_tests/blas/level2/spmv_usm.cpp     |  187 -
 tests/unit_tests/blas/level2/spr.cpp          |  175 -
 tests/unit_tests/blas/level2/spr2.cpp         |  179 -
 tests/unit_tests/blas/level2/spr2_usm.cpp     |  185 -
 tests/unit_tests/blas/level2/spr_usm.cpp      |  180 -
 tests/unit_tests/blas/level2/symv.cpp         |  182 -
 tests/unit_tests/blas/level2/symv_usm.cpp     |  187 -
 tests/unit_tests/blas/level2/syr.cpp          |  175 -
 tests/unit_tests/blas/level2/syr2.cpp         |  179 -
 tests/unit_tests/blas/level2/syr2_usm.cpp     |  185 -
 tests/unit_tests/blas/level2/syr_usm.cpp      |  180 -
 tests/unit_tests/blas/level2/tbmv.cpp         |  279 -
 tests/unit_tests/blas/level2/tbmv_usm.cpp     |  286 -
 tests/unit_tests/blas/level2/tbsv.cpp         |  279 -
 tests/unit_tests/blas/level2/tbsv_usm.cpp     |  286 -
 tests/unit_tests/blas/level2/tpmv.cpp         |  277 -
 tests/unit_tests/blas/level2/tpmv_usm.cpp     |  284 -
 tests/unit_tests/blas/level2/tpsv.cpp         |  277 -
 tests/unit_tests/blas/level2/tpsv_usm.cpp     |  284 -
 tests/unit_tests/blas/level2/trmv.cpp         |  277 -
 tests/unit_tests/blas/level2/trmv_usm.cpp     |  284 -
 tests/unit_tests/blas/level2/trsv.cpp         |  277 -
 tests/unit_tests/blas/level2/trsv_usm.cpp     |  284 -
 tests/unit_tests/blas/level3/CMakeLists.txt   |   57 -
 tests/unit_tests/blas/level3/gemm.cpp         |  313 -
 tests/unit_tests/blas/level3/gemm_usm.cpp     |  312 -
 tests/unit_tests/blas/level3/hemm.cpp         |  192 -
 tests/unit_tests/blas/level3/hemm_usm.cpp     |  192 -
 tests/unit_tests/blas/level3/her2k.cpp        |  193 -
 tests/unit_tests/blas/level3/her2k_usm.cpp    |  194 -
 tests/unit_tests/blas/level3/herk.cpp         |  184 -
 tests/unit_tests/blas/level3/herk_usm.cpp     |  189 -
 tests/unit_tests/blas/level3/symm.cpp         |  226 -
 tests/unit_tests/blas/level3/symm_usm.cpp     |  226 -
 tests/unit_tests/blas/level3/syr2k.cpp        |  222 -
 tests/unit_tests/blas/level3/syr2k_usm.cpp    |  223 -
 tests/unit_tests/blas/level3/syrk.cpp         |  217 -
 tests/unit_tests/blas/level3/syrk_usm.cpp     |  221 -
 tests/unit_tests/blas/level3/trmm.cpp         |  366 -
 tests/unit_tests/blas/level3/trmm_usm.cpp     |  368 -
 tests/unit_tests/blas/level3/trsm.cpp         |  494 --
 tests/unit_tests/blas/level3/trsm_usm.cpp     |  497 --
 tests/unit_tests/dft/CMakeLists.txt           |   20 -
 .../dft/include/compute_inplace.hpp           |  214 -
 .../dft/include/compute_inplace_real_real.hpp |  152 -
 .../dft/include/compute_out_of_place.hpp      |  171 -
 .../compute_out_of_place_real_real.hpp        |  160 -
 .../unit_tests/dft/include/compute_tester.hpp |  154 -
 .../unit_tests/dft/include/parseval_check.hpp |   81 -
 .../unit_tests/dft/include/reference_dft.hpp  |  130 -
 tests/unit_tests/dft/include/test_common.hpp  |  391 -
 tests/unit_tests/dft/source/CMakeLists.txt    |   57 -
 tests/unit_tests/dft/source/compute_tests.cpp |  207 -
 .../dft/source/descriptor_tests.cpp           |  782 --
 .../dft/source/workspace_external_tests.cpp   |  403 -
 tests/unit_tests/include/test_helper.hpp      |  379 -
 tests/unit_tests/lapack/CMakeLists.txt        |   21 -
 tests/unit_tests/lapack/common/CMakeLists.txt |   27 -
 .../lapack/common/dependency_check.cpp        |   72 -
 tests/unit_tests/lapack/common/test_log.cpp   |   43 -
 .../lapack/include/lapack_accuracy_checks.hpp |  649 --
 .../lapack/include/lapack_common.hpp          |  303 -
 .../lapack/include/lapack_gtest_suite.hpp     |  169 -
 .../include/lapack_reference_wrappers.hpp     |  906 --
 .../lapack/include/lapack_test_controller.hpp |  251 -
 tests/unit_tests/lapack/source/CMakeLists.txt |   95 -
 tests/unit_tests/lapack/source/gebrd.cpp      |  190 -
 tests/unit_tests/lapack/source/geqrf.cpp      |  163 -
 .../lapack/source/geqrf_batch_group.cpp       |  298 -
 .../lapack/source/geqrf_batch_stride.cpp      |  180 -
 tests/unit_tests/lapack/source/gerqf.cpp      |  163 -
 tests/unit_tests/lapack/source/gesvd.cpp      |  241 -
 tests/unit_tests/lapack/source/getrf.cpp      |  166 -
 .../lapack/source/getrf_batch_group.cpp       |  307 -
 .../lapack/source/getrf_batch_stride.cpp      |  180 -
 tests/unit_tests/lapack/source/getri.cpp      |  177 -
 .../lapack/source/getri_batch_group.cpp       |  322 -
 .../lapack/source/getri_batch_stride.cpp      |  195 -
 tests/unit_tests/lapack/source/getrs.cpp      |  190 -
 .../lapack/source/getrs_batch_group.cpp       |  392 -
 .../lapack/source/getrs_batch_stride.cpp      |  214 -
 tests/unit_tests/lapack/source/heevd.cpp      |  162 -
 tests/unit_tests/lapack/source/hegvd.cpp      |  300 -
 tests/unit_tests/lapack/source/hetrd.cpp      |  225 -
 tests/unit_tests/lapack/source/hetrf.cpp      |  281 -
 tests/unit_tests/lapack/source/orgbr.cpp      |  200 -
 tests/unit_tests/lapack/source/orgqr.cpp      |  174 -
 .../lapack/source/orgqr_batch_group.cpp       |  316 -
 .../lapack/source/orgqr_batch_stride.cpp      |  193 -
 tests/unit_tests/lapack/source/orgtr.cpp      |  176 -
 tests/unit_tests/lapack/source/ormqr.cpp      |  211 -
 tests/unit_tests/lapack/source/ormrq.cpp      |  220 -
 tests/unit_tests/lapack/source/ormtr.cpp      |  210 -
 tests/unit_tests/lapack/source/potrf.cpp      |  157 -
 .../lapack/source/potrf_batch_group.cpp       |  273 -
 .../lapack/source/potrf_batch_stride.cpp      |  169 -
 tests/unit_tests/lapack/source/potri.cpp      |  191 -
 tests/unit_tests/lapack/source/potrs.cpp      |  181 -
 .../lapack/source/potrs_batch_group.cpp       |  347 -
 .../lapack/source/potrs_batch_stride.cpp      |  205 -
 tests/unit_tests/lapack/source/syevd.cpp      |  162 -
 tests/unit_tests/lapack/source/sygvd.cpp      |  305 -
 tests/unit_tests/lapack/source/sytrd.cpp      |  225 -
 tests/unit_tests/lapack/source/sytrf.cpp      |  278 -
 tests/unit_tests/lapack/source/trtrs.cpp      |  180 -
 tests/unit_tests/lapack/source/ungbr.cpp      |  200 -
 tests/unit_tests/lapack/source/ungqr.cpp      |  173 -
 .../lapack/source/ungqr_batch_group.cpp       |  316 -
 .../lapack/source/ungqr_batch_stride.cpp      |  193 -
 tests/unit_tests/lapack/source/ungtr.cpp      |  176 -
 tests/unit_tests/lapack/source/unmqr.cpp      |  211 -
 tests/unit_tests/lapack/source/unmrq.cpp      |  220 -
 tests/unit_tests/lapack/source/unmtr.cpp      |  210 -
 tests/unit_tests/main_test.cpp                |  230 -
 tests/unit_tests/rng/CMakeLists.txt           |   22 -
 tests/unit_tests/rng/device/CMakeLists.txt    |   21 -
 .../unit_tests/rng/device/include/moments.hpp |  121 -
 .../device/include/rng_device_test_common.hpp |  342 -
 .../rng/device/include/skip_ahead_test.hpp    |  178 -
 .../rng/device/moments/CMakeLists.txt         |   40 -
 .../unit_tests/rng/device/moments/moments.cpp | 1050 ---
 .../rng/device/service/CMakeLists.txt         |   40 -
 .../rng/device/service/skip_ahead.cpp         |  113 -
 .../rng/include/engines_api_tests.hpp         |  142 -
 .../rng/include/rng_test_common.hpp           |  131 -
 .../rng/include/skip_ahead_test.hpp           |  142 -
 .../rng/include/statistics_check.hpp          |  179 -
 .../rng/include/statistics_check_test.hpp     |  126 -
 tests/unit_tests/rng/service/CMakeLists.txt   |   53 -
 .../rng/service/engines_api_test.cpp          |   70 -
 tests/unit_tests/rng/service/skip_ahead.cpp   |   68 -
 .../rng/statistics_check/CMakeLists.txt       |   53 -
 .../rng/statistics_check/bernoulli.cpp        |   59 -
 .../rng/statistics_check/bernoulli_usm.cpp    |   59 -
 .../rng/statistics_check/gaussian.cpp         |   94 -
 .../rng/statistics_check/gaussian_usm.cpp     |   94 -
 .../rng/statistics_check/lognormal.cpp        |   94 -
 .../rng/statistics_check/lognormal_usm.cpp    |   94 -
 .../rng/statistics_check/poisson.cpp          |   63 -
 .../rng/statistics_check/poisson_usm.cpp      |   63 -
 .../rng/statistics_check/uniform.cpp          |  107 -
 .../rng/statistics_check/uniform_usm.cpp      |  107 -
 tests/unit_tests/sparse_blas/CMakeLists.txt   |   20 -
 .../sparse_blas/include/sparse_reference.hpp  |  297 -
 .../sparse_blas/include/test_common.hpp       |  286 -
 .../sparse_blas/source/CMakeLists.txt         |   63 -
 .../sparse_blas/source/sparse_gemm_buffer.cpp |  302 -
 .../sparse_blas/source/sparse_gemm_usm.cpp    |  330 -
 .../sparse_blas/source/sparse_gemv_buffer.cpp |  230 -
 .../sparse_blas/source/sparse_gemv_usm.cpp    |  256 -
 .../sparse_blas/source/sparse_trsv_buffer.cpp |  240 -
 .../sparse_blas/source/sparse_trsv_usm.cpp    |  261 -
 third-party-programs/THIRD-PARTY-PROGRAMS     |  115 -
 864 files changed, 104 insertions(+), 294965 deletions(-)
 delete mode 100644 CMakeLists.txt
 delete mode 100644 CONTRIBUTING.md
 delete mode 100644 SECURITY.md
 delete mode 100644 _clang-format
 delete mode 100644 cmake/CMakeLists.txt
 delete mode 100644 cmake/FindCBLAS.cmake
 delete mode 100644 cmake/FindCompiler.cmake
 delete mode 100644 cmake/FindLAPACKE.cmake
 delete mode 100644 cmake/FindNETLIB.cmake
 delete mode 100644 cmake/FindSphinx.cmake
 delete mode 100644 cmake/FindcuBLAS.cmake
 delete mode 100644 cmake/FindcuRAND.cmake
 delete mode 100644 cmake/FindcuSOLVER.cmake
 delete mode 100644 cmake/WarningsUtils.cmake
 delete mode 100644 cmake/mkl/MKLConfig.cmake
 delete mode 100755 cmake/mkl/MKLConfigVersion.cmake
 delete mode 100644 cmake/oneMKLConfig.cmake
 delete mode 100644 deps/googletest/CMakeLists.txt
 delete mode 100644 deps/googletest/CONTRIBUTORS
 delete mode 100644 deps/googletest/LICENSE
 delete mode 100644 deps/googletest/cmake/Config.cmake.in
 delete mode 100644 deps/googletest/cmake/gtest.pc.in
 delete mode 100644 deps/googletest/cmake/gtest_main.pc.in
 delete mode 100644 deps/googletest/cmake/internal_utils.cmake
 delete mode 100644 deps/googletest/cmake/libgtest.la.in
 delete mode 100644 deps/googletest/include/gtest/gtest-death-test.h
 delete mode 100644 deps/googletest/include/gtest/gtest-matchers.h
 delete mode 100644 deps/googletest/include/gtest/gtest-message.h
 delete mode 100644 deps/googletest/include/gtest/gtest-param-test.h
 delete mode 100644 deps/googletest/include/gtest/gtest-printers.h
 delete mode 100644 deps/googletest/include/gtest/gtest-spi.h
 delete mode 100644 deps/googletest/include/gtest/gtest-test-part.h
 delete mode 100644 deps/googletest/include/gtest/gtest-typed-test.h
 delete mode 100644 deps/googletest/include/gtest/gtest.h
 delete mode 100644 deps/googletest/include/gtest/gtest_pred_impl.h
 delete mode 100644 deps/googletest/include/gtest/gtest_prod.h
 delete mode 100644 deps/googletest/include/gtest/internal/custom/README.md
 delete mode 100644 deps/googletest/include/gtest/internal/custom/gtest-port.h
 delete mode 100644 deps/googletest/include/gtest/internal/custom/gtest-printers.h
 delete mode 100644 deps/googletest/include/gtest/internal/custom/gtest.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-death-test-internal.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-filepath.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-internal.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-param-util.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-port-arch.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-port.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-string.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-type-util.h
 delete mode 100644 deps/googletest/include/gtest/internal/gtest-type-util.h.pump
 delete mode 100644 deps/googletest/src/gtest-all.cc
 delete mode 100644 deps/googletest/src/gtest-death-test.cc
 delete mode 100644 deps/googletest/src/gtest-filepath.cc
 delete mode 100644 deps/googletest/src/gtest-internal-inl.h
 delete mode 100644 deps/googletest/src/gtest-matchers.cc
 delete mode 100644 deps/googletest/src/gtest-port.cc
 delete mode 100644 deps/googletest/src/gtest-printers.cc
 delete mode 100644 deps/googletest/src/gtest-test-part.cc
 delete mode 100644 deps/googletest/src/gtest-typed-test.cc
 delete mode 100644 deps/googletest/src/gtest.cc
 delete mode 100644 deps/googletest/src/gtest_main.cc
 delete mode 100644 docs/CMakeLists.txt
 delete mode 100644 docs/README.md
 delete mode 100644 docs/_static/favicons.png
 delete mode 100644 docs/_static/oneAPI-rgb-rev-100.png
 delete mode 100644 docs/_static/style.css
 delete mode 100644 docs/_templates/layout.html
 delete mode 100644 docs/building_and_running_tests.rst
 delete mode 100644 docs/building_the_project_with_adaptivecpp.rst
 delete mode 100644 docs/building_the_project_with_dpcpp.rst
 delete mode 100644 docs/conf.py.in
 delete mode 100644 docs/create_new_backend.rst
 delete mode 100644 docs/domains/blas/asum.rst
 delete mode 100644 docs/domains/blas/axpby.rst
 delete mode 100644 docs/domains/blas/axpy.rst
 delete mode 100644 docs/domains/blas/axpy_batch.rst
 delete mode 100644 docs/domains/blas/blas-level-1-routines.rst
 delete mode 100644 docs/domains/blas/blas-level-2-routines.rst
 delete mode 100644 docs/domains/blas/blas-level-3-routines.rst
 delete mode 100644 docs/domains/blas/blas-like-extensions.rst
 delete mode 100644 docs/domains/blas/blas.rst
 delete mode 100644 docs/domains/blas/copy.rst
 delete mode 100644 docs/domains/blas/copy_batch.rst
 delete mode 100644 docs/domains/blas/dgmm_batch.rst
 delete mode 100644 docs/domains/blas/dot.rst
 delete mode 100644 docs/domains/blas/dotc.rst
 delete mode 100644 docs/domains/blas/dotu.rst
 delete mode 100644 docs/domains/blas/gbmv.rst
 delete mode 100644 docs/domains/blas/gemm.rst
 delete mode 100644 docs/domains/blas/gemm_batch.rst
 delete mode 100644 docs/domains/blas/gemm_bias.rst
 delete mode 100644 docs/domains/blas/gemmt.rst
 delete mode 100644 docs/domains/blas/gemv.rst
 delete mode 100644 docs/domains/blas/gemv_batch.rst
 delete mode 100644 docs/domains/blas/ger.rst
 delete mode 100644 docs/domains/blas/gerc.rst
 delete mode 100644 docs/domains/blas/geru.rst
 delete mode 100644 docs/domains/blas/hbmv.rst
 delete mode 100644 docs/domains/blas/hemm.rst
 delete mode 100644 docs/domains/blas/hemv.rst
 delete mode 100644 docs/domains/blas/her.rst
 delete mode 100644 docs/domains/blas/her2.rst
 delete mode 100644 docs/domains/blas/her2k.rst
 delete mode 100644 docs/domains/blas/herk.rst
 delete mode 100644 docs/domains/blas/hpmv.rst
 delete mode 100644 docs/domains/blas/hpr.rst
 delete mode 100644 docs/domains/blas/hpr2.rst
 delete mode 100644 docs/domains/blas/iamax.rst
 delete mode 100644 docs/domains/blas/iamin.rst
 delete mode 100644 docs/domains/blas/nrm2.rst
 delete mode 100644 docs/domains/blas/rot.rst
 delete mode 100644 docs/domains/blas/rotg.rst
 delete mode 100644 docs/domains/blas/rotm.rst
 delete mode 100644 docs/domains/blas/rotmg.rst
 delete mode 100644 docs/domains/blas/sbmv.rst
 delete mode 100644 docs/domains/blas/scal.rst
 delete mode 100644 docs/domains/blas/sdsdot.rst
 delete mode 100644 docs/domains/blas/spmv.rst
 delete mode 100644 docs/domains/blas/spr.rst
 delete mode 100644 docs/domains/blas/spr2.rst
 delete mode 100644 docs/domains/blas/swap.rst
 delete mode 100644 docs/domains/blas/symm.rst
 delete mode 100644 docs/domains/blas/symv.rst
 delete mode 100644 docs/domains/blas/syr.rst
 delete mode 100644 docs/domains/blas/syr2.rst
 delete mode 100644 docs/domains/blas/syr2k.rst
 delete mode 100644 docs/domains/blas/syrk.rst
 delete mode 100644 docs/domains/blas/syrk_batch.rst
 delete mode 100644 docs/domains/blas/tbmv.rst
 delete mode 100644 docs/domains/blas/tbsv.rst
 delete mode 100644 docs/domains/blas/tpmv.rst
 delete mode 100644 docs/domains/blas/tpsv.rst
 delete mode 100644 docs/domains/blas/trmm.rst
 delete mode 100644 docs/domains/blas/trmv.rst
 delete mode 100644 docs/domains/blas/trsm.rst
 delete mode 100644 docs/domains/blas/trsm_batch.rst
 delete mode 100644 docs/domains/blas/trsv.rst
 delete mode 100644 docs/domains/dense_linear_algebra.rst
 delete mode 100644 docs/domains/lapack/gebrd.rst
 delete mode 100644 docs/domains/lapack/gebrd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/geqrf.rst
 delete mode 100644 docs/domains/lapack/geqrf_batch.rst
 delete mode 100644 docs/domains/lapack/geqrf_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/geqrf_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/gerqf.rst
 delete mode 100644 docs/domains/lapack/gerqf_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/gesvd.rst
 delete mode 100644 docs/domains/lapack/gesvd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/getrf.rst
 delete mode 100644 docs/domains/lapack/getrf_batch.rst
 delete mode 100644 docs/domains/lapack/getrf_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/getrf_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/getri.rst
 delete mode 100644 docs/domains/lapack/getri_batch.rst
 delete mode 100644 docs/domains/lapack/getri_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/getri_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/getrs.rst
 delete mode 100644 docs/domains/lapack/getrs_batch.rst
 delete mode 100644 docs/domains/lapack/getrs_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/getrs_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/heevd.rst
 delete mode 100644 docs/domains/lapack/heevd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/hegvd.rst
 delete mode 100644 docs/domains/lapack/hegvd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/hetrd.rst
 delete mode 100644 docs/domains/lapack/hetrd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/hetrf.rst
 delete mode 100644 docs/domains/lapack/hetrf_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/lapack-like-extensions.inc.rst
 delete mode 100644 docs/domains/lapack/lapack-linear-equation-routines.inc.rst
 delete mode 100644 docs/domains/lapack/lapack-singular-value-eigenvalue-routines.inc.rst
 delete mode 100644 docs/domains/lapack/lapack.rst
 delete mode 100644 docs/domains/lapack/orgbr.rst
 delete mode 100644 docs/domains/lapack/orgbr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/orgqr.rst
 delete mode 100644 docs/domains/lapack/orgqr_batch.rst
 delete mode 100644 docs/domains/lapack/orgqr_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/orgqr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/orgtr.rst
 delete mode 100644 docs/domains/lapack/orgtr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/ormqr.rst
 delete mode 100644 docs/domains/lapack/ormqr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/ormrq.rst
 delete mode 100644 docs/domains/lapack/ormrq_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/ormtr.rst
 delete mode 100644 docs/domains/lapack/ormtr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/potrf.rst
 delete mode 100644 docs/domains/lapack/potrf_batch.rst
 delete mode 100644 docs/domains/lapack/potrf_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/potrf_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/potri.rst
 delete mode 100644 docs/domains/lapack/potri_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/potrs.rst
 delete mode 100644 docs/domains/lapack/potrs_batch.rst
 delete mode 100644 docs/domains/lapack/potrs_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/potrs_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/syevd.rst
 delete mode 100644 docs/domains/lapack/syevd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/sygvd.rst
 delete mode 100644 docs/domains/lapack/sygvd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/sytrd.rst
 delete mode 100644 docs/domains/lapack/sytrd_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/sytrf.rst
 delete mode 100644 docs/domains/lapack/sytrf_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/trtrs.rst
 delete mode 100644 docs/domains/lapack/trtrs_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/ungbr.rst
 delete mode 100644 docs/domains/lapack/ungbr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/ungqr.rst
 delete mode 100644 docs/domains/lapack/ungqr_batch.rst
 delete mode 100644 docs/domains/lapack/ungqr_batch_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/ungqr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/ungtr.rst
 delete mode 100644 docs/domains/lapack/ungtr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/unmqr.rst
 delete mode 100644 docs/domains/lapack/unmqr_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/unmrq.rst
 delete mode 100644 docs/domains/lapack/unmrq_scratchpad_size.rst
 delete mode 100644 docs/domains/lapack/unmtr.rst
 delete mode 100644 docs/domains/lapack/unmtr_scratchpad_size.rst
 delete mode 100644 docs/domains/matrix-storage.rst
 delete mode 100644 docs/index.rst
 delete mode 100644 docs/introduction.rst
 delete mode 100644 docs/onemkl-datatypes.rst
 delete mode 100644 docs/requirements.txt
 delete mode 100644 docs/selecting_a_compiler.rst
 delete mode 100644 docs/using_onemkl_with_cmake.rst
 delete mode 100644 examples/CMakeLists.txt
 delete mode 100644 examples/README.md
 delete mode 100644 examples/blas/CMakeLists.txt
 delete mode 100644 examples/blas/compile_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/blas/compile_time_dispatching/level3/CMakeLists.txt
 delete mode 100644 examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp
 delete mode 100644 examples/blas/run_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/blas/run_time_dispatching/level3/CMakeLists.txt
 delete mode 100644 examples/blas/run_time_dispatching/level3/gemm_usm.cpp
 delete mode 100644 examples/dft/CMakeLists.txt
 delete mode 100644 examples/dft/compile_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp
 delete mode 100644 examples/dft/run_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/dft/run_time_dispatching/real_fwd_usm.cpp
 delete mode 100644 examples/include/example_helper.hpp
 delete mode 100644 examples/lapack/CMakeLists.txt
 delete mode 100644 examples/lapack/compile_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/lapack/compile_time_dispatching/getrs_usm_mklcpu_cusolver.cpp
 delete mode 100644 examples/lapack/run_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/lapack/run_time_dispatching/getrs_usm.cpp
 delete mode 100644 examples/rng/CMakeLists.txt
 delete mode 100644 examples/rng/compile_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/rng/compile_time_dispatching/uniform_usm_mklcpu_curand.cpp
 delete mode 100644 examples/rng/device/CMakeLists.txt
 delete mode 100644 examples/rng/device/include/rng_example_helper.hpp
 delete mode 100644 examples/rng/device/uniform.cpp
 delete mode 100644 examples/rng/run_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/rng/run_time_dispatching/uniform_usm.cpp
 delete mode 100644 examples/sparse_blas/CMakeLists.txt
 delete mode 100644 examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp
 delete mode 100644 examples/sparse_blas/run_time_dispatching/CMakeLists.txt
 delete mode 100644 examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp
 delete mode 100644 include/oneapi/mkl.hpp
 delete mode 100644 include/oneapi/mkl/bfloat16.hpp
 delete mode 100644 include/oneapi/mkl/blas.hpp
 delete mode 100644 include/oneapi/mkl/blas.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/blas_ct_backends.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/blas_ct_backends.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/blas_loader.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/blas_loader.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/cublas/blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/netlib/blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/portblas/blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/rocblas/blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx
 delete mode 100644 include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp
 delete mode 100644 include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx
 delete mode 100644 include/oneapi/mkl/detail/backend_selector.hpp
 delete mode 100644 include/oneapi/mkl/detail/backend_selector_predicates.hpp
 delete mode 100644 include/oneapi/mkl/detail/backends.hpp
 delete mode 100644 include/oneapi/mkl/detail/backends_table.hpp
 delete mode 100644 include/oneapi/mkl/detail/exceptions.hpp
 delete mode 100644 include/oneapi/mkl/detail/export.hpp
 delete mode 100644 include/oneapi/mkl/detail/get_device_id.hpp
 delete mode 100644 include/oneapi/mkl/dft.hpp
 delete mode 100644 include/oneapi/mkl/dft/backward.hpp
 delete mode 100644 include/oneapi/mkl/dft/descriptor.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/commit_impl.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/descriptor_impl.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/dft_ct.hxx
 delete mode 100644 include/oneapi/mkl/dft/detail/dft_loader.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/external_workspace_helper.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/portfft/onemkl_dft_portfft.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp
 delete mode 100644 include/oneapi/mkl/dft/detail/types_impl.hpp
 delete mode 100644 include/oneapi/mkl/dft/forward.hpp
 delete mode 100644 include/oneapi/mkl/dft/types.hpp
 delete mode 100644 include/oneapi/mkl/exceptions.hpp
 delete mode 100644 include/oneapi/mkl/lapack.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx
 delete mode 100644 include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx
 delete mode 100644 include/oneapi/mkl/lapack/detail/lapack_loader.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/lapack_rt.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx
 delete mode 100644 include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx
 delete mode 100644 include/oneapi/mkl/lapack/detail/mklcpu/lapack_ct.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/mklcpu/onemkl_lapack_mklcpu.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/mklgpu/lapack_ct.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/mklgpu/onemkl_lapack_mklgpu.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx
 delete mode 100644 include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp
 delete mode 100644 include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx
 delete mode 100644 include/oneapi/mkl/lapack/exceptions.hpp
 delete mode 100644 include/oneapi/mkl/lapack/types.hpp
 delete mode 100644 include/oneapi/mkl/rng.hpp
 delete mode 100644 include/oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp
 delete mode 100644 include/oneapi/mkl/rng/detail/engine_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp
 delete mode 100644 include/oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp
 delete mode 100644 include/oneapi/mkl/rng/detail/rng_loader.hpp
 delete mode 100644 include/oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp
 delete mode 100644 include/oneapi/mkl/rng/device.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/bernoulli_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/bits_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/distribution_base.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/engine_base.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/exponential_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/gaussian_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/lognormal_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/mcg31m1_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/mcg59_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/mrg32k3a_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/mrg32k3a_skip_ahead_matrix.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/philox4x32x10_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/poisson_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/uniform_bits_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/uniform_impl.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/detail/vm_wrappers.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/distributions.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/engines.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/functions.hpp
 delete mode 100644 include/oneapi/mkl/rng/device/types.hpp
 delete mode 100644 include/oneapi/mkl/rng/distributions.hpp
 delete mode 100644 include/oneapi/mkl/rng/engines.hpp
 delete mode 100644 include/oneapi/mkl/rng/functions.hpp
 delete mode 100644 include/oneapi/mkl/rng/predicates.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/helper_types.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/mklgpu/sparse_blas_ct.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx
 delete mode 100644 include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp
 delete mode 100644 include/oneapi/mkl/sparse_blas/types.hpp
 delete mode 100644 include/oneapi/mkl/types.hpp
 delete mode 100644 legal_information.md
 create mode 100644 rfcs/template.md
 delete mode 100644 scripts/blas_list.txt
 delete mode 100755 scripts/func_parser.py
 delete mode 100755 scripts/generate_backend_api.py
 delete mode 100644 scripts/generate_cmake.py
 delete mode 100755 scripts/generate_ct_instant.py
 delete mode 100755 scripts/generate_ct_templates.py
 delete mode 100755 scripts/generate_wrappers.py
 delete mode 100644 src/CMakeLists.txt
 delete mode 100644 src/blas/CMakeLists.txt
 delete mode 100644 src/blas/backends/CMakeLists.txt
 delete mode 100644 src/blas/backends/backend_wrappers.cxx
 delete mode 100644 src/blas/backends/cublas/CMakeLists.txt
 delete mode 100644 src/blas/backends/cublas/cublas_batch.cpp
 delete mode 100644 src/blas/backends/cublas/cublas_extensions.cpp
 delete mode 100644 src/blas/backends/cublas/cublas_handle.hpp
 delete mode 100644 src/blas/backends/cublas/cublas_helper.hpp
 delete mode 100644 src/blas/backends/cublas/cublas_level1.cpp
 delete mode 100644 src/blas/backends/cublas/cublas_level2.cpp
 delete mode 100644 src/blas/backends/cublas/cublas_level3.cpp
 delete mode 100644 src/blas/backends/cublas/cublas_scope_handle.cpp
 delete mode 100644 src/blas/backends/cublas/cublas_scope_handle.hpp
 delete mode 100644 src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp
 delete mode 100644 src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp
 delete mode 100644 src/blas/backends/cublas/cublas_task.hpp
 delete mode 100644 src/blas/backends/cublas/cublas_wrappers.cpp
 delete mode 100644 src/blas/backends/mkl_common/mkl_batch.cxx
 delete mode 100644 src/blas/backends/mkl_common/mkl_blas_backend.hpp
 delete mode 100644 src/blas/backends/mkl_common/mkl_blas_backend.hxx
 delete mode 100644 src/blas/backends/mkl_common/mkl_extensions.cxx
 delete mode 100644 src/blas/backends/mkl_common/mkl_level1.cxx
 delete mode 100644 src/blas/backends/mkl_common/mkl_level2.cxx
 delete mode 100644 src/blas/backends/mkl_common/mkl_level3.cxx
 delete mode 100644 src/blas/backends/mklcpu/CMakeLists.txt
 delete mode 100644 src/blas/backends/mklcpu/mklcpu_batch.cpp
 delete mode 100644 src/blas/backends/mklcpu/mklcpu_extensions.cpp
 delete mode 100644 src/blas/backends/mklcpu/mklcpu_level1.cpp
 delete mode 100644 src/blas/backends/mklcpu/mklcpu_level2.cpp
 delete mode 100644 src/blas/backends/mklcpu/mklcpu_level3.cpp
 delete mode 100644 src/blas/backends/mklcpu/mklcpu_wrappers.cpp
 delete mode 100644 src/blas/backends/mklgpu/CMakeLists.txt
 delete mode 100644 src/blas/backends/mklgpu/mklgpu_batch.cpp
 delete mode 100644 src/blas/backends/mklgpu/mklgpu_extensions.cpp
 delete mode 100644 src/blas/backends/mklgpu/mklgpu_level1.cpp
 delete mode 100644 src/blas/backends/mklgpu/mklgpu_level2.cpp
 delete mode 100644 src/blas/backends/mklgpu/mklgpu_level3.cpp
 delete mode 100644 src/blas/backends/mklgpu/mklgpu_wrappers.cpp
 delete mode 100644 src/blas/backends/netlib/CMakeLists.txt
 delete mode 100644 src/blas/backends/netlib/netlib_batch.cpp
 delete mode 100644 src/blas/backends/netlib/netlib_batch.cxx
 delete mode 100644 src/blas/backends/netlib/netlib_common.hpp
 delete mode 100644 src/blas/backends/netlib/netlib_extensions.cpp
 delete mode 100644 src/blas/backends/netlib/netlib_extensions.cxx
 delete mode 100644 src/blas/backends/netlib/netlib_level1.cpp
 delete mode 100644 src/blas/backends/netlib/netlib_level1.cxx
 delete mode 100644 src/blas/backends/netlib/netlib_level2.cpp
 delete mode 100644 src/blas/backends/netlib/netlib_level2.cxx
 delete mode 100644 src/blas/backends/netlib/netlib_level3.cpp
 delete mode 100644 src/blas/backends/netlib/netlib_level3.cxx
 delete mode 100644 src/blas/backends/netlib/netlib_wrappers.cpp
 delete mode 100644 src/blas/backends/portblas/CMakeLists.txt
 delete mode 100644 src/blas/backends/portblas/portblas_batch.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_batch.cxx
 delete mode 100644 src/blas/backends/portblas/portblas_common.hpp
 delete mode 100644 src/blas/backends/portblas/portblas_gemm_bias.cxx
 delete mode 100644 src/blas/backends/portblas/portblas_level1.cxx
 delete mode 100644 src/blas/backends/portblas/portblas_level1_double.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_level1_float.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_level2.cxx
 delete mode 100644 src/blas/backends/portblas/portblas_level2_double.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_level2_float.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_level3.cxx
 delete mode 100644 src/blas/backends/portblas/portblas_level3_bfloat16.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_level3_double.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_level3_float.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_level3_half.cpp
 delete mode 100644 src/blas/backends/portblas/portblas_wrappers.cpp
 delete mode 100644 src/blas/backends/rocblas/CMakeLists.txt
 delete mode 100644 src/blas/backends/rocblas/rocblas_batch.cpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_extensions.cpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_handle.hpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_helper.hpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_level1.cpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_level2.cpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_level3.cpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_scope_handle.cpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_scope_handle.hpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_task.hpp
 delete mode 100644 src/blas/backends/rocblas/rocblas_wrappers.cpp
 delete mode 100644 src/blas/blas_loader.cpp
 delete mode 100644 src/blas/function_table.hpp
 delete mode 100644 src/config.hpp.in
 delete mode 100644 src/dft/CMakeLists.txt
 delete mode 100644 src/dft/backends/CMakeLists.txt
 delete mode 100644 src/dft/backends/backend_backward_instantiations.cxx
 delete mode 100644 src/dft/backends/backend_compute_signature.cxx
 delete mode 100644 src/dft/backends/backend_forward_instantiations.cxx
 delete mode 100644 src/dft/backends/backend_wrappers.cxx
 delete mode 100644 src/dft/backends/cufft/CMakeLists.txt
 delete mode 100644 src/dft/backends/cufft/backward.cpp
 delete mode 100644 src/dft/backends/cufft/commit.cpp
 delete mode 100644 src/dft/backends/cufft/descriptor.cpp
 delete mode 100644 src/dft/backends/cufft/execute_helper.hpp
 delete mode 100644 src/dft/backends/cufft/forward.cpp
 delete mode 100644 src/dft/backends/cufft/mkl_dft_cufft_wrappers.cpp
 delete mode 100644 src/dft/backends/descriptor.cpp
 delete mode 100644 src/dft/backends/mklcpu/CMakeLists.txt
 delete mode 100644 src/dft/backends/mklcpu/backward.cpp
 delete mode 100644 src/dft/backends/mklcpu/commit.cpp
 delete mode 100644 src/dft/backends/mklcpu/commit_derived_impl.hpp
 delete mode 100644 src/dft/backends/mklcpu/descriptor.cpp
 delete mode 100644 src/dft/backends/mklcpu/forward.cpp
 delete mode 100644 src/dft/backends/mklcpu/mkl_dft_cpu_wrappers.cpp
 delete mode 100644 src/dft/backends/mklcpu/mklcpu_helpers.hpp
 delete mode 100644 src/dft/backends/mklgpu/CMakeLists.txt
 delete mode 100644 src/dft/backends/mklgpu/backward.cpp
 delete mode 100644 src/dft/backends/mklgpu/commit.cpp
 delete mode 100644 src/dft/backends/mklgpu/descriptor.cpp
 delete mode 100644 src/dft/backends/mklgpu/forward.cpp
 delete mode 100644 src/dft/backends/mklgpu/mkl_dft_gpu_wrappers.cpp
 delete mode 100644 src/dft/backends/mklgpu/mklgpu_helpers.hpp
 delete mode 100644 src/dft/backends/portfft/CMakeLists.txt
 delete mode 100644 src/dft/backends/portfft/commit.cpp
 delete mode 100644 src/dft/backends/portfft/descriptor.cpp
 delete mode 100644 src/dft/backends/portfft/mkl_dft_portfft_wrappers.cpp
 delete mode 100644 src/dft/backends/portfft/portfft_helper.hpp
 delete mode 100644 src/dft/backends/rocfft/CMakeLists.txt
 delete mode 100644 src/dft/backends/rocfft/backward.cpp
 delete mode 100644 src/dft/backends/rocfft/commit.cpp
 delete mode 100644 src/dft/backends/rocfft/descriptor.cpp
 delete mode 100644 src/dft/backends/rocfft/execute_helper.hpp
 delete mode 100644 src/dft/backends/rocfft/forward.cpp
 delete mode 100644 src/dft/backends/rocfft/mkl_dft_rocfft_wrappers.cpp
 delete mode 100644 src/dft/backends/rocfft/rocfft_handle.hpp
 delete mode 100644 src/dft/backends/stride_helper.hpp
 delete mode 100644 src/dft/descriptor.cxx
 delete mode 100644 src/dft/descriptor_config_helper.hpp
 delete mode 100644 src/dft/dft_loader.cpp
 delete mode 100644 src/dft/function_table.hpp
 delete mode 100644 src/include/allocator_helper.hpp
 delete mode 100644 src/include/dtype_string.hpp
 delete mode 100644 src/include/exceptions_helper.hpp
 delete mode 100644 src/include/function_table_initializer.hpp
 delete mode 100644 src/include/runtime_support_helper.hpp
 delete mode 100644 src/lapack/CMakeLists.txt
 delete mode 100644 src/lapack/backends/CMakeLists.txt
 delete mode 100644 src/lapack/backends/cusolver/CMakeLists.txt
 delete mode 100644 src/lapack/backends/cusolver/cusolver_batch.cpp
 delete mode 100644 src/lapack/backends/cusolver/cusolver_handle.hpp
 delete mode 100644 src/lapack/backends/cusolver/cusolver_helper.hpp
 delete mode 100644 src/lapack/backends/cusolver/cusolver_lapack.cpp
 delete mode 100644 src/lapack/backends/cusolver/cusolver_scope_handle.cpp
 delete mode 100644 src/lapack/backends/cusolver/cusolver_scope_handle.hpp
 delete mode 100644 src/lapack/backends/cusolver/cusolver_task.hpp
 delete mode 100644 src/lapack/backends/cusolver/cusolver_wrappers.cpp
 delete mode 100644 src/lapack/backends/mkl_common/lapack_wrappers.cxx
 delete mode 100644 src/lapack/backends/mkl_common/mkl_lapack.cxx
 delete mode 100644 src/lapack/backends/mkl_common/mkl_lapack_backend.hpp
 delete mode 100644 src/lapack/backends/mklcpu/CMakeLists.txt
 delete mode 100644 src/lapack/backends/mklcpu/lapack_cpu_wrappers.cpp
 delete mode 100644 src/lapack/backends/mklcpu/mkl_lapack.cpp
 delete mode 100644 src/lapack/backends/mklgpu/CMakeLists.txt
 delete mode 100644 src/lapack/backends/mklgpu/lapack_gpu_wrappers.cpp
 delete mode 100644 src/lapack/backends/mklgpu/mkl_lapack.cpp
 delete mode 100644 src/lapack/backends/rocsolver/CMakeLists.txt
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_batch.cpp
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_handle.hpp
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_helper.hpp
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_lapack.cpp
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_task.hpp
 delete mode 100644 src/lapack/backends/rocsolver/rocsolver_wrappers.cpp
 delete mode 100644 src/lapack/function_table.hpp
 delete mode 100644 src/lapack/lapack_loader.cpp
 delete mode 100644 src/rng/CMakeLists.txt
 delete mode 100644 src/rng/backends/CMakeLists.txt
 delete mode 100644 src/rng/backends/curand/CMakeLists.txt
 delete mode 100644 src/rng/backends/curand/curand_helper.hpp
 delete mode 100644 src/rng/backends/curand/curand_task.hpp
 delete mode 100644 src/rng/backends/curand/mkl_rng_curand_wrappers.cpp
 delete mode 100644 src/rng/backends/curand/mrg32k3a.cpp
 delete mode 100644 src/rng/backends/curand/philox4x32x10.cpp
 delete mode 100644 src/rng/backends/mklcpu/CMakeLists.txt
 delete mode 100644 src/rng/backends/mklcpu/cpu_common.hpp
 delete mode 100644 src/rng/backends/mklcpu/mkl_rng_cpu_wrappers.cpp
 delete mode 100644 src/rng/backends/mklcpu/mrg32k3a.cpp
 delete mode 100644 src/rng/backends/mklcpu/philox4x32x10.cpp
 delete mode 100644 src/rng/backends/mklgpu/CMakeLists.txt
 delete mode 100755 src/rng/backends/mklgpu/mkl_internal_rng_gpu.hpp
 delete mode 100644 src/rng/backends/mklgpu/mkl_rng_gpu_wrappers.cpp
 delete mode 100644 src/rng/backends/mklgpu/mrg32k3a.cpp
 delete mode 100644 src/rng/backends/mklgpu/philox4x32x10.cpp
 delete mode 100644 src/rng/backends/rocrand/CMakeLists.txt
 delete mode 100644 src/rng/backends/rocrand/mkl_rng_rocrand_wrappers.cpp
 delete mode 100644 src/rng/backends/rocrand/mrg32k3a.cpp
 delete mode 100644 src/rng/backends/rocrand/philox4x32x10.cpp
 delete mode 100644 src/rng/backends/rocrand/rocrand_helper.hpp
 delete mode 100644 src/rng/backends/rocrand/rocrand_task.hpp
 delete mode 100644 src/rng/function_table.hpp
 delete mode 100644 src/rng/rng_loader.cpp
 delete mode 100644 src/sparse_blas/CMakeLists.txt
 delete mode 100644 src/sparse_blas/backends/CMakeLists.txt
 delete mode 100644 src/sparse_blas/backends/backend_wrappers.cxx
 delete mode 100644 src/sparse_blas/backends/mkl_common/mkl_basic.cxx
 delete mode 100644 src/sparse_blas/backends/mkl_common/mkl_helper.hpp
 delete mode 100644 src/sparse_blas/backends/mkl_common/mkl_operations.cxx
 delete mode 100644 src/sparse_blas/backends/mklcpu/CMakeLists.txt
 delete mode 100644 src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp
 delete mode 100644 src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
 delete mode 100644 src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp
 delete mode 100644 src/sparse_blas/backends/mklgpu/CMakeLists.txt
 delete mode 100644 src/sparse_blas/backends/mklgpu/mklgpu_basic.cpp
 delete mode 100644 src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
 delete mode 100644 src/sparse_blas/backends/mklgpu/mklgpu_wrappers.cpp
 delete mode 100644 src/sparse_blas/function_table.hpp
 delete mode 100644 src/sparse_blas/macros.hpp
 delete mode 100644 src/sparse_blas/sparse_blas_loader.cpp
 delete mode 100644 tests/CMakeLists.txt
 delete mode 100644 tests/README.md
 delete mode 100644 tests/unit_tests/CMakeLists.txt
 delete mode 100644 tests/unit_tests/blas/CMakeLists.txt
 delete mode 100644 tests/unit_tests/blas/batch/CMakeLists.txt
 delete mode 100644 tests/unit_tests/blas/batch/axpy_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/axpy_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/copy_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/copy_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/dgmm_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/dgmm_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/gemm_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/gemm_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/gemv_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/gemv_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/omatadd_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/syrk_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/syrk_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/trsm_batch_stride.cpp
 delete mode 100644 tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp
 delete mode 100644 tests/unit_tests/blas/batch/trsm_batch_usm.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/CMakeLists.txt
 delete mode 100644 tests/unit_tests/blas/extensions/gemm_bias.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/gemm_bias_usm.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/gemmt.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/gemmt_usm.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/imatcopy.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/imatcopy_usm.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/omatadd.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/omatadd_usm.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/omatcopy.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/omatcopy2.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/omatcopy2_usm.cpp
 delete mode 100644 tests/unit_tests/blas/extensions/omatcopy_usm.cpp
 delete mode 100644 tests/unit_tests/blas/include/allocator_helper.hpp
 delete mode 100644 tests/unit_tests/blas/include/onemkl_blas_helper.hpp
 delete mode 100644 tests/unit_tests/blas/include/reference_blas_templates.hpp
 delete mode 100644 tests/unit_tests/blas/include/reference_blas_wrappers.hpp
 delete mode 100644 tests/unit_tests/blas/include/test_common.hpp
 delete mode 100644 tests/unit_tests/blas/level1/CMakeLists.txt
 delete mode 100644 tests/unit_tests/blas/level1/asum.cpp
 delete mode 100644 tests/unit_tests/blas/level1/asum_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/axpby.cpp
 delete mode 100644 tests/unit_tests/blas/level1/axpby_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/axpy.cpp
 delete mode 100644 tests/unit_tests/blas/level1/axpy_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/copy.cpp
 delete mode 100644 tests/unit_tests/blas/level1/copy_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/dot.cpp
 delete mode 100644 tests/unit_tests/blas/level1/dot_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/dotc.cpp
 delete mode 100644 tests/unit_tests/blas/level1/dotc_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/dotu.cpp
 delete mode 100644 tests/unit_tests/blas/level1/dotu_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/iamax.cpp
 delete mode 100644 tests/unit_tests/blas/level1/iamax_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/iamin.cpp
 delete mode 100644 tests/unit_tests/blas/level1/iamin_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/nrm2.cpp
 delete mode 100644 tests/unit_tests/blas/level1/nrm2_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rot.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rot_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rotg.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rotg_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rotm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rotm_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rotmg.cpp
 delete mode 100644 tests/unit_tests/blas/level1/rotmg_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/scal.cpp
 delete mode 100644 tests/unit_tests/blas/level1/scal_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/sdsdot.cpp
 delete mode 100644 tests/unit_tests/blas/level1/sdsdot_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level1/swap.cpp
 delete mode 100644 tests/unit_tests/blas/level1/swap_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/CMakeLists.txt
 delete mode 100644 tests/unit_tests/blas/level2/gbmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/gbmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/gemv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/gemv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/ger.cpp
 delete mode 100644 tests/unit_tests/blas/level2/ger_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/gerc.cpp
 delete mode 100644 tests/unit_tests/blas/level2/gerc_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/geru.cpp
 delete mode 100644 tests/unit_tests/blas/level2/geru_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hbmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hbmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hemv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hemv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/her.cpp
 delete mode 100644 tests/unit_tests/blas/level2/her2.cpp
 delete mode 100644 tests/unit_tests/blas/level2/her2_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/her_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hpmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hpmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hpr.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hpr2.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hpr2_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/hpr_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/sbmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/sbmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/spmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/spmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/spr.cpp
 delete mode 100644 tests/unit_tests/blas/level2/spr2.cpp
 delete mode 100644 tests/unit_tests/blas/level2/spr2_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/spr_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/symv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/symv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/syr.cpp
 delete mode 100644 tests/unit_tests/blas/level2/syr2.cpp
 delete mode 100644 tests/unit_tests/blas/level2/syr2_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/syr_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tbmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tbmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tbsv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tbsv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tpmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tpmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tpsv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/tpsv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/trmv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/trmv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level2/trsv.cpp
 delete mode 100644 tests/unit_tests/blas/level2/trsv_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/CMakeLists.txt
 delete mode 100644 tests/unit_tests/blas/level3/gemm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/gemm_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/hemm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/hemm_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/her2k.cpp
 delete mode 100644 tests/unit_tests/blas/level3/her2k_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/herk.cpp
 delete mode 100644 tests/unit_tests/blas/level3/herk_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/symm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/symm_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/syr2k.cpp
 delete mode 100644 tests/unit_tests/blas/level3/syr2k_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/syrk.cpp
 delete mode 100644 tests/unit_tests/blas/level3/syrk_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/trmm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/trmm_usm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/trsm.cpp
 delete mode 100644 tests/unit_tests/blas/level3/trsm_usm.cpp
 delete mode 100644 tests/unit_tests/dft/CMakeLists.txt
 delete mode 100644 tests/unit_tests/dft/include/compute_inplace.hpp
 delete mode 100644 tests/unit_tests/dft/include/compute_inplace_real_real.hpp
 delete mode 100644 tests/unit_tests/dft/include/compute_out_of_place.hpp
 delete mode 100644 tests/unit_tests/dft/include/compute_out_of_place_real_real.hpp
 delete mode 100644 tests/unit_tests/dft/include/compute_tester.hpp
 delete mode 100644 tests/unit_tests/dft/include/parseval_check.hpp
 delete mode 100644 tests/unit_tests/dft/include/reference_dft.hpp
 delete mode 100644 tests/unit_tests/dft/include/test_common.hpp
 delete mode 100644 tests/unit_tests/dft/source/CMakeLists.txt
 delete mode 100644 tests/unit_tests/dft/source/compute_tests.cpp
 delete mode 100644 tests/unit_tests/dft/source/descriptor_tests.cpp
 delete mode 100644 tests/unit_tests/dft/source/workspace_external_tests.cpp
 delete mode 100644 tests/unit_tests/include/test_helper.hpp
 delete mode 100644 tests/unit_tests/lapack/CMakeLists.txt
 delete mode 100644 tests/unit_tests/lapack/common/CMakeLists.txt
 delete mode 100644 tests/unit_tests/lapack/common/dependency_check.cpp
 delete mode 100644 tests/unit_tests/lapack/common/test_log.cpp
 delete mode 100644 tests/unit_tests/lapack/include/lapack_accuracy_checks.hpp
 delete mode 100644 tests/unit_tests/lapack/include/lapack_common.hpp
 delete mode 100644 tests/unit_tests/lapack/include/lapack_gtest_suite.hpp
 delete mode 100644 tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp
 delete mode 100644 tests/unit_tests/lapack/include/lapack_test_controller.hpp
 delete mode 100644 tests/unit_tests/lapack/source/CMakeLists.txt
 delete mode 100644 tests/unit_tests/lapack/source/gebrd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/geqrf.cpp
 delete mode 100644 tests/unit_tests/lapack/source/geqrf_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/geqrf_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/gerqf.cpp
 delete mode 100644 tests/unit_tests/lapack/source/gesvd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getrf.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getrf_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getrf_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getri.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getri_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getri_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getrs.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getrs_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/getrs_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/heevd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/hegvd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/hetrd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/hetrf.cpp
 delete mode 100644 tests/unit_tests/lapack/source/orgbr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/orgqr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/orgqr_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/orgqr_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/orgtr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ormqr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ormrq.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ormtr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/potrf.cpp
 delete mode 100644 tests/unit_tests/lapack/source/potrf_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/potrf_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/potri.cpp
 delete mode 100644 tests/unit_tests/lapack/source/potrs.cpp
 delete mode 100644 tests/unit_tests/lapack/source/potrs_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/potrs_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/syevd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/sygvd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/sytrd.cpp
 delete mode 100644 tests/unit_tests/lapack/source/sytrf.cpp
 delete mode 100644 tests/unit_tests/lapack/source/trtrs.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ungbr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ungqr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ungqr_batch_group.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ungqr_batch_stride.cpp
 delete mode 100644 tests/unit_tests/lapack/source/ungtr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/unmqr.cpp
 delete mode 100644 tests/unit_tests/lapack/source/unmrq.cpp
 delete mode 100644 tests/unit_tests/lapack/source/unmtr.cpp
 delete mode 100644 tests/unit_tests/main_test.cpp
 delete mode 100644 tests/unit_tests/rng/CMakeLists.txt
 delete mode 100644 tests/unit_tests/rng/device/CMakeLists.txt
 delete mode 100644 tests/unit_tests/rng/device/include/moments.hpp
 delete mode 100644 tests/unit_tests/rng/device/include/rng_device_test_common.hpp
 delete mode 100644 tests/unit_tests/rng/device/include/skip_ahead_test.hpp
 delete mode 100644 tests/unit_tests/rng/device/moments/CMakeLists.txt
 delete mode 100644 tests/unit_tests/rng/device/moments/moments.cpp
 delete mode 100644 tests/unit_tests/rng/device/service/CMakeLists.txt
 delete mode 100644 tests/unit_tests/rng/device/service/skip_ahead.cpp
 delete mode 100644 tests/unit_tests/rng/include/engines_api_tests.hpp
 delete mode 100644 tests/unit_tests/rng/include/rng_test_common.hpp
 delete mode 100644 tests/unit_tests/rng/include/skip_ahead_test.hpp
 delete mode 100644 tests/unit_tests/rng/include/statistics_check.hpp
 delete mode 100644 tests/unit_tests/rng/include/statistics_check_test.hpp
 delete mode 100644 tests/unit_tests/rng/service/CMakeLists.txt
 delete mode 100644 tests/unit_tests/rng/service/engines_api_test.cpp
 delete mode 100644 tests/unit_tests/rng/service/skip_ahead.cpp
 delete mode 100644 tests/unit_tests/rng/statistics_check/CMakeLists.txt
 delete mode 100755 tests/unit_tests/rng/statistics_check/bernoulli.cpp
 delete mode 100755 tests/unit_tests/rng/statistics_check/bernoulli_usm.cpp
 delete mode 100644 tests/unit_tests/rng/statistics_check/gaussian.cpp
 delete mode 100644 tests/unit_tests/rng/statistics_check/gaussian_usm.cpp
 delete mode 100755 tests/unit_tests/rng/statistics_check/lognormal.cpp
 delete mode 100755 tests/unit_tests/rng/statistics_check/lognormal_usm.cpp
 delete mode 100755 tests/unit_tests/rng/statistics_check/poisson.cpp
 delete mode 100755 tests/unit_tests/rng/statistics_check/poisson_usm.cpp
 delete mode 100644 tests/unit_tests/rng/statistics_check/uniform.cpp
 delete mode 100644 tests/unit_tests/rng/statistics_check/uniform_usm.cpp
 delete mode 100644 tests/unit_tests/sparse_blas/CMakeLists.txt
 delete mode 100644 tests/unit_tests/sparse_blas/include/sparse_reference.hpp
 delete mode 100644 tests/unit_tests/sparse_blas/include/test_common.hpp
 delete mode 100644 tests/unit_tests/sparse_blas/source/CMakeLists.txt
 delete mode 100644 tests/unit_tests/sparse_blas/source/sparse_gemm_buffer.cpp
 delete mode 100644 tests/unit_tests/sparse_blas/source/sparse_gemm_usm.cpp
 delete mode 100644 tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp
 delete mode 100644 tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp
 delete mode 100644 tests/unit_tests/sparse_blas/source/sparse_trsv_buffer.cpp
 delete mode 100644 tests/unit_tests/sparse_blas/source/sparse_trsv_usm.cpp
 delete mode 100644 third-party-programs/THIRD-PARTY-PROGRAMS

diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 79af06f6a..000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,355 +0,0 @@
-#===============================================================================
-# Copyright 2020-2022 Intel Corporation
-# Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-cmake_minimum_required (VERSION 3.13)
-
-# Define build type
-set(DEFAULT_BUILD_TYPE "Release")
-
-if("${CMAKE_BUILD_TYPE}" STREQUAL "")
-    message(STATUS "CMAKE_BUILD_TYPE: None, set to ${DEFAULT_BUILD_TYPE} by default")
-    set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE} CACHE STRING
-            "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel RelWithAssert" FORCE)
-else()
-    message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
-endif()
-
-# Build options
-option(BUILD_SHARED_LIBS "Build dynamic libraries" ON)
-
-## Backends
-option(ENABLE_MKLCPU_BACKEND "Enable the Intel oneMKL CPU backend for supported interfaces" ON)
-option(ENABLE_MKLGPU_BACKEND "Enable the Intel oneMKL GPU backend for supported interfaces" ON)
-if(ENABLE_MKLCPU_BACKEND)
-  option(ENABLE_MKLCPU_THREAD_TBB "Enable the use of Intel TBB with the oneMKL CPU backend" ON)
-endif()
-
-# blas
-option(ENABLE_CUBLAS_BACKEND "Enable the cuBLAS backend for the BLAS interface" OFF)
-option(ENABLE_ROCBLAS_BACKEND "Enable the rocBLAS backend for the BLAS interface" OFF)
-option(ENABLE_NETLIB_BACKEND "Enable the Netlib backend for the BLAS interface" OFF)
-option(ENABLE_PORTBLAS_BACKEND "Enable the portBLAS backend for the BLAS interface. Cannot be used with other BLAS backends." OFF)
-
-# rand
-option(ENABLE_CURAND_BACKEND "Enable the cuRAND backend for the RNG interface" OFF)
-option(ENABLE_ROCRAND_BACKEND "Enable the rocRAND backend for the RNG interface" OFF)
-
-# lapack
-option(ENABLE_CUSOLVER_BACKEND "Enable the cuSOLVER backend for the LAPACK interface" OFF)
-option(ENABLE_ROCSOLVER_BACKEND "Enable the rocSOLVER backend for the LAPACK interface" OFF)
-
-# dft
-option(ENABLE_CUFFT_BACKEND "Enable the cuFFT backend for the DFT interface" OFF)
-option(ENABLE_ROCFFT_BACKEND "Enable the rocFFT backend for the DFT interface" OFF)
-option(ENABLE_PORTFFT_BACKEND "Enable the portFFT DFT backend for the DFT interface. Cannot be used with other DFT backends." OFF)
-
-set(ONEMKL_SYCL_IMPLEMENTATION "dpc++" CACHE STRING "Name of the SYCL compiler")
-set(HIP_TARGETS "" CACHE STRING "Target HIP architectures")
-
-## Testing
-option(BUILD_FUNCTIONAL_TESTS "" ON)
-
-## Examples
-option(BUILD_EXAMPLES "" ON)
-
-## Documentation
-option(BUILD_DOC "" OFF)
-
-## Supported domains
-set(DOMAINS_LIST "")
-if(ENABLE_MKLCPU_BACKEND
-        OR ENABLE_MKLGPU_BACKEND
-        OR ENABLE_CUBLAS_BACKEND
-        OR ENABLE_ROCBLAS_BACKEND
-        OR ENABLE_NETLIB_BACKEND
-        OR ENABLE_PORTBLAS_BACKEND)
-  list(APPEND DOMAINS_LIST "blas")
-endif()
-if(ENABLE_MKLCPU_BACKEND
-        OR ENABLE_MKLGPU_BACKEND
-        OR ENABLE_CUSOLVER_BACKEND
-        OR ENABLE_ROCSOLVER_BACKEND)
-  list(APPEND DOMAINS_LIST "lapack")
-endif()
-if(ENABLE_MKLCPU_BACKEND
-        OR ENABLE_MKLGPU_BACKEND
-        OR ENABLE_CURAND_BACKEND
-        OR ENABLE_ROCRAND_BACKEND)
-  list(APPEND DOMAINS_LIST "rng")
-endif()
-if(ENABLE_MKLGPU_BACKEND
-        OR ENABLE_MKLCPU_BACKEND
-        OR ENABLE_CUFFT_BACKEND
-        OR ENABLE_ROCFFT_BACKEND
-        OR ENABLE_PORTFFT_BACKEND)
-  list(APPEND DOMAINS_LIST "dft")
-endif()
-if(ENABLE_MKLCPU_BACKEND
-        OR ENABLE_MKLGPU_BACKEND)
-  list(APPEND DOMAINS_LIST "sparse_blas")
-endif()
-
-if(ENABLE_PORTBLAS_BACKEND
-	AND (ENABLE_MKLCPU_BACKEND
-		OR ENABLE_MKLGPU_BACKEND
-		OR ENABLE_CUBLAS_BACKEND
-		OR ENABLE_ROCBLAS_BACKEND
-		OR ENABLE_NETLIB_BACKEND))
-	message(FATAL_ERROR "ENABLE_PORTBLAS_BACKEND cannot be enabled at the same time as other BLAS backends.")
-endif()
-
-if (ENABLE_PORTFFT_BACKEND
-	AND (ENABLE_MKLCPU_BACKEND
-		OR ENABLE_MKLGPU_BACKEND
-		OR ENABLE_ROCFFT_BACKEND
-		OR ENABLE_CUFFT_BACKEND))
-	message(FATAL_ERROR "ENABLE_PORTFFT_BACKEND cannot be enabled at the same time as other DFT backends.")
-endif()
-
-# Define required CXX compilers before project
-if(CMAKE_CXX_COMPILER OR NOT ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++")
-  if(WIN32)
-    string(REPLACE "\\" "/" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER})
-  endif()
-else()
-  if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND
-    OR ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND OR ENABLE_ROCFFT_BACKEND)
-    set(CMAKE_CXX_COMPILER "clang++")
-  elseif(ENABLE_MKLGPU_BACKEND)
-    if(UNIX)
-      set(CMAKE_CXX_COMPILER "icpx")
-    else()
-      set(CMAKE_CXX_COMPILER "icx")
-    endif()
-  else()
-    if(UNIX)
-      find_program(ICPX_ICX_PATH icpx)
-    else()
-      find_program(ICPX_ICX_PATH icx)
-    endif()
-    if(ICPX_ICX_PATH)
-      if(UNIX)
-        message(STATUS "CXX compiler: icpx was found in PATH, using icpx")
-        set(CMAKE_CXX_COMPILER "icpx")
-      else()
-        message(STATUS "CXX compiler: icx was found in PATH, using icx")
-        set(CMAKE_CXX_COMPILER "icx")
-      endif()
-    else()
-      if(WIN32)
-        message(STATUS "CXX compiler: icx was not found in PATH, using clang-cl instead")
-        set(CMAKE_CXX_COMPILER "clang-cl")
-      else()
-        message(STATUS "CXX compiler: icpx was not found in PATH, using clang++ instead")
-        set(CMAKE_CXX_COMPILER "clang++")
-      endif()
-    endif()
-  endif()
-endif()
-
-# Define required C compilers before project
-if(CMAKE_C_COMPILER OR NOT ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++")
-  if(WIN32)
-    string(REPLACE "\\" "/" CMAKE_C_COMPILER ${CMAKE_C_COMPILER})
-  endif()
-else()
-  find_program(ICX_PATH icx)
-  if(ICX_PATH)
-    message(STATUS "C compiler: icx was found in PATH, using icx")
-    set(CMAKE_C_COMPILER "icx")
-  else()
-    if(WIN32)
-      message(STATUS "C compiler: icx was not found in PATH, using clang-cl instead")
-      set(CMAKE_C_COMPILER "clang-cl")
-    else()
-      message(STATUS "C compiler: icx was not found in PATH, using clang instead")
-      set(CMAKE_C_COMPILER "clang")
-    endif()
-  endif()
-endif()
-
-project(oneMKL VERSION 0.2.0 LANGUAGES CXX)
-
-# Override default CXX compile/link lines for Windows after project
-if(WIN32 AND ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-function -w")
-  foreach (flag_var
-           CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-           CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-    string(REPLACE "/MD" "" ${flag_var} "${${flag_var}}")
-  endforeach()
-  set(CMAKE_CXX_COMPILE_OBJECT "<CMAKE_CXX_COMPILER> -fsycl /nologo <DEFINES> <INCLUDES> /EHsc <FLAGS> /Fo<OBJECT> -c <SOURCE>")
-  set(CMAKE_CXX_CREATE_STATIC_LIBRARY "lib /nologo <OBJECTS> /out:<TARGET>")
-  if(CMAKE_VERSION VERSION_LESS "3.25.2")
-    set(CMAKE_CXX_LINK_EXECUTABLE "<CMAKE_CXX_COMPILER> -fsycl -fsycl-device-code-split=per_kernel /nologo <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
-    set(CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> -fsycl -fsycl-device-code-split=per_kernel /nologo <OBJECTS> <LINK_LIBRARIES> /link /out:<TARGET> /implib:<TARGET_IMPLIB> /pdb:<TARGET_PDB> /dll /version:<TARGET_VERSION_MAJOR>.<TARGET_VERSION_MINOR>")
-  endif()
-endif()
-
-# Temporary disable sycl 2020 deprecations warnings for cuSOLVER and rocSOLVER
-if(ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++" AND (ENABLE_ROCSOLVER_BACKEND))
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSYCL2020_DISABLE_DEPRECATION_WARNINGS")
-endif()
-
-# Target domains
-if(NOT TARGET_DOMAINS OR TARGET_DOMAINS STREQUAL "None")
-  # Set to all by default
-  set(TARGET_DOMAINS ${DOMAINS_LIST})
-else()
-  # Make sure the input was converted to list
-  string(REPLACE " " ";" TARGET_DOMAINS ${TARGET_DOMAINS})
-  set(NOT_FOUND 0)
-  foreach(domain ${TARGET_DOMAINS})
-    if(NOT ${domain} IN_LIST DOMAINS_LIST)
-      set(NOT_FOUND 1)
-      break()
-    endif()
-  endforeach()
-  if(NOT_FOUND)
-    message(STATUS "TARGET_DOMAINS contains unsupported options, reset to all")
-    set(TARGET_DOMAINS ${DOMAINS_LIST})
-  endif()
-endif()
-message(STATUS "TARGET_DOMAINS: ${TARGET_DOMAINS}")
-
-# Include Intel oneMKL
-if(ENABLE_MKLGPU_BACKEND OR ENABLE_MKLCPU_BACKEND)
-  set(MKL_ARCH intel64)
-  set(MKL_INTERFACE ilp64)
-  if(ENABLE_MKLCPU_THREAD_TBB)
-    set(MKL_THREADING tbb_thread)
-  else()
-    set(MKL_THREADING sequential)
-  endif()
-  if(BUILD_SHARED_LIBS AND NOT WIN32)
-    set(MKL_LINK dynamic)
-  else()
-    set(MKL_LINK static)
-  endif()
-  # Enable SYCL API
-  set(DPCPP_COMPILER ON)
-  set(SYCL_COMPILER ON)
-  # In case Intel oneMKL package doesn't include MKLConfig,
-  # use MKLConfig from the repo
-  find_package(MKL REQUIRED
-          HINTS ${MKL_ROOT}/lib/cmake
-                ${MKL_ROOT}/lib/cmake/mkl
-                $ENV{MKLROOT}
-                ${PROJECT_SOURCE_DIR}/cmake/mkl)
-endif()
-
-# Set output directories for the project
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-
-# Add CMake Finders
-add_subdirectory(cmake)
-
-# Include general cmake config files
-list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-
-# Add DPC++ options for Linux
-if(WIN32)
-  add_library(ONEMKL::SYCL::SYCL INTERFACE IMPORTED)
-else()
-  # Find necessary packages
-  if(ONEMKL_SYCL_IMPLEMENTATION)
-    string( TOLOWER "${ONEMKL_SYCL_IMPLEMENTATION}" ONEMKL_SYCL_IMPLEMENTATION)
-    if (ONEMKL_SYCL_IMPLEMENTATION STREQUAL "hipsycl")
-      message(STATUS "Looking for hipSYCL")
-      find_package(hipSYCL CONFIG REQUIRED)
-      set(USE_ADD_SYCL_TO_TARGET_INTEGRATION true)
-      set (CMAKE_CXX_STANDARD 17)
-      add_library(ONEMKL::SYCL::SYCL INTERFACE IMPORTED)
-    elseif(ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++")
-      message(STATUS "Looking for dpc++")
-      set(USE_ADD_SYCL_TO_TARGET_INTEGRATION false)
-      find_package(Compiler REQUIRED)
-    else()
-      message(FATAL_ERROR "SYCL implementation ${ONEMKL_SYCL_IMPLEMENTATION} is not known")
-    endif()
-  else()
-    message(STATUS "Looking for dpc++")
-    set(USE_ADD_SYCL_TO_TARGET_INTEGRATION false)
-    find_package(Compiler REQUIRED)
-  endif()
-endif()
-
-if(DEFINED REF_BLAS_ROOT)
-  find_file(REF_BLAS_LIBNAME NAMES blas.dll libblas.so HINTS ${REF_BLAS_ROOT} PATH_SUFFIXES lib lib64)
-  find_file(REF_CBLAS_LIBNAME NAMES cblas.dll libcblas.so HINTS ${REF_BLAS_ROOT} PATH_SUFFIXES lib lib64)
-endif()
-
-# Add source directory and output to bin/
-add_subdirectory(src bin)
-
-# Functional Tests
-if(BUILD_FUNCTIONAL_TESTS OR BUILD_EXAMPLES)
-  enable_testing()
-endif()
-
-if(BUILD_FUNCTIONAL_TESTS)
-  add_subdirectory(tests)
-endif()
-
-# Examples
-if (BUILD_EXAMPLES)
-  add_subdirectory(examples)
-endif()
-
-if(BUILD_DOC)
-  add_subdirectory(docs)
-endif()
-
-install(DIRECTORY include/
-  DESTINATION include
-  COMPONENT Devel
-)
-
-include(CMakePackageConfigHelpers)
-write_basic_package_version_file(
-  "${CMAKE_CURRENT_BINARY_DIR}/oneMKLConfigVersion.cmake"
-  VERSION ${PROJECT_VERSION}
-  COMPATIBILITY AnyNewerVersion
-)
-
-export(EXPORT oneMKLTargets
-  FILE "${CMAKE_CURRENT_BINARY_DIR}/oneMKLTargets.cmake"
-  NAMESPACE ONEMKL::
-)
-configure_file("${PROJECT_SOURCE_DIR}/cmake/oneMKLConfig.cmake"
-  "${CMAKE_CURRENT_BINARY_DIR}/oneMKLConfig.cmake"
-  COPYONLY
-)
-
-set(config_package_location "lib/cmake/${PROJECT_NAME}")
-install(EXPORT oneMKLTargets
-  FILE oneMKLTargets.cmake
-  NAMESPACE MKL::
-  DESTINATION ${config_package_location}
-)
-install(
-  FILES
-  "${PROJECT_SOURCE_DIR}/cmake/oneMKLConfig.cmake"
-  "${CMAKE_CURRENT_BINARY_DIR}/oneMKLConfigVersion.cmake"
-  DESTINATION ${config_package_location}
-  COMPONENT Devel
-)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
deleted file mode 100644
index 9a41383bd..000000000
--- a/CONTRIBUTING.md
+++ /dev/null
@@ -1,325 +0,0 @@
-# Contributing Guidelines
-If you have improvements, new libraries integrated under oneAPI Math Kernel Library (oneMKL) Interfaces, or new interfaces to contribute to the oneMKL Specification, please send us your pull requests! For getting started, see GitHub [howto](https://help.github.com/en/articles/about-pull-requests).
-
-For how to enable a new third-party library, see the [guidelines](docs/create_new_backend.rst).
-
-## Pull Request Checklist
-
-Before sending your pull requests, ensure that you follow this checklist:
-
-* If you are contributing a new interface, refer to the [library functionality guidelines](CONTRIBUTING.md#library-functionality-guidelines). It is strongly advised that you first open an [RFC issue](CONTRIBUTING.md#RFC-issue) with a detailed explanation of the expected use cases.
-
-* Ensure that your code includes proper documentation.
-
-* Ensure that the changes are consistent with the [coding style](CONTRIBUTING.md#coding-style).
-
-* Ensure that [unit tests](CONTRIBUTING.md#unit-tests) pass. Include logs from tests as attachments to the pull request.
-
-## Library Functionality Guidelines
-
-oneMKL focuses on the following criteria:
-
-1. *Performance*: Functionality that has highly optimized and extensively parallelized routines for applications that require maximum performance.
-
-   This means that for a new primitive you should demonstrate that it brings visible performance improvement to some applications.
-
-2. *Generality*: Functionality is useful in a wide range of applications.
-
-    This implies that when you introduce a new function, its API needs to be general enough to be integrated into multiple applications that have similar functionality and that its interface can support multiple hardware (HW).
-
-3. *Complexity*: Functionality that is not trivial to implement directly or by combining existing primitives.
-
-For the new API to become a part of the open source project, it should be accepted as part of [oneMKL spec](https://spec.oneapi.com/versions/latest/elements/oneMKL/source/index.html).
-
-
-### RFC Issue
-
-Open a Request For Comment (RFC) issue when contributing new interfaces. In the RFC, please provide the following details:
-
-* Description of how the new interface meets [library functionality guidelines](CONTRIBUTING.md#library-functionality-guidelines).
-
-* The definition of the function including the interface and semantics, and how this interface will be extendable for different HW implementations.
-
-* What existing libraries have implementations of this function and can be used under the oneMKL interface.
-
-
-## Bug Reporting
-
-If you find a bug or problem, please open a request under [Issues](https://github.com/oneapi-src/oneMKL/issues).
-
-
-## Security Issues
-
-Report security issues to onemkl.maintainers@intel.com.
-
-
-## Coding Style
-
-The general principle is to follow the style of existing/surrounding code. If you are in doubt, use the `clang-format`:
-```sh
-clang-format -style=file -i foo.cpp
-```
-This formats code using the `_clang_format` file found in the oneMKL top-level directory.
-
-
-### GN: General Naming
-* **GN1:** Use snake_case for all type names: classes, structures, enums, template type arguments, type aliases.
-
-* **GN2:** Use snake_case for all variables (global, local, files, function parameters), global and local constants (including constexpr), functions (member, non-member) and enum values.
-
-* **GN3:** Use capitalized SNAKE_CASE only for
-macros.
-
-### GF: General Formatting
-* **GF1:** Each line of text in the code shall be at most 100 characters long.
-
-* **GF2:** Use only spaces, and indent 4 spaces at a time, never use tabs.
-
-* **GF3:** The open curly brace is always on the end of the last line of the statement (type, function, namespace declaration or control flow statement), not the start of the next line.
-```c
-int foo() { // <-- curly brace here
-    do_something();
-}
-
-if (condition) { // <-- curly brace here
-    do_something();
-}
-else { // <-- curly brace here
-    do_something();
-}
-
-if (condition) { // <-- curly brace here
-    do_something();
-} else { // <-- Also possible
-    do_something();
-}
-```
-
-* **GF4:** There is never a space between the parentheses and the parameters in function declaration/invocation or control flow statements.
-
-```c
-// Wrong
-int foo( int arg_1, float arg_2 );
-if ( condition );
-call_foo( value_1, value_2 );
-for ( int i = 0; i < loop_count; i++ );
-
-// Right
-int foo(int arg_1, float arg_2);
-if (condition);
-call_foo(value_1, value_2);
-for (int i = 0; i < loop_count; i++);
-```
-
-### FA: Files
-* **FA1:** Filenames should be lowercase and can include underscores "_".
-
-* **FA2:** C++ header files exposed to the user should end in .hpp.
-
-* **FA3:** C++ source files should end in .cpp.
-
-* **FA4:** All header files shall start with `#pragma once` guards to prevent multiple inclusion, refer to [Structure of Header Files](CONTRIBUTING.md#structure-of-header-files) for more details.
-
-* **FA5:** Each header file shall contain items in the following order:
-  1. Copyright
-  2. Single blank line
-  3. Preprocessor guard
-  4. Single blank line
-  5. Include statements (if there)
-  6. Single blank line if include statements are present
-  7. Global macros* (if any)
-  8. Single blank line if macros statements are present
-  9. Type/function declarations wrapped into namespaces
-
-Note: It is not necessary to put all macro definitions here. Sometimes it is convenient to have macros closer to the place where they are used. For example, sometimes it makes more sense to define macros inside the functions that use them (see Macros for more details). However, if the macro is used throughout the library, put it in header file between includes and the namespace declaration.
-
-* **FA6:** Each header file shall include other header
-files in the following order:
-
-  1. C standard headers
-  2. C++ standard headers
-  3. Single blank line if C/C++ headers are present
-  4. Third party libraries' header files (e.g., SYCL, TBB, OMP, etc.)
-  5. Single blank line if third party headers are present
-  6. Project's header files
-
-### NS: Namespaces
-* **NS1:** Use snake_case: all lowercase, with underscores "_" between words for all namespaces.
-
-* **NS2:** The name of a top-level namespace must be the name of the project (oneMKL).
-
-* **NS3:** Do not indent content inside a namespace scope.
-
-```c
-// Wrong! Do not indent
-namespace oneapi {
-namespace mkl {
-
-   class table { };
-
-} // namespace mkl
-} // namespace oneapi
-
-// Right
-namespace oneapi {
-namespace mkl {
-
-class table { };
-
-} // namespace mkl
-} // namespace oneapi
-```
-
-*  **NS4:** Put each namespace on its own line when declaring nested namespaces.
-
-```c
-#include "oneapi/mkl/blas/path_to_some_header.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-
-/* ... */
-
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-```
-
-
-### FU: Functions
-
-*  **FU1:** Use snake_case: all lowercase, with
-```c
-// Underscores between words for all function names.
-return_type class_name::function_name(type_1 arg_name_1, type_2 arg_name_2) {
-    do_something();
-}
-```
-
-*  **FU2:** There is never a space between the function name (or operator) and the open brace. This rule applies to both function declaration/definitions and calls.
-
-Declaration
-```c
-// Wrong
-void foo (type arg_name);
-void operator() (type arg_name);
-void operator bool ();
-
-// Right
-void foo(type arg_name);
-void operator()(type arg_name);
-void operator bool();
-```
-
-Call
-```c
-// Wrong
-const auto x = foo (arg_1, arg_2);
-
-// Right
-const auto x = foo(arg_1, arg_2);
-```
-
-*  **FU3:** Do not put the function signature and the body on the same line. The only exception is an empty body, in that case place the curly braces at the same line (see rule FU4).
-```c
-// Wrong
-std::int32_t get_something() const { return something_; }
-
-// Right
-std::int32_t get_something() const {
-    return something_;
-}
-```
-
-*  **FU4:** Empty function body shall be at the same line as function signature.
-```c
-// Wrong
-void empty_foo(type arg) {
-}
-
-// Right
-void empty_foo(type arg) {}
-```
-
-
-### CS: Classes and Structures
-
-*  **CS1:** Use snake_case: lower case and all words are separated with underscore character (_).
-```c
-class numeric_table;
-class image;
-struct params;
-```
-
-*  **CS2:** The acceptable formats for initializer lists are when everything fits on one line:
-```c
-my_class::my_class(int var) : some_var_(var) {
-    do_something();
-}
-```
-If the signature and initializer list are not all on one line, you must line wrap before the colon, indent 8 spaces, put each member on its own line, and align them:
-```c
-my_class::my_class(int var)
-        : some_var_(var),             // <-- 8 space indent
-          some_other_var_(var + 1) {  // lined up
-    do_something();
-}
-```
-As with any other code block, the close curly brace can be on the same line as the open curly, if it fits:
-```c
-my_class::my_class(int var)
-        : some_var_(var),
-          another_var_(0) {}
-```
-
-
-### VC: Variables and Constants
-
-* **VC1:** Use snake_case for all variables, function's arguments and constants.
-
-* **VC2:** Use variables and constant names followed by one underscore "_" for private and protected class-level variables.
-
-* **VC3:** The assignment operator "=" shall be surrounded by single whitespace.
-```c
-const auto val = get_some_value();
-```
-
-
-### ST: Statements
-
-*  **ST1:** Each of the keywords
-if/else/do/while/for/switch shall be followed by one space. An open curly brace after the condition shall be prepended with one space.
-```c
-while (condition) { // <-- one space after `while` and one space before `{`
-    do_something();
-} // <-- `;` is not required
-```
-
-*  **ST2:** Each of the keywords if/else/do/while/for/switch shall always have accompanying curly braces even if they contain a single-line statement.
-```c
-// Wrong
-if (my_const == my_var)
-    do_something();
-
-// Right
-if (my_const == my_var) {
-    do_something();
-}
-```
-
-*  **ST3:** The statements within parentheses for operators if, for, while shall have no spaces adjacent to the open and close parentheses characters:
-```c
-// Wrong
-for ( int i = 0; i < loop_size; i++ ) ...;
-
-// Right
-for (int i = 0; i < loop_size; i++) ...;
-```
-
-
-## Unit Tests
-
-oneMKL uses GoogleTest for functional testing. For more information about how to build and run Unit Tests please see [Building and Running Tests](https://oneapi-src.github.io/oneMKL/building_and_running_tests.html).
-
-Be sure to extend the existing tests when fixing an issue, adding a new interface or new implementation under existing interfaces.
diff --git a/README.md b/README.md
index e74e3b5ed..38bd4e2c6 100644
--- a/README.md
+++ b/README.md
@@ -1,602 +1,43 @@
-<img src="https://github.com/uxlfoundation/artwork/blob/main/foundation/uxl-foundation-logo-horizontal-color.png" alt="UXL Foundation Logo" width="250"/>
+# oneAPI Math Kernel Library (oneMKL) Interfaces Design Documents / RFCs
+
+This branch contains design documents for oneMKL Interfaces project-wide changes. The purpose of the Request for Comments (RFC) process is to communicate all major changes in the project prior to the actual implementation and to document the decisions in one place.
+
+All design documents (RFCs) that are approved for implementation should be merged to this branch.
+
+## Document Style
+
+* Every design document should be added as a markdown document
+`rfcs/<YYYYMMDD>-descriptive-but-short-proposal-name/README.md`.
+    * [Optional] For very domain-specific documents, the location could be
+`rfcs/<domain>/<YYYYMMDD>-descriptive-but-short-proposal-name/README.md`
+* In addition to `README.md`, the design document directory can contain any other
+supporting materials: images, formulas, sub-proposals, etc.
+* The recommended document structure:
+[RFC template](rfcs/template.md).
+* The recommended width of the raw text is 80-100 characters;
+long lines make it hard to read the document in the raw format.
+
+
+## RFC Ratification Process
+
+1. Add a new design document as a PR to this repository
+    * Please add a link to the document preview in the PR description,
+e.g. the link for this README in your fork will be
+```
+https://github.com/<USERNAME>/oneMKL/blob/rfcs/README.md
+```
+2. Assign all affected [teams](https://github.com/oneapi-src/oneMKL/blob/develop/README.md#contributing) and individual contributors as reviewers to the PR
+3. Organize an offline review or an architecture meeting in order to collect feedback
+    * It's recommended to keep all feedback as part of the PR review, so it
+will also be documented in one place
+4. If the changes affect the API defined by the [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/), the design document must be reviewed by the [UXL Foundation Math SIG](https://github.com/uxlfoundation/foundation/tree/main/math) and contributed to the [oneAPI specification](https://github.com/uxlfoundation/oneAPI-spec); only after that can the proposed changes be implemented in this project.
+5. Merge the PR when it has all required approvals
+    * It's recommended to add the PR number to the commit message, so it will be easy
+to find the design discussion
+    * It's recommended to update the preview document link in the PR to the merged
+one because the initial link to the local fork/branch will stop working after the local branch is removed,
+e.g. the link for this README will be
+```
+https://github.com/oneapi-src/oneMKL/blob/rfcs/README.md
+```
 
-# oneAPI Math Kernel Library (oneMKL) Interfaces
-
-oneMKL Interfaces is an open-source implementation of the oneMKL Data Parallel C++ (DPC++) interface according to the [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/). It works with multiple devices (backends) using device-specific libraries underneath.
-
-oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org).
-<br/><br/>
-
-<table>
-    <thead>
-        <tr align="center" >
-            <th>User Application</th>
-            <th>oneMKL Layer</th>
-            <th>Third-Party Library</th>
-            <th>Hardware Backend</th>
-        </tr>
-    </thead>
-    <tbody>
-        <tr>
-            <td rowspan=12 align="center">oneMKL interface</td>
-            <td rowspan=12 align="center">oneMKL selector</td>
-            <td align="center"><a href="https://software.intel.com/en-us/oneapi/onemkl">Intel(R) oneAPI Math Kernel Library (oneMKL)</a></td>
-            <td align="center">x86 CPU, Intel GPU</td>
-        </tr>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://developer.nvidia.com/cublas"> NVIDIA cuBLAS</a></td>
-            <td align="center">NVIDIA GPU</td>
-        </tr>
-	<tr>
-            <td align="center"><a href="https://developer.nvidia.com/cusolver"> NVIDIA cuSOLVER</a></td>
-            <td align="center">NVIDIA GPU</td>
-	</tr>
-        <tr>
-            <td align="center"><a href="https://developer.nvidia.com/curand"> NVIDIA cuRAND</a></td>
-            <td align="center">NVIDIA GPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://developer.nvidia.com/cufft"> NVIDIA cuFFT</a></td>
-            <td align="center">NVIDIA GPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://ww.netlib.org"> NETLIB LAPACK</a> </td>
-            <td align="center">x86 CPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://rocblas.readthedocs.io/en/rocm-4.5.2/"> AMD rocBLAS</a></td>
-            <td align="center">AMD GPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://github.com/ROCmSoftwarePlatform/rocSOLVER"> AMD rocSOLVER</a></td>
-            <td align="center">AMD GPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://github.com/ROCmSoftwarePlatform/rocRAND"> AMD rocRAND</a></td>
-            <td align="center">AMD GPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://github.com/ROCmSoftwarePlatform/rocFFT">AMD rocFFT</a></td>
-            <td align="center">AMD GPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://github.com/codeplaysoftware/portBLAS"> portBLAS </a></td>
-            <td align="center">x86 CPU, Intel GPU, NVIDIA GPU, AMD GPU</td>
-        </tr>
-        <tr>
-            <td align="center"><a href="https://github.com/codeplaysoftware/portFFT"> portFFT </a></td>
-            <td align="center">x86 CPU, Intel GPU, NVIDIA GPU, AMD GPU</td>
-        </tr>
-    </tbody>
-</table>
-
-
-## Table of Contents
-
-- [Support and Requirements](#support-and-requirements)
-- [Documentation](#documentation)
-- [FAQs](#faqs)
-- [Legal Information](#legal-information)
-
----
-
-## Support and Requirements
-
-### Supported Usage Models:
-
-#### Host API
-
-There are two oneMKL selector layer implementations:
-
-- **Run-time dispatching**: The application is linked with the oneMKL library and the required backend is loaded at run-time based on device vendor (all libraries should be dynamic).
-
-  Example of app.cpp with run-time dispatching:
-  
-  ```cpp
-  #include "oneapi/mkl.hpp"
-  
-  ...
-  cpu_dev = sycl::device(sycl::cpu_selector());
-  gpu_dev = sycl::device(sycl::gpu_selector());
-  
-  sycl::queue cpu_queue(cpu_dev);
-  sycl::queue gpu_queue(gpu_dev);
-  
-  oneapi::mkl::blas::column_major::gemm(cpu_queue, transA, transB, m, ...);
-  oneapi::mkl::blas::column_major::gemm(gpu_queue, transA, transB, m, ...);
-  ```
-  How to build an application with run-time dispatching:
-  
-  if OS is Linux, use icpx compiler. If OS is Windows, use icx compiler.
-  Linux example:
-  ```cmd
-  $> icpx -fsycl –I$ONEMKL/include app.cpp
-  $> icpx -fsycl app.o –L$ONEMKL/lib –lonemkl
-  ```
-
-- **Compile-time dispatching**: The application uses a templated backend selector API where the template parameters specify the required backends and third-party libraries and the application is linked with the required oneMKL backend wrapper libraries (libraries can be static or dynamic).
-
-  Example of app.cpp with compile-time dispatching:
-  
-  ```cpp
-  #include "oneapi/mkl.hpp"
-  
-  ...
-  cpu_dev = sycl::device(sycl::cpu_selector());
-  gpu_dev = sycl::device(sycl::gpu_selector());
-  
-  sycl::queue cpu_queue(cpu_dev);
-  sycl::queue gpu_queue(gpu_dev);
-  
-  oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector(cpu_queue);
-  
-  oneapi::mkl::blas::column_major::gemm(cpu_selector, transA, transB, m, ...);
-  oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas> {gpu_queue}, transA, transB, m, ...);
-  ```
-  How to build an application with compile-time dispatching:
-  
-  ```cmd
-  $> clang++ -fsycl –I$ONEMKL/include app.cpp
-  $> clang++ -fsycl app.o –L$ONEMKL/lib –lonemkl_blas_mklcpu –lonemkl_blas_cublas
-  ```
-  
-*Refer to [Selecting a Compiler](https://oneapi-src.github.io/oneMKL/selecting_a_compiler.html) for the choice between `icpx/icx` and `clang++` compilers.*
-
-#### Device API
-
-Header-based and backend-independent Device API can be called within ```sycl kernel``` or work from Host code ([device-rng-usage-model-example](https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/rng/device_api/device-rng-usage-model.html#id2)). Currently, the following domains support the Device API:
-
-- **RNG**. To use RNG Device API functionality it's required to include ```oneapi/mkl/rng/device.hpp``` header file.
-
-### Supported Configurations:
-
-Supported domains include: BLAS, LAPACK, RNG, DFT, SPARSE_BLAS
-
-Supported compilers include:
-- [Intel(R) oneAPI DPC++ Compiler](https://software.intel.com/en-us/oneapi/dpc-compiler): Intel proprietary compiler that supports CPUs and Intel GPUs. Intel(R) oneAPI DPC++ Compiler will be referred to as "Intel DPC++" in the "Supported Compiler" column of the tables below.
-- [oneAPI DPC++ Compiler](https://github.com/intel/llvm): Open source compiler that supports CPUs and Intel, NVIDIA, and AMD GPUs. oneAPI DPC++ Compiler will be referred to as "Open DPC++" in the "Supported Compiler" column of the tables below.
-- [AdaptiveCpp Compiler](https://github.com/AdaptiveCpp/AdaptiveCpp) (formerly known as hipSYCL): Open source compiler that supports CPUs and Intel, NVIDIA, and AMD GPUs.</br>**Note**: The source code and some documents in this project still use the previous name hipSYCL during this transition period.
-
-#### Linux*
-
-<table>
-    <thead>
-        <tr align="center" >
-            <th>Domain</th>
-            <th>Backend</th>
-            <th>Library</th>
-            <th>Supported Compiler</th>		
-            <th>Supported Link Type</th>
-        </tr>
-    </thead>
-    <tbody>
-        <tr>
-            <td rowspan=9 align="center">BLAS</td>
-            <td rowspan=3 align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</br>AdaptiveCpp</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">NETLIB LAPACK</td>
-            <td align="center">Intel DPC++</br>Open DPC++</br>AdaptiveCpp</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portBLAS</td>
-            <td align="center">Intel DPC++</br>Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portBLAS</td>
-            <td align="center">Intel DPC++</br>Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">NVIDIA GPU</td>
-            <td align="center">NVIDIA cuBLAS</td>
-            <td align="center">Open DPC++</br>AdaptiveCpp</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portBLAS</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">AMD GPU</td>
-            <td align="center">AMD rocBLAS</td>
-            <td align="center">Open DPC++</br>AdaptiveCpp</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portBLAS</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=4 align="center">LAPACK</td>
-            <td align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">NVIDIA GPU</td>
-            <td align="center">NVIDIA cuSOLVER</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">AMD GPU</td>
-            <td align="center">AMD rocSOLVER</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=4 align="center">RNG</td>
-            <td align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</br>AdaptiveCpp</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">NVIDIA GPU</td>
-            <td align="center">NVIDIA cuRAND</td>
-            <td align="center">Open DPC++</br>AdaptiveCpp</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">AMD GPU</td>
-            <td align="center">AMD rocRAND</td>
-            <td align="center">Open DPC++</br>AdaptiveCpp</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=8 align="center">DFT</td>
-            <td rowspan=2 align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portFFT (<a href="https://github.com/codeplaysoftware/portFFT#supported-configurations">limited API support</a>)</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portFFT (<a href="https://github.com/codeplaysoftware/portFFT#supported-configurations">limited API support</a>)</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">NVIDIA GPU</td>
-            <td align="center">NVIDIA cuFFT</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portFFT (<a href="https://github.com/codeplaysoftware/portFFT#supported-configurations">limited API support</a>)</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">AMD GPU</td>
-            <td align="center">AMD rocFFT</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">portFFT (<a href="https://github.com/codeplaysoftware/portFFT#supported-configurations">limited API support</a>)</td>
-            <td align="center">Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">SPARSE_BLAS</td>
-            <td align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-    </tbody>
-</table>
-
-#### Windows*
-
-<table>
-    <thead>
-        <tr align="center" >
-            <th>Domain</th>
-            <th>Backend</th>
-            <th>Library</th>
-            <th>Supported Compiler</th>	
-            <th>Supported Link Type</th>
-        </tr>
-    </thead>
-    <tbody>
-        <tr>
-            <td rowspan=3 align="center">BLAS</td>
-            <td rowspan=2 align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">NETLIB LAPACK</td>
-            <td align="center">Intel DPC++</br>Open DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">LAPACK</td>
-            <td align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td rowspan=2 align="center">RNG</td>
-            <td align="center">x86 CPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-        <tr>
-            <td align="center">Intel GPU</td>
-            <td align="center">Intel(R) oneMKL</td>
-            <td align="center">Intel DPC++</td>
-            <td align="center">Dynamic, Static</td>
-        </tr>
-    </tbody>
-</table>
-
----
-
-### Hardware Platform Support
-
-- CPU
-    - Intel Atom(R) Processors
-    - Intel(R) Core(TM) Processor Family
-    - Intel(R) Xeon(R) Processor Family
-- Accelerators
-    - Intel(R) Arc(TM) A-Series Graphics
-    - Intel(R) Data Center GPU Max Series
-    - NVIDIA(R) A100 (Linux* only)
-    - AMD(R) GPUs see [here](https://github.com/RadeonOpenCompute/ROCm#hardware-and-software-support) tested on AMD Vega 20 (gfx906)
-    
----
-### Supported Operating Systems
-
-#### Linux*
-
-Backend | Supported Operating System
-:--- | :---
-x86 CPU    | Red Hat Enterprise Linux* 9 (RHEL* 9)
-Intel GPU  | Ubuntu 22.04 LTS
-NVIDIA GPU | Ubuntu 22.04 LTS
-
-#### Windows*
-
-Backend | Supported Operating System
-:--- | :---
-x86 CPU   | Microsoft Windows* Server 2022
-Intel GPU | Microsoft Windows* 11
----
-
-### Software Requirements
-
-**What should I download?**
-
-#### General:
-<table>
-    <thead>
-        <tr align="center">
-            <th> Functional Testing </th>
-            <th> Build Only </th>
-            <th>Documentation</th>
-        </tr>
-    </thead>
-    <tbody>
-        <tr>
-            <td colspan=3 align=center> CMake (version 3.13 or newer) </td>
-        </tr>
-        <tr>
-            <td colspan=3 align=center> Linux* : GNU* GCC 5.1 or higher <br> Windows* : MSVS* 2017 or MSVS* 2019 (version 16.5 or newer) </td>
-        </tr>
-        <tr>
-            <tr>
-                <td colspan=3 align=center> Ninja (optional) </td>
-            </tr>
-            <tr>
-                <td> GNU* FORTRAN Compiler </td>
-                <td> - </td>
-                <td> Sphinx </td>
-            </tr>
-            <tr>
-                <td> NETLIB LAPACK </td>
-                <td> - </td>
-                <td> - </td>
-            </tr>
-        </tr>
-    </tbody>
-</table>
-
-#### Hardware and OS Specific:
-<table>
-    <thead>
-        <tr align="center">
-            <th>Operating System</th>
-            <th>Device</th>
-            <th>Package</th>
-        </tr>
-    </thead>
-    <tbody>
-        <td rowspan=5> Linux*/Windows* </td>
-        <td rowspan=2> x86 CPU </td>
-        <td> Intel(R) oneAPI DPC++ Compiler <br> or <br> oneAPI DPC++ Compiler </td>
-        <tr>
-            <td> Intel(R) oneAPI Math Kernel Library </td>
-        </tr>
-        <td rowspan=3> Intel GPU </td>
-        <td> Intel(R) oneAPI DPC++ Compiler </td>
-        <tr>
-            <td> Intel GPU driver </td>
-        </tr>
-        <tr>
-            <td> Intel(R) oneAPI Math Kernel Library </td>
-        </tr>
-        <td rowspan=2> Linux* only </td>
-        <td> NVIDIA GPU </td>
-        <td> oneAPI DPC++ Compiler <br> or <br> AdaptiveCpp with CUDA backend and dependencies </td>
-        <tr>
-            <td> AMD GPU </td>
-            <td> oneAPI DPC++ Compiler <br> or <br> AdaptiveCpp with ROCm backend and dependencies </td>
-        </tr>
-    </tbody>
-</table>
-
-#### Product and Version Information:
-
-Product | Supported Version | License
-:--- | :--- | :---
-[CMake](https://cmake.org/download/) | 3.13 or higher | [The OSI-approved BSD 3-clause License](https://gitlab.kitware.com/cmake/cmake/raw/master/Copyright.txt)
-[Ninja](https://ninja-build.org/) | 1.10.0 | [Apache License v2.0](https://github.com/ninja-build/ninja/blob/master/COPYING)
-[GNU* FORTRAN Compiler](https://gcc.gnu.org/wiki/GFortran) | 7.4.0 or higher | [GNU General Public License, version 3](https://gcc.gnu.org/onlinedocs/gcc-7.5.0/gfortran/Copying.html)
-[Intel(R) oneAPI DPC++ Compiler](https://software.intel.com/en-us/oneapi/dpc-compiler) | Latest | [End User License Agreement for the Intel(R) Software Development Products](https://software.intel.com/en-us/license/eula-for-intel-software-development-products)
-[AdaptiveCpp](https://github.com/AdaptiveCpp/AdaptiveCpp) | Later than [2cfa530](https://github.com/AdaptiveCpp/AdaptiveCpp/commit/2cfa5303fd88b8f84e539b5bb6ed41e49c6d6118) | [BSD-2-Clause License ](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/LICENSE)
-[oneAPI DPC++ Compiler binary for x86 CPU](https://github.com/intel/llvm/releases) | Daily builds | [Apache License v2](https://github.com/intel/llvm/blob/sycl/sycl/LICENSE.TXT)
-[oneAPI DPC++ Compiler source for NVIDIA and AMD GPUs](https://github.com/intel/llvm) | Daily source releases | [Apache License v2](https://github.com/intel/llvm/blob/sycl/sycl/LICENSE.TXT)
-[Intel(R) oneAPI Math Kernel Library](https://software.intel.com/en-us/oneapi/onemkl) | Latest | [Intel Simplified Software License](https://software.intel.com/en-us/license/intel-simplified-software-license)
-[NVIDIA CUDA SDK](https://developer.nvidia.com/hpc-sdk) | 12.0 | [End User License Agreement](https://docs.nvidia.com/cuda/eula/index.html)
-[AMD rocBLAS](https://github.com/ROCm/rocblas) | 4.5 | [AMD License](https://github.com/ROCm/rocBLAS/blob/develop/LICENSE.md)
-[AMD rocRAND](https://github.com/ROCm/rocRAND) | 5.1.0 | [AMD License](https://github.com/ROCm/rocRAND/blob/develop/LICENSE.txt)
-[AMD rocSOLVER](https://github.com/ROCm/rocSOLVER) | 5.0.0 | [AMD License](https://github.com/ROCm/rocSOLVER/blob/develop/LICENSE.md)
-[AMD rocFFT](https://github.com/ROCm/rocFFT) | rocm-5.4.3 | [AMD License](https://github.com/ROCm/rocFFT/blob/rocm-5.4.3/LICENSE.md)
-[NETLIB LAPACK](https://www.netlib.org/) | [5d4180c](https://github.com/Reference-LAPACK/lapack/commit/5d4180cf8288ae6ad9a771d18793d15bd0c5643c) | [BSD like license](http://www.netlib.org/lapack/LICENSE.txt)
-[portBLAS](https://github.com/codeplaysoftware/portBLAS) | 0.1 | [Apache License v2.0](https://github.com/codeplaysoftware/portBLAS/blob/main/LICENSE)
-[portFFT](https://github.com/codeplaysoftware/portFFT) | 0.1 | [Apache License v2.0](https://github.com/codeplaysoftware/portFFT/blob/main/LICENSE)
-
----
-
-## Documentation
-- [Contents](https://oneapi-src.github.io/oneMKL/)
-- [About](https://oneapi-src.github.io/oneMKL/introduction.html)
-- Get Started
-  - [Selecting a Compiler](https://oneapi-src.github.io/oneMKL/selecting_a_compiler.html)
-  - [Building the Project with DPC++](https://oneapi-src.github.io/oneMKL/building_the_project_with_dpcpp.html)
-  - [Building the Project with AdaptiveCpp](https://oneapi-src.github.io/oneMKL/building_the_project_with_adaptivecpp.html)
-- Developer Reference
-  - [oneMKL Defined Datatypes](https://oneapi-src.github.io/oneMKL/onemkl-datatypes.html)
-  - [Dense Linear Algebra](https://oneapi-src.github.io/oneMKL/domains/dense_linear_algebra.html)
-- [Integrating a Third-Party Library](https://oneapi-src.github.io/oneMKL/create_new_backend.html)
-
----
-
-## Governance
-
-The oneMKL Interfaces project is governed by the UXL Foundation and you can get involved in this project in multiple ways. It is possible to join the [Math Special Interest Group (SIG)](https://github.com/uxlfoundation/foundation/tree/main/math) meetings where the group discusses and demonstrates work using this project. Members can also join the Open Source and Specification Working Group meetings.
-
-You can also join the mailing lists for the [UXL Foundation](https://lists.uxlfoundation.org/g/main/subgroups) to be informed of when meetings are happening and receive the latest information and discussions.
-
----
-
-## Contributing
-
-You can contribute to this project and also contribute to [the specification for this project](https://spec.oneapi.io/versions/latest/elements/oneMKL/source/index.html). Please read the [CONTRIBUTING](CONTRIBUTING.md) page for more information. You can also contact oneMKL developers and maintainers via [UXL Foundation Slack](https://slack-invite.uxlfoundation.org/) using [#onemkl](https://uxlfoundation.slack.com/archives/onemkl) channel.
-
-For GitHub questions, issues, RFCs, or PRs you can contact maintainers via one of the following GitHub teams based on the topic:
-
-| GitHub team name | Description |
-:-----------|:------------|
-| @oneapi-src/onemkl-maintain  | All oneMKL maintainers |
-| @oneapi-src/onemkl-arch-write | oneMKL Architecture maintainers |
-| @oneapi-src/onemkl-blas-write | oneMKL BLAS maintainers |
-| @oneapi-src/onemkl-dft-write | oneMKL DFT maintainers |
-| @oneapi-src/onemkl-lapack-write) | oneMKL LAPACK maintainers |
-| @oneapi-src/onemkl-rng-write | oneMKL RNG maintainers |
-| @oneapi-src/onemkl-sparse-write | oneMKL Sparse Algebra maintainers |
-| @oneapi-src/onemkl-vm-write | oneMKL Vector Math maintainers |
-
----
-
-## License
-
-Distributed under the Apache license 2.0. See [LICENSE](LICENSE) for more information.
-
----
-
-## FAQs
-
-### oneMKL
-
-**Q: What is the difference between the following oneMKL items?**
-   - The [oneAPI Specification for oneMKL](https://spec.oneapi.com/versions/latest/index.html)
-   - The [oneAPI Math Kernel Library (oneMKL) Interfaces](https://github.com/oneapi-src/oneMKL) Project
-   - The [Intel(R) oneAPI Math Kernel Library (oneMKL)](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html) Product
-
-**A:**
-- The [oneAPI Specification for oneMKL](https://spec.oneapi.com/versions/latest/index.html) defines the DPC++ interfaces for performance math library functions. The oneMKL specification can evolve faster and more frequently than implementations of the specification.
-
-- The [oneAPI Math Kernel Library (oneMKL) Interfaces](https://github.com/oneapi-src/oneMKL) Project is an open source implementation of the specification. The project goal is to demonstrate how the DPC++ interfaces documented in the oneMKL specification can be implemented for any math library and work for any target hardware. While the implementation provided here may not yet be the full implementation of the specification, the goal is to build it out over time. We encourage the community to contribute to this project and help to extend support to multiple hardware targets and other math libraries.
-
-- The [Intel(R) oneAPI Math Kernel Library (oneMKL)](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html) product is the Intel product implementation of the specification (with DPC++ interfaces) as well as similar functionality with C and Fortran interfaces, and is provided as part of Intel® oneAPI Base Toolkit. It is highly optimized for Intel CPU and Intel GPU hardware.
-
-**Q: I'm trying to use oneMKL Interfaces in my project using `FetchContent`**, but I keep running into `ONEMKL::SYCL::SYCL target was not found` problem when I try to build the project. What should I do?
-
-**A:**
-Make sure you set the compiler when you configure your project.
-E.g. `cmake -Bbuild . -DCMAKE_CXX_COMPILER=icpx`.
-
-**Q: I'm trying to use oneMKL Interfaces in my project using `find_package(oneMKL)`.** I set oneMKL/oneTBB and Compiler environment first, then I built and installed oneMKL Interfaces, and finally I tried to build my project using installed oneMKL Interfaces (e.g. like this `cmake -Bbuild -GNinja -DCMAKE_CXX_COMPILER=icpx -DoneMKL_ROOT=<path_to_installed_oneMKL_interfaces> .`) and I noticed that cmake includes installed oneMKL Interfaces headers as a system include which ends up as a lower priority than the installed oneMKL package includes which I set before for building oneMKL Interfaces. As a result, I get conflicts between oneMKL and installed oneMKL Interfaces headers. What should I do?
-
-**A:**
-Having installed oneMKL Interfaces headers as `-I` instead on system includes (as `-isystem`) helps to resolve this problem. We use `INTERFACE_INCLUDE_DIRECTORIES` to add paths to installed oneMKL Interfaces headers (check `oneMKLTargets.cmake` in `lib/cmake` to find it). It's a known limitation that `INTERFACE_INCLUDE_DIRECTORIES` puts headers paths as system headers. To avoid that:
-- Option 1: Use CMake >=3.25. In this case oneMKL Interfaces will be built with `EXPORT_NO_SYSTEM` property set to `true` and you won't see the issue.
-- Option 2: If you use CMake < 3.25, set `PROPERTIES NO_SYSTEM_FROM_IMPORTED true` for your target. E.g: `set_target_properties(test PROPERTIES NO_SYSTEM_FROM_IMPORTED true)`.
-
----
-
-
-#### [Legal information](legal_information.md)
diff --git a/SECURITY.md b/SECURITY.md
deleted file mode 100644
index 480361d12..000000000
--- a/SECURITY.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Security Policy
-As an open-source project, we understand the importance of and responsibility
-for security. This Security policy outlines our guidelines and procedures for
-ensuring the highest level of Security and trust for our users who consume
-oneMKL Interfaces.
-
-## Supported Versions
-We provide support for the [latest version][1] only.
-Security vulnerabilities can be fixed in a patch release on top of the latest version. Prior major releases might receive critical security fixes on a best-effort basis; however, we cannot guarantee that security fixes will get back-ported.
-
-## Report a Vulnerability
-We are very grateful to the security researchers and users that report back
-security vulnerabilities. We investigate every report thoroughly.
-We strongly encourage you to report security vulnerabilities to us privately,
-before disclosing them on public forums or opening a public GitHub issue. 
-Report a vulnerability to us in one of two ways:
-* Open a draft [**GitHub Security Advisory**][2]
-* Send e-mail to the following address: **security@uxlfoundation.org**.
-Along with the report, please include the following info:
-  * A descriptive title.
-  * Your name and affiliation (if any).
-  * A description of the technical details of the vulnerabilities.
-  * A minimal example of the vulnerability so we can reproduce your findings.
-  * An explanation of who can exploit this vulnerability, and what they gain
-  when doing so. 
-  * Whether this vulnerability is public or known to third parties. If it is,
-  please provide details.
-
-### When Should I Report a Vulnerability?
-* You think you discovered a potential security vulnerability in oneMKL Interfaces.
-* You are unsure how the potential vulnerability affects oneMKL Interfaces.
-* You think you discovered a vulnerability in another project or 3rd party
-component on which oneMKL Interfaces depends. If the issue is not fixed in the 3rd party
-component, try to report directly there first.
-
-### When Should I NOT Report a Vulnerability?
-* You got an automated scan hit and are unable to provide details.
-* You need help using oneMKL Interfaces for security.
-* You need help applying security-related updates.
-* Your issue is not security-related.
-
-## Security Reports Review Process
-Our goal is to respond quickly to your inquiry, and to coordinate a fix and
-disclosure with you. All confirmed security vulnerabilities will be addressed
-according to severity level and impact on oneMKL Interfaces. Normally, security issues
-are fixed in the next planned release.
-
-## Disclosure Policy
-We will publish security advisories using the 
-[**GitHub Security Advisories feature**][3]
-to keep our community well-informed, and will credit you for your findings
-unless you prefer to stay anonymous. We request that you refrain from
-exploiting the vulnerability or making it public before the official disclosure.
-
-We will disclose the vulnerabilities and/or bugs as soon as possible once
-mitigation is implemented and available. 
-
-## Feedback on This Policy
-If you have any suggestions on how this Policy could be improved, please submit
-an issue or a pull request to this repository. Please **do not** report
-potential vulnerabilities or security flaws via a pull request.
-
-[1]: https://github.com/oneapi-src/oneMKL/releases/latest
-[2]: https://github.com/oneapi-src/oneMKL/security/advisories/new
-[3]: https://github.com/oneapi-src/oneMKL/security/advisories
\ No newline at end of file
diff --git a/_clang-format b/_clang-format
deleted file mode 100644
index 37a50f367..000000000
--- a/_clang-format
+++ /dev/null
@@ -1,164 +0,0 @@
-#===============================================================================
-# Copyright 2016-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
----
-Language: Cpp
-AccessModifierOffset: -4
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignConsecutiveMacros: true
-AlignEscapedNewlines: Left
-AlignOperands: true
-AlignTrailingComments: false
-AllowAllArgumentsOnNextLine: true
-AllowAllConstructorInitializersOnNextLine: false
-AllowAllParametersOfDeclarationOnNextLine: false
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: Empty
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLambdasOnASingleLine: Inline
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
-  AfterCaseLabel: false
-  AfterClass: false
-  AfterControlStatement: false
-  AfterEnum: false
-  AfterFunction: false
-  AfterNamespace: false
-  AfterObjCDeclaration: false
-  AfterStruct: false
-  AfterUnion: false
-  AfterExternBlock: false
-  BeforeCatch: true
-  BeforeElse: true
-  IndentBraces: false
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Custom
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: BeforeColon
-BreakInheritanceList: BeforeColon
-BreakStringLiterals: false
-ColumnLimit: 100
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 8
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: false
-DerivePointerAlignment: true
-DisableFormat: false
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-IncludeBlocks: Preserve
-IncludeCategories:
-  - Regex:           '^<ext/.*\.h>'
-    Priority:        2
-  - Regex:           '^<.*\.h>'
-    Priority:        1
-  - Regex:           '^<.*'
-    Priority:        2
-  - Regex:           '.*'
-    Priority:        3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 4
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-RawStringFormats:
-  - Language: Cpp
-    Delimiters:
-      - cc
-      - CC
-      - cpp
-      - Cpp
-      - CPP
-      - 'c++'
-      - 'C++'
-    CanonicalDelimiter: ''
-    BasedOnStyle: google
-  - Language: TextProto
-    Delimiters:
-      - pb
-      - PB
-      - proto
-      - PROTO
-    EnclosingFunctions:
-      - EqualsProto
-      - EquivToProto
-      - PARSE_PARTIAL_TEXT_PROTO
-      - PARSE_TEST_PROTO
-      - PARSE_TEXT_PROTO
-      - ParseTextOrDie
-      - ParseTextProtoOrDie
-    CanonicalDelimiter: ''
-    BasedOnStyle: google
-ReflowComments: false
-SortIncludes: false
-SortUsingDeclarations: false
-SpaceAfterCStyleCast: false
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 1
-SpacesInAngles: false
-SpacesInContainerLiterals: false
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Cpp11
-StatementMacros:
-  - Q_UNUSED
-  - QT_REQUIRE_VERSION
-TabWidth: 1
-UseTab: Never
-...
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
deleted file mode 100644
index df7d2fc4c..000000000
--- a/cmake/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Collect the CMake helper modules that ship with the installed package.
-# FindCompiler.cmake is always installed; MKLConfig.cmake is only added
-# when one of the Intel oneMKL backends (CPU or GPU) is enabled.
-set(_onemkl_installed_cmake_modules FindCompiler.cmake)
-if(ENABLE_MKLGPU_BACKEND OR ENABLE_MKLCPU_BACKEND)
-  list(APPEND _onemkl_installed_cmake_modules mkl/MKLConfig.cmake)
-endif()
-
-# All helper modules land flat in the same package configuration directory.
-install(FILES ${_onemkl_installed_cmake_modules} DESTINATION "lib/cmake/${PROJECT_NAME}")
diff --git a/cmake/FindCBLAS.cmake b/cmake/FindCBLAS.cmake
deleted file mode 100644
index b6515854d..000000000
--- a/cmake/FindCBLAS.cmake
+++ /dev/null
@@ -1,37 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-include_guard()
-
-# find_package_handle_standard_args() is provided by this module, so it must be
-# loaded before the first call below rather than at the end of the file.
-include(FindPackageHandleStandardArgs)
-
-# Locate the CBLAS and BLAS libraries, preferring REF_BLAS_ROOT when it is set.
-# Each library is validated right after it is searched for.
-find_library(CBLAS_file NAMES cblas.dll.lib cblas.lib cblas HINTS ${REF_BLAS_ROOT} PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(CBLAS REQUIRED_VARS CBLAS_file)
-find_library(BLAS_file NAMES blas.dll.lib blas.lib blas HINTS ${REF_BLAS_ROOT} PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(CBLAS REQUIRED_VARS BLAS_file)
-
-# Directory holding the CBLAS library (used for the rpath entry) and the
-# directory containing cblas.h.
-get_filename_component(CBLAS_LIB_DIR ${CBLAS_file} DIRECTORY)
-find_path(CBLAS_INCLUDE cblas.h HINTS ${REF_BLAS_ROOT} PATH_SUFFIXES include)
-
-# Assemble the link line: an rpath entry on UNIX, then CBLAS followed by BLAS.
-if(UNIX)
-  list(APPEND CBLAS_LINK "-Wl,-rpath,${CBLAS_LIB_DIR}")
-endif()
-list(APPEND CBLAS_LINK ${CBLAS_file})
-list(APPEND CBLAS_LINK ${BLAS_file})
-
-# Final check: the package is found only if both the header path and the link line are set.
-find_package_handle_standard_args(CBLAS REQUIRED_VARS CBLAS_INCLUDE CBLAS_LINK)
diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake
deleted file mode 100644
index 265719bf0..000000000
--- a/cmake/FindCompiler.cmake
+++ /dev/null
@@ -1,72 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-include_guard()
-include(CheckCXXCompilerFlag)
-include(FindPackageHandleStandardArgs)
-
-# Nothing to configure unless the active C++ compiler accepts -fsycl.
-check_cxx_compiler_flag("-fsycl" is_dpcpp)
-if(NOT is_dpcpp)
-  return()
-endif()
-
-# Workaround for internal compiler error during linking if -fsycl is used:
-# locate the SYCL runtime library explicitly so it can be linked by path.
-get_filename_component(SYCL_BINARY_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
-find_library(SYCL_LIBRARY NAMES sycl PATHS "${SYCL_BINARY_DIR}/../lib" "${SYCL_BINARY_DIR}/lib" ENV LIBRARY_PATH ENV PATH)
-if(NOT SYCL_LIBRARY)
-  message(FATAL_ERROR "SYCL library is not found in ${SYCL_BINARY_DIR}/../lib, PATH, and LIBRARY_PATH")
-endif()
-
-add_library(ONEMKL::SYCL::SYCL INTERFACE IMPORTED)
-
-# Non-UNIX platforms only get the compile flag and the SYCL library.
-if(NOT UNIX)
-  set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
-    INTERFACE_COMPILE_OPTIONS "-fsycl"
-    INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY})
-  return()
-endif()
-
-# UNIX: start from plain -fsycl and extend with device targets when a CUDA or
-# ROCm backend is enabled. With no such backend both lists stay at "-fsycl".
-set(UNIX_INTERFACE_COMPILE_OPTIONS -fsycl)
-set(UNIX_INTERFACE_LINK_OPTIONS -fsycl)
-
-# Check if the Nvidia target is supported. PortFFT uses this for choosing default configuration.
-check_cxx_compiler_flag("-fsycl -fsycl-targets=nvptx64-nvidia-cuda" dpcpp_supports_nvptx64)
-
-if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND)
-  list(APPEND UNIX_INTERFACE_COMPILE_OPTIONS -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda)
-  list(APPEND UNIX_INTERFACE_LINK_OPTIONS -fsycl-targets=nvptx64-nvidia-cuda)
-elseif(ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND)
-  list(APPEND UNIX_INTERFACE_COMPILE_OPTIONS
-    -fsycl-targets=amdgcn-amd-amdhsa -fsycl-unnamed-lambda
-    -Xsycl-target-backend --offload-arch=${HIP_TARGETS})
-  list(APPEND UNIX_INTERFACE_LINK_OPTIONS
-    -fsycl-targets=amdgcn-amd-amdhsa
-    -Xsycl-target-backend --offload-arch=${HIP_TARGETS})
-endif()
-
-set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES
-  INTERFACE_COMPILE_OPTIONS "${UNIX_INTERFACE_COMPILE_OPTIONS}"
-  INTERFACE_LINK_OPTIONS "${UNIX_INTERFACE_LINK_OPTIONS}"
-  INTERFACE_LINK_LIBRARIES ${SYCL_LIBRARY})
diff --git a/cmake/FindLAPACKE.cmake b/cmake/FindLAPACKE.cmake
deleted file mode 100644
index 42a5b7df3..000000000
--- a/cmake/FindLAPACKE.cmake
+++ /dev/null
@@ -1,43 +0,0 @@
-#===============================================================================
-# Copyright 2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-include_guard()
-
-# find_package_handle_standard_args() is provided by this module, so it must be
-# loaded before the first call below rather than at the end of the file.
-include(FindPackageHandleStandardArgs)
-
-# Locate the 64-suffixed LAPACKE, LAPACK, CBLAS and BLAS libraries, preferring
-# REF_LAPACK_ROOT when it is set. Each library is validated right after it is searched for.
-find_library(LAPACKE64_file NAMES lapacke64.dll.lib lapacke64.lib lapacke64 HINTS ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(LAPACKE REQUIRED_VARS LAPACKE64_file)
-find_library(LAPACK64_file NAMES lapack64.dll.lib lapack64.lib lapack64 HINTS ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(LAPACKE REQUIRED_VARS LAPACK64_file)
-find_library(CBLAS64_file NAMES cblas64.dll.lib cblas64.lib cblas64 HINTS ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(LAPACKE REQUIRED_VARS CBLAS64_file)
-find_library(BLAS64_file NAMES blas64.dll.lib blas64.lib blas64 HINTS ${REF_LAPACK_ROOT} PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(LAPACKE REQUIRED_VARS BLAS64_file)
-
-# Directory holding the LAPACKE library (used for the rpath entry) and the
-# directory containing lapacke.h.
-get_filename_component(LAPACKE64_LIB_DIR ${LAPACKE64_file} DIRECTORY)
-find_path(LAPACKE_INCLUDE lapacke.h HINTS ${REF_LAPACK_ROOT} PATH_SUFFIXES include)
-
-# Assemble the link line: an rpath entry on UNIX, then LAPACKE, LAPACK, CBLAS, BLAS.
-if(UNIX)
-    list(APPEND LAPACKE_LINK "-Wl,-rpath,${LAPACKE64_LIB_DIR}")
-endif()
-list(APPEND LAPACKE_LINK ${LAPACKE64_file})
-list(APPEND LAPACKE_LINK ${LAPACK64_file})
-list(APPEND LAPACKE_LINK ${CBLAS64_file})
-list(APPEND LAPACKE_LINK ${BLAS64_file})
-
-# Final check: the package is found only if both the header path and the link line are set.
-find_package_handle_standard_args(LAPACKE REQUIRED_VARS LAPACKE_INCLUDE LAPACKE_LINK)
diff --git a/cmake/FindNETLIB.cmake b/cmake/FindNETLIB.cmake
deleted file mode 100644
index a066f4043..000000000
--- a/cmake/FindNETLIB.cmake
+++ /dev/null
@@ -1,41 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-include_guard()
-include(FindPackageHandleStandardArgs)
-
-# CBLAS library, searched under REF_BLAS_ROOT first.
-find_library(NETLIB_CBLAS_LIBRARY
-  NAMES cblas.dll.lib cblas.lib cblas
-  HINTS ${REF_BLAS_ROOT}
-  PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(NETLIB REQUIRED_VARS NETLIB_CBLAS_LIBRARY)
-
-# BLAS library, searched the same way.
-find_library(NETLIB_BLAS_LIBRARY
-  NAMES blas.dll.lib blas.lib blas
-  HINTS ${REF_BLAS_ROOT}
-  PATH_SUFFIXES lib lib64)
-find_package_handle_standard_args(NETLIB REQUIRED_VARS NETLIB_BLAS_LIBRARY)
-
-# Header location and the directory of the CBLAS library (for the rpath entry).
-find_path(NETLIB_INCLUDE cblas.h HINTS ${REF_BLAS_ROOT} PATH_SUFFIXES include)
-get_filename_component(NETLIB_LIB_DIR ${NETLIB_CBLAS_LIBRARY} DIRECTORY)
-
-# Link line: rpath entry on UNIX, then CBLAS followed by BLAS.
-if(UNIX)
-  list(APPEND NETLIB_LINK "-Wl,-rpath,${NETLIB_LIB_DIR}")
-endif()
-list(APPEND NETLIB_LINK ${NETLIB_CBLAS_LIBRARY} ${NETLIB_BLAS_LIBRARY})
-
-find_package_handle_standard_args(NETLIB REQUIRED_VARS NETLIB_INCLUDE NETLIB_LINK)
-
-# Imported target whose location is the CBLAS library found above.
-add_library(ONEMKL::NETLIB::NETLIB UNKNOWN IMPORTED)
-set_property(TARGET ONEMKL::NETLIB::NETLIB PROPERTY IMPORTED_LOCATION ${NETLIB_CBLAS_LIBRARY})
-
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
deleted file mode 100644
index 69461ecc2..000000000
--- a/cmake/FindSphinx.cmake
+++ /dev/null
@@ -1,31 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-include(FindPackageHandleStandardArgs)
-
-# Search for the sphinx-build executable, hinting at the SPHINXROOT environment
-# variable and the SPHINX_ROOT CMake variable (with a "bin" suffix).
-find_program(SPHINX_EXECUTABLE NAMES sphinx-build
-    HINTS $ENV{SPHINXROOT} ${SPHINX_ROOT} PATH_SUFFIXES bin
-    DOC "Sphinx Documentation generator")
-
-# Report the result through the standard find_package machinery and keep the
-# cache entry out of the non-advanced GUI view.
-find_package_handle_standard_args(Sphinx DEFAULT_MSG SPHINX_EXECUTABLE)
-mark_as_advanced(SPHINX_EXECUTABLE)
diff --git a/cmake/FindcuBLAS.cmake b/cmake/FindcuBLAS.cmake
deleted file mode 100644
index c26a62f6b..000000000
--- a/cmake/FindcuBLAS.cmake
+++ /dev/null
@@ -1,69 +0,0 @@
-#==========================================================================
-#  Copyright (C) Codeplay Software Limited
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  For your convenience, a copy of the License has been included in this
-#  repository.
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-#=========================================================================
-
-# Find module for the cuBLAS backend: requires CUDA >= 10.0 and Threads, and
-# defines the imported target ONEMKL::cuBLAS::cuBLAS if it does not exist yet.
-find_package(CUDA 10.0 REQUIRED)
-# Directory of the C++ compiler binary; used below to hint at the SYCL-provided headers.
-get_filename_component(SYCL_BINARY_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
-# The OpenCL headers shipped with CUDA are OpenCL 1.1 and are not compatible with DPC++;
-# OpenCL 1.2 or newer headers are required. Searching the SYCL include directories
-# here is used to bypass the NVIDIA OpenCL headers.
-find_path(OPENCL_INCLUDE_DIR CL/cl.h OpenCL/cl.h 
-HINTS 
-${OPENCL_INCLUDE_DIR}
-${SYCL_BINARY_DIR}/../include/sycl/
-${SYCL_BINARY_DIR}/../../include/sycl/
-)
-# Workaround: define CUDA_NO_HALF to avoid the half type being created by both CUDA and SYCL.
-add_compile_definitions(CUDA_NO_HALF)
-
-find_package(Threads REQUIRED)
-
-include(FindPackageHandleStandardArgs)
-
-
-if(NOT TARGET ONEMKL::cuBLAS::cuBLAS)
-  add_library(ONEMKL::cuBLAS::cuBLAS SHARED IMPORTED)
-  if(USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    # This branch additionally requires the CUDA runtime library and does not
-    # require or export OPENCL_INCLUDE_DIR.
-    find_package_handle_standard_args(cuBLAS
-        REQUIRED_VARS
-          CUDA_TOOLKIT_INCLUDE
-          CUDA_cublas_LIBRARY
-          CUDA_LIBRARIES
-          CUDA_CUDART_LIBRARY
-	  CUDA_CUDA_LIBRARY
-    )
-    set_target_properties(ONEMKL::cuBLAS::cuBLAS PROPERTIES
-        IMPORTED_LOCATION ${CUDA_cublas_LIBRARY}
-        INTERFACE_INCLUDE_DIRECTORIES "${CUDA_TOOLKIT_INCLUDE}"
-        INTERFACE_LINK_LIBRARIES "Threads::Threads;${CUDA_LIBRARIES};${CUDA_CUDART_LIBRARY};${CUDA_CUDA_LIBRARY}"
-    )
-  else()
-    # Default branch: OPENCL_INCLUDE_DIR is required and is listed ahead of the
-    # CUDA toolkit include directory on the target.
-    find_package_handle_standard_args(cuBLAS
-        REQUIRED_VARS
-          CUDA_TOOLKIT_INCLUDE
-          CUDA_cublas_LIBRARY
-          CUDA_LIBRARIES
-          CUDA_CUDA_LIBRARY
-          OPENCL_INCLUDE_DIR
-    )
-    set_target_properties(ONEMKL::cuBLAS::cuBLAS PROPERTIES
-        IMPORTED_LOCATION ${CUDA_cublas_LIBRARY}
-        INTERFACE_INCLUDE_DIRECTORIES "${OPENCL_INCLUDE_DIR};${CUDA_TOOLKIT_INCLUDE}"
-        INTERFACE_LINK_LIBRARIES "Threads::Threads;${CUDA_CUDA_LIBRARY};${CUDA_LIBRARIES}"
-    )
-  endif()
-endif()
diff --git a/cmake/FindcuRAND.cmake b/cmake/FindcuRAND.cmake
deleted file mode 100644
index df353077c..000000000
--- a/cmake/FindcuRAND.cmake
+++ /dev/null
@@ -1,114 +0,0 @@
-#--===============================================================================
-# cuRAND back-end Copyright (c) 2021, The Regents of the University of
-# California, through Lawrence Berkeley National Laboratory (subject to receipt
-# of any required approvals from the U.S. Dept. of Energy). All rights
-# reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# (1) Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-#
-# (2) Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-#
-# (3) Neither the name of the University of California, Lawrence Berkeley
-# National Laboratory, U.S. Dept. of Energy nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-# You are under no obligation whatsoever to provide any bug fixes, patches,
-# or upgrades to the features, functionality or performance of the source
-# code ("Enhancements") to anyone; however, if you choose to make your
-# Enhancements available either publicly, or directly to Lawrence Berkeley
-# National Laboratory, without imposing a separate written license agreement
-# for such Enhancements, then you hereby grant the following license: a
-# non-exclusive, royalty-free perpetual license to install, use, modify,
-# prepare derivative works, incorporate into other computer software,
-# distribute, and sublicense such enhancements or derivative works thereof,
-# in binary and source code form.
-#
-# If you have questions about your rights to use or distribute this software,
-# please contact Berkeley Lab's Intellectual Property Office at
-# IPO@lbl.gov.
-#
-# NOTICE.  This Software was developed under funding from the U.S. Department
-# of Energy and the U.S. Government consequently retains certain rights.  As
-# such, the U.S. Government has been granted for itself and others acting on
-# its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
-# Software to reproduce, distribute copies to the public, prepare derivative
-# works, and perform publicly and display publicly, and to permit others to do
-# so.
-#=================================================================================
-
-# Find module for the cuRAND backend: requires CUDA >= 10.0 and Threads, and
-# defines the imported target ONEMKL::cuRAND::cuRAND if it does not exist yet.
-find_package(CUDA 10.0 REQUIRED)
-# Directory of the C++ compiler binary; used below to hint at the SYCL-provided headers.
-get_filename_component(SYCL_BINARY_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
-
-if (NOT (ONEMKL_SYCL_IMPLEMENTATION STREQUAL "hipsycl"))
-# The OpenCL headers shipped with CUDA are OpenCL 1.1 and are not compatible with DPC++;
-# OpenCL 1.2 or newer headers are required. Searching the SYCL include directory
-# here is used to bypass the NVIDIA OpenCL headers.
-find_path(OPENCL_INCLUDE_DIR CL/cl.h OpenCL/cl.h 
-HINTS 
-${OPENCL_INCLUDE_DIR}
-${SYCL_BINARY_DIR}/../include/sycl/
-)
-endif()
-
-# Workaround: define CUDA_NO_HALF to avoid the half type being created by both CUDA and SYCL.
-add_compile_definitions(CUDA_NO_HALF)
-
-find_package(Threads REQUIRED)
-
-include(FindPackageHandleStandardArgs)
-
-if (ONEMKL_SYCL_IMPLEMENTATION STREQUAL "hipsycl")
-# hipSYCL: OPENCL_INCLUDE_DIR is neither searched for above nor required here.
-find_package_handle_standard_args(cuRAND
-    REQUIRED_VARS
-	CUDA_TOOLKIT_INCLUDE
-	CUDA_curand_LIBRARY
-        CUDA_LIBRARIES
-        CUDA_CUDA_LIBRARY
-)
-
-  if(NOT TARGET ONEMKL::cuRAND::cuRAND)
-  add_library(ONEMKL::cuRAND::cuRAND SHARED IMPORTED)
-  set_target_properties(ONEMKL::cuRAND::cuRAND PROPERTIES
-    IMPORTED_LOCATION ${CUDA_curand_LIBRARY}
-    INTERFACE_INCLUDE_DIRECTORIES "${CUDA_TOOLKIT_INCLUDE}"
-    INTERFACE_LINK_LIBRARIES "Threads::Threads;${CUDA_CUDA_LIBRARY};${CUDA_LIBRARIES}"
-  )
-  endif()
-else()
-# Other SYCL implementations: OPENCL_INCLUDE_DIR is required and is listed
-# ahead of the CUDA toolkit include directory on the target.
-find_package_handle_standard_args(cuRAND
-    REQUIRED_VARS
-	CUDA_TOOLKIT_INCLUDE
-	CUDA_curand_LIBRARY
-        CUDA_LIBRARIES
-        CUDA_CUDA_LIBRARY
-        OPENCL_INCLUDE_DIR
-)
-
-  if(NOT TARGET ONEMKL::cuRAND::cuRAND)
-  add_library(ONEMKL::cuRAND::cuRAND SHARED IMPORTED)
-  set_target_properties(ONEMKL::cuRAND::cuRAND PROPERTIES
-    IMPORTED_LOCATION ${CUDA_curand_LIBRARY}
-    INTERFACE_INCLUDE_DIRECTORIES "${OPENCL_INCLUDE_DIR};${CUDA_TOOLKIT_INCLUDE}"
-    INTERFACE_LINK_LIBRARIES "Threads::Threads;${CUDA_CUDA_LIBRARY};${CUDA_LIBRARIES}"
-  )
-  endif()
-endif()
diff --git a/cmake/FindcuSOLVER.cmake b/cmake/FindcuSOLVER.cmake
deleted file mode 100644
index 0f7f59564..000000000
--- a/cmake/FindcuSOLVER.cmake
+++ /dev/null
@@ -1,51 +0,0 @@
-#==========================================================================
-#  Copyright (C) Codeplay Software Limited
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  For your convenience, a copy of the License has been included in this
-#  repository.
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-#=========================================================================
-
-# Find module for the cuSOLVER backend: requires CUDA >= 10.0 and Threads, and
-# defines the imported target ONEMKL::cuSOLVER::cuSOLVER if it does not exist yet.
-find_package(CUDA 10.0 REQUIRED)
-# Directory of the C++ compiler binary; used below to hint at the SYCL-provided headers.
-get_filename_component(SYCL_BINARY_DIR ${CMAKE_CXX_COMPILER} DIRECTORY)
-# The OpenCL headers shipped with CUDA are OpenCL 1.1 and are not compatible with DPC++;
-# OpenCL 1.2 or newer headers are required. Searching the SYCL include directory
-# here is used to bypass the NVIDIA OpenCL headers.
-find_path(OPENCL_INCLUDE_DIR CL/cl.h OpenCL/cl.h 
-HINTS 
-${OPENCL_INCLUDE_DIR}
-${SYCL_BINARY_DIR}/../include/sycl/
-)
-# Workaround: define CUDA_NO_HALF to avoid the half type being created by both CUDA and SYCL.
-add_compile_definitions(CUDA_NO_HALF)
-
-find_package(Threads REQUIRED)
-
-include(FindPackageHandleStandardArgs)
-# The package counts as found only when the CUDA toolkit include directory, the
-# cuSOLVER library, the CUDA libraries and the OpenCL include directory are all set.
-find_package_handle_standard_args(cuSOLVER
-    REQUIRED_VARS
-	CUDA_TOOLKIT_INCLUDE
-	CUDA_cusolver_LIBRARY
-        CUDA_LIBRARIES
-        CUDA_CUDA_LIBRARY
-        OPENCL_INCLUDE_DIR
-)
-if(NOT TARGET ONEMKL::cuSOLVER::cuSOLVER)
-  add_library(ONEMKL::cuSOLVER::cuSOLVER SHARED IMPORTED)
-  # OPENCL_INCLUDE_DIR is listed ahead of the CUDA toolkit include directory.
-  set_target_properties(ONEMKL::cuSOLVER::cuSOLVER PROPERTIES
-      IMPORTED_LOCATION ${CUDA_cusolver_LIBRARY}
-      INTERFACE_INCLUDE_DIRECTORIES "${OPENCL_INCLUDE_DIR};${CUDA_TOOLKIT_INCLUDE}"
-      INTERFACE_LINK_LIBRARIES "Threads::Threads;${CUDA_CUDA_LIBRARY};${CUDA_LIBRARIES}"
-  )
-
-endif()
diff --git a/cmake/WarningsUtils.cmake b/cmake/WarningsUtils.cmake
deleted file mode 100644
index 3b5f76afb..000000000
--- a/cmake/WarningsUtils.cmake
+++ /dev/null
@@ -1,48 +0,0 @@
-#===============================================================================
-# Copyright Codeplay Software Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-include_guard(GLOBAL)
-
-add_library(onemkl_warnings INTERFACE)
-
-set(ONEMKL_WARNINGS "")
-
-include(CheckCXXCompilerFlag)
-macro(add_warning flag)
-  check_cxx_compiler_flag(${flag} IS_SUPPORTED)
-  if(${IS_SUPPORTED})
-    list(APPEND ONEMKL_WARNINGS ${flag})
-  else()
-    message(WARNING "Compiler does not support ${flag}")
-  endif()
-endmacro()
-
-add_warning("-Wall")
-add_warning("-Wextra")
-add_warning("-Wshadow")
-add_warning("-Wconversion")
-add_warning("-Wpedantic")
-
-message(VERBOSE "Domains with warnings enabled use: ${ONEMKL_WARNINGS}")
-
-# The onemkl_warnings target can be linked to any other target to enable warnings.
-target_compile_options(onemkl_warnings INTERFACE ${ONEMKL_WARNINGS})
-
-# Add the library to install package
-install(TARGETS onemkl_warnings EXPORT oneMKLTargets)
diff --git a/cmake/mkl/MKLConfig.cmake b/cmake/mkl/MKLConfig.cmake
deleted file mode 100644
index 7614288b3..000000000
--- a/cmake/mkl/MKLConfig.cmake
+++ /dev/null
@@ -1,1158 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-#===================================================================
-# CMake Config file for Intel(R) oneAPI Math Kernel Library (oneMKL)
-#===============================================================================
-
-#===============================================================================
-# Input parameters
-#=================
-#-------------
-# Main options
-#-------------
-# MKL_ROOT: oneMKL root directory (May be required for non-standard install locations. Optional otherwise.)
-#    Default: use location from MKLROOT environment variable or <Full path to this file>/../../../ if MKLROOT is not defined
-# MKL_ARCH
-#    Values:  ia32 intel64
-#    Default: intel64
-# MKL_LINK
-#    Values:  static, dynamic, sdl
-#    Default: dynamic
-#       Exceptions:- DPC++ doesn't support sdl
-# MKL_THREADING
-#    Values:  sequential,
-#             intel_thread (Intel OpenMP),
-#             gnu_thread (GNU OpenMP),
-#             pgi_thread (PGI OpenMP) [PGI support is deprecated],
-#             tbb_thread
-#    Default: intel_thread
-#       Exceptions:- DPC++ defaults to oneTBB, PGI compiler on Windows defaults to pgi_thread
-# MKL_INTERFACE (for MKL_ARCH=intel64 only)
-#    Values:  lp64, ilp64
-#       GNU or INTEL interface will be selected based on Compiler.
-#    Default: ilp64
-# MKL_MPI
-#    Values:  intelmpi, mpich, openmpi, msmpi, mshpc
-#    Default: intelmpi
-#-----------------------------------
-# Special options (OFF by default)
-#-----------------------------------
-# ENABLE_BLAS95:      Enables BLAS Fortran95 API
-# ENABLE_LAPACK95:    Enables LAPACK Fortran95 API
-# ENABLE_BLACS:       Enables cluster BLAS library
-# ENABLE_CDFT:        Enables cluster DFT library
-# ENABLE_CPARDISO:    Enables cluster PARDISO functionality
-# ENABLE_SCALAPACK:   Enables cluster LAPACK library
-# ENABLE_OMP_OFFLOAD: Enables OpenMP Offload functionality
-#
-#==================
-# Output parameters
-#==================
-# MKL_ROOT
-#     oneMKL root directory.
-# MKL_INCLUDE
-#     Use of target_include_directories() is recommended.
-#     INTERFACE_INCLUDE_DIRECTORIES property is set on mkl_core and mkl_rt libraries.
-#     Alternatively, this variable can be used directly (not recommended as per Modern CMake)
-# MKL_ENV
-#     Provides all environment variables based on input parameters.
-#     Currently useful for mkl_rt linking and BLACS on Windows.
-#     Must be set as an ENVIRONMENT property.
-# Example:
-#     add_test(NAME mytest COMMAND myexe)
-#     if(MKL_ENV)
-#       set_tests_properties(mytest PROPERTIES ENVIRONMENT "${MKL_ENV}")
-#     endif()
-#
-# MKL::<library name>
-#     IMPORTED targets to link oneMKL libraries individually or when using a custom link-line.
-#     mkl_core and mkl_rt have INTERFACE_* properties set to them.
-#     Please refer to Intel(R) oneMKL Link Line Advisor for help with linking.
-#
-# Below INTERFACE targets provide full link-lines for direct use.
-# Example:
-#     target_link_options(<my_linkable_target> PUBLIC $<LINK_ONLY:MKL::MKL>)
-#
-# MKL::MKL
-#     Link line for C and Fortran API
-# MKL::MKL_SYCL
-#     Link line for DPC++ API
-#
-# Note: For Device API, library linking is not required.
-#       Compile options can be added from the INTERFACE_COMPILE_OPTIONS property on MKL::MKL_SYCL
-#       Include directories can be added from the INTERFACE_INCLUDE_DIRECTORIES property on MKL::MKL_SYCL
-#
-# Note: Output parameters' and targets' availability can change
-# based on Input parameters and application project languages.
-#===============================================================================
-
-include_guard()
-
-if(NOT MKL_LIBRARIES)
-
-function(mkl_message MSG_MODE MSG_TEXT)
-  if(MSG_MODE STREQUAL "FATAL_ERROR")
-    message(${MSG_MODE} ${MSG_TEXT})
-  else()
-    if(NOT MKL_FIND_QUIETLY)
-      message(${MSG_MODE} ${MSG_TEXT})
-    endif()
-  endif()
-endfunction()
-
-if(CMAKE_VERSION VERSION_LESS "3.13")
-  mkl_message(FATAL_ERROR "The minimum supported CMake version is 3.13. You are running version ${CMAKE_VERSION}.")
-endif()
-
-# Set CMake policies for well-defined behavior across CMake versions
-cmake_policy(SET CMP0011 NEW)
-cmake_policy(SET CMP0057 NEW)
-
-# Project Languages
-get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES)
-list(APPEND MKL_LANGS C CXX Fortran)
-foreach(lang ${languages})
-  if(${lang} IN_LIST MKL_LANGS)
-    list(APPEND CURR_LANGS ${lang})
-  endif()
-endforeach()
-list(REMOVE_DUPLICATES CURR_LANGS)
-
-option(ENABLE_BLAS95      "Enables BLAS Fortran95 API"            OFF)
-option(ENABLE_LAPACK95    "Enables LAPACK Fortran95 API"          OFF)
-option(ENABLE_BLACS       "Enables cluster BLAS library"          OFF)
-option(ENABLE_CDFT        "Enables cluster DFT library"           OFF)
-option(ENABLE_CPARDISO    "Enables cluster PARDISO functionality" OFF)
-option(ENABLE_SCALAPACK   "Enables cluster LAPACK library"        OFF)
-option(ENABLE_OMP_OFFLOAD "Enables OpenMP Offload functionality"  OFF)
-
-# Use MPI if any of these are enabled
-if(ENABLE_BLACS OR ENABLE_CDFT OR ENABLE_SCALAPACK OR ENABLE_CPARDISO)
-  set(USE_MPI ON)
-endif()
-
-# Check Parameters
-function(define_param TARGET_PARAM DEFAULT_PARAM SUPPORTED_LIST)
-  if(NOT DEFINED ${TARGET_PARAM} AND NOT DEFINED ${DEFAULT_PARAM})
-    mkl_message(STATUS "${TARGET_PARAM}: Undefined")
-  elseif(NOT DEFINED ${TARGET_PARAM} AND DEFINED ${DEFAULT_PARAM})
-    set(${TARGET_PARAM} "${${DEFAULT_PARAM}}" CACHE STRING "Choose ${TARGET_PARAM} options are: ${${SUPPORTED_LIST}}")
-    foreach(opt ${${DEFAULT_PARAM}})
-      set(STR_LIST "${STR_LIST} ${opt}")
-    endforeach()
-    mkl_message(STATUS "${TARGET_PARAM}: None, set to `${STR_LIST}` by default")
-  elseif(${SUPPORTED_LIST})
-    set(ITEM_FOUND 1)
-    foreach(opt ${${TARGET_PARAM}})
-      if(NOT ${opt} IN_LIST ${SUPPORTED_LIST})
-        set(ITEM_FOUND 0)
-      endif()
-    endforeach()
-    if(ITEM_FOUND EQUAL 0)
-      foreach(opt ${${SUPPORTED_LIST}})
-        set(STR_LIST "${STR_LIST} ${opt}")
-      endforeach()
-      if(${ARGC} EQUAL 3)
-        mkl_message(FATAL_ERROR "Invalid ${TARGET_PARAM} `${${TARGET_PARAM}}`, options are: ${STR_LIST}")
-      elseif(${ARGC} EQUAL 4)
-        mkl_message(${ARGV3} "Invalid ${TARGET_PARAM} `${${TARGET_PARAM}}`, options are: ${STR_LIST}")
-        set(${TARGET_PARAM} "" PARENT_SCOPE)
-      endif()
-    else()
-      mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}")
-    endif()
-  else()
-    mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}")
-  endif()
-endfunction()
-
-macro(check_required_vars)
-  foreach(var IN ITEMS ${ARGV})
-    if(NOT ${var})
-      set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "The required variable ${var} has an invalid value \"${${var}}\".")
-      set(${CMAKE_FIND_PACKAGE_NAME}_FOUND FALSE)
-      return()
-    endif()
-  endforeach()
-endmacro()
-
-#================
-# Compiler checks
-#================
-
-if(CMAKE_C_COMPILER)
-  get_filename_component(C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME)
-endif()
-if(CMAKE_CXX_COMPILER)
-  get_filename_component(CXX_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME)
-endif()
-if(CMAKE_Fortran_COMPILER)
-  get_filename_component(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME)
-endif()
-
-# Determine Compiler Family
-if(CXX_COMPILER_NAME STREQUAL "dpcpp" OR CXX_COMPILER_NAME STREQUAL "dpcpp.exe"
-    OR CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe")
-  set(SYCL_COMPILER ON)
-endif()
-if(C_COMPILER_NAME MATCHES "^clang" OR CXX_COMPILER_NAME MATCHES "^clang")
-  set(CLANG_COMPILER ON)
-endif()
-if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_CXX_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI"
-    OR CMAKE_C_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC"
-    OR CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") # PGI 22.9
-  mkl_message(WARNING "PGI support is deprecated and will be removed in the oneMKL 2025.0 release.")
-  set(PGI_COMPILER ON)
-elseif(CMAKE_C_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_Fortran_COMPILER_ID STREQUAL "Intel"
-        OR CMAKE_C_COMPILER_ID STREQUAL "IntelLLVM" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM" OR CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
-  set(INTEL_COMPILER ON)
-else()
-  if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    set(GNU_C_COMPILER ON)
-  endif()
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
-    set(GNU_Fortran_COMPILER ON)
-  endif()
-endif()
-# CMake identifies IntelLLVM compilers only after 3.20
-if(NOT INTEL_COMPILER)
-  if(C_COMPILER_NAME STREQUAL "icx" OR C_COMPILER_NAME STREQUAL "icx.exe"
-      OR CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe"
-      OR Fortran_COMPILER_NAME STREQUAL "ifx" OR Fortran_COMPILER_NAME STREQUAL "ifx.exe")
-    set(INTEL_COMPILER ON)
-  endif()
-endif()
-# CMake supports IntelLLVM compilers only after 3.25.2
-if(CMAKE_VERSION VERSION_LESS "3.25.2")
-  if(C_COMPILER_NAME STREQUAL "icx" OR C_COMPILER_NAME STREQUAL "icx.exe" OR CXX_COMPILER_NAME STREQUAL "icx.exe")
-    list(APPEND INTEL_LLVM_COMPILERS_IN_USE "icx")
-  endif()
-  if(CXX_COMPILER_NAME STREQUAL "icpx")
-    list(APPEND INTEL_LLVM_COMPILERS_IN_USE "icpx")
-  endif()
-  if(Fortran_COMPILER_NAME STREQUAL "ifx" OR Fortran_COMPILER_NAME STREQUAL "ifx.exe")
-    list(APPEND INTEL_LLVM_COMPILERS_IN_USE "ifx")
-  endif()
-  if(INTEL_LLVM_COMPILERS_IN_USE)
-    list(JOIN INTEL_LLVM_COMPILERS_IN_USE ", " INTEL_LLVM_COMPILERS_IN_USE_COMMA)
-    mkl_message(STATUS "Upgrade to CMake version 3.25.2 or later for native support of Intel compiler(s) ${INTEL_LLVM_COMPILERS_IN_USE_COMMA}. You are running version ${CMAKE_VERSION}.")
-  endif()
-endif()
-
-if(USE_MPI AND (C_COMPILER_NAME MATCHES "^mpi" OR Fortran_COMPILER_NAME MATCHES "^mpi"))
-  set(USE_MPI_SCRIPT ON)
-endif()
-
-#================
-
-#================
-# System-specific
-#================
-
-# Extensions
-if(UNIX)
-  set(LIB_PREFIX "lib")
-  set(LIB_EXT ".a")
-  set(DLL_EXT ".so")
-  if(APPLE)
-    set(DLL_EXT ".dylib")
-  endif()
-  set(LINK_PREFIX "-l")
-  set(LINK_SUFFIX "")
-else()
-  set(LIB_PREFIX "")
-  set(LIB_EXT ".lib")
-  set(DLL_EXT "_dll.lib")
-  set(LINK_PREFIX "")
-  set(LINK_SUFFIX ".lib")
-endif()
-
-#================
-
-#=============
-# Setup oneMKL
-#=============
-
-# Set MKL_ROOT directory
-if(NOT DEFINED MKL_ROOT)
-  if(DEFINED ENV{MKLROOT})
-    set(MKL_ROOT $ENV{MKLROOT})
-    # Verify that the version in MKL_ROOT is the same as MKL_VERSION
-    find_file(MKL_VERSION_H mkl_version.h
-      HINTS ${MKL_ROOT}
-      PATH_SUFFIXES include
-      NO_DEFAULT_PATH)
-    check_required_vars(MKL_VERSION_H)
-    file(READ ${MKL_VERSION_H} MKL_VERSION_H_CONTENT)
-    string(REGEX MATCH "__INTEL_MKL__ +([0-9]+)" MKL_VERSION_INFO ${MKL_VERSION_H_CONTENT})
-    set(MKL_ROOT_MAJOR_VERSION ${CMAKE_MATCH_1})
-    string(REGEX MATCH "__INTEL_MKL_UPDATE__ +([0-9]+)" MKL_VERSION_INFO ${MKL_VERSION_H_CONTENT})
-    set(MKL_ROOT_UPDATE_VERSION ${CMAKE_MATCH_1})
-    set(MKL_ROOT_VERSION ${MKL_ROOT_MAJOR_VERSION}.${MKL_ROOT_UPDATE_VERSION})
-    if(NOT MKL_ROOT_VERSION VERSION_EQUAL ${CMAKE_FIND_PACKAGE_NAME}_VERSION)
-      mkl_message(FATAL_ERROR "oneMKL ${MKL_ROOT_VERSION} specified by the environment variable MKLROOT \
-                               mismatches the found version ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} \
-                               indicated by ${CMAKE_CURRENT_LIST_DIR}/MKLConfigVersion.cmake")
-    endif()
-  else()
-    get_filename_component(MKL_CMAKE_PATH "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
-    get_filename_component(MKL_ROOT "${MKL_CMAKE_PATH}/../../../" ABSOLUTE)
-  endif()
-endif()
-string(REPLACE "\\" "/" MKL_ROOT ${MKL_ROOT})
-check_required_vars(MKL_ROOT)
-mkl_message(STATUS "${CMAKE_FIND_PACKAGE_NAME}_VERSION: ${${CMAKE_FIND_PACKAGE_NAME}_VERSION}")
-mkl_message(STATUS "MKL_ROOT: ${MKL_ROOT}")
-
-# Set target system architecture
-if(SYCL_COMPILER)
-  set(DEFAULT_MKL_SYCL_ARCH intel64)
-  set(MKL_SYCL_ARCH_LIST intel64)
-  if(NOT DEFINED MKL_SYCL_ARCH)
-    set(MKL_SYCL_ARCH ${MKL_ARCH})
-  endif()
-  define_param(MKL_SYCL_ARCH DEFAULT_MKL_SYCL_ARCH MKL_SYCL_ARCH_LIST STATUS)
-  if(NOT MKL_SYCL_ARCH)
-    set(SYCL_COMPILER OFF)
-    mkl_message(STATUS "MKL::MKL_SYCL target will not be available.")
-  endif()
-endif()
-set(DEFAULT_MKL_ARCH intel64)
-if(PGI_COMPILER OR ENABLE_OMP_OFFLOAD OR USE_MPI)
-  set(MKL_ARCH_LIST intel64)
-else()
-  set(MKL_ARCH_LIST ia32 intel64)
-endif()
-define_param(MKL_ARCH DEFAULT_MKL_ARCH MKL_ARCH_LIST)
-check_required_vars(MKL_ARCH)
-if(MKL_ARCH STREQUAL "ia32")
-  set(MKL_ARCH_DIR "32")
-else()
-  set(MKL_ARCH_DIR "")
-endif()
-
-# Define MKL_LINK
-if(SYCL_COMPILER)
-  set(DEFAULT_MKL_SYCL_LINK dynamic)
-  set(MKL_SYCL_LINK_LIST static dynamic)
-  if(NOT DEFINED MKL_SYCL_LINK)
-    set(MKL_SYCL_LINK ${MKL_LINK})
-  endif()
-  define_param(MKL_SYCL_LINK DEFAULT_MKL_SYCL_LINK MKL_SYCL_LINK_LIST STATUS)
-  if(NOT MKL_SYCL_LINK)
-    set(SYCL_COMPILER OFF)
-    mkl_message(STATUS "MKL::MKL_SYCL target will not be available.")
-  endif()
-endif()
-set(DEFAULT_MKL_LINK dynamic)
-if(USE_MPI)
-  set(MKL_LINK_LIST static dynamic)
-else()
-  set(MKL_LINK_LIST static dynamic sdl)
-endif()
-define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST)
-check_required_vars(MKL_LINK)
-
-# Define MKL_INTERFACE
-if(SYCL_COMPILER)
-  if(MKL_INTERFACE AND NOT DEFINED MKL_SYCL_INTERFACE_FULL)
-    set(MKL_SYCL_INTERFACE_FULL intel_${MKL_INTERFACE})
-  endif()
-  set(DEFAULT_MKL_SYCL_INTERFACE intel_ilp64)
-  set(MKL_SYCL_INTERFACE_LIST intel_ilp64)
-  define_param(MKL_SYCL_INTERFACE_FULL DEFAULT_MKL_SYCL_INTERFACE MKL_SYCL_INTERFACE_LIST STATUS)
-  if(NOT MKL_SYCL_INTERFACE_FULL)
-    set(SYCL_COMPILER OFF)
-    mkl_message(STATUS "MKL::MKL_SYCL target will not be available.")
-  endif()
-endif()
-if(MKL_ARCH STREQUAL "intel64")
-  set(IFACE_TYPE intel)
-  if(GNU_Fortran_COMPILER)
-    set(IFACE_TYPE gf)
-  endif()
-  if(MKL_INTERFACE)
-    set(MKL_INTERFACE_FULL ${IFACE_TYPE}_${MKL_INTERFACE})
-  endif()
-  set(DEFAULT_MKL_INTERFACE ${IFACE_TYPE}_ilp64)
-  set(MKL_INTERFACE_LIST ${IFACE_TYPE}_ilp64 ${IFACE_TYPE}_lp64)
-  define_param(MKL_INTERFACE_FULL DEFAULT_MKL_INTERFACE MKL_INTERFACE_LIST)
-else()
-  if(WIN32)
-    set(MKL_INTERFACE_FULL intel_c)
-  elseif(NOT APPLE)
-    if(GNU_Fortran_COMPILER)
-      set(MKL_INTERFACE_FULL gf)
-    else()
-      set(MKL_INTERFACE_FULL intel)
-    endif()
-  else()
-    mkl_message(FATAL_ERROR "OSX does not support MKL_ARCH ia32.")
-  endif()
-endif()
-if(MKL_INTERFACE_FULL MATCHES "ilp64")
-  set(MKL_INTERFACE "ilp64")
-else()
-  set(MKL_INTERFACE "lp64")
-endif()
-check_required_vars(MKL_INTERFACE_FULL)
-
-# Define oneMKL headers
-find_path(MKL_INCLUDE mkl.h
-  HINTS ${MKL_ROOT}
-  PATH_SUFFIXES include
-  NO_DEFAULT_PATH)
-check_required_vars(MKL_INCLUDE)
-
-# Add pre-built F95 Interface Modules
-if(INTEL_COMPILER AND (ENABLE_BLAS95 OR ENABLE_LAPACK95))
-  if(MKL_ARCH STREQUAL "intel64")
-    list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/mkl/${MKL_ARCH}/${MKL_INTERFACE}")
-  else()
-    list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/mkl/${MKL_ARCH}")
-  endif()
-endif()
-
-# Define MKL_THREADING
-# All APIs support sequential threading
-# SYCL API supports oneTBB and OpenMP threadings, but OpenMP threading might have composability problem on CPU device with other SYCL kernels
-if(SYCL_COMPILER)
-  set(MKL_SYCL_THREADING_LIST "sequential" "intel_thread" "tbb_thread")
-  set(DEFAULT_MKL_SYCL_THREADING tbb_thread)
-  if(NOT DEFINED MKL_SYCL_THREADING)
-    set(MKL_SYCL_THREADING ${MKL_THREADING})
-  endif()
-  define_param(MKL_SYCL_THREADING DEFAULT_MKL_SYCL_THREADING MKL_SYCL_THREADING_LIST STATUS)
-  if(NOT MKL_SYCL_THREADING)
-    set(SYCL_COMPILER OFF)
-    mkl_message(STATUS "MKL::MKL_SYCL target will not be available.")
-  endif()
-  if(MKL_SYCL_THREADING STREQUAL "intel_thread")
-    mkl_message(STATUS "Using MKL::MKL_SYCL* targets with intel_thread may have potential composability problems on CPU device with other SYCL kernels.")
-    add_custom_target(MKL_SYCL_MESSAGE
-                      COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --red
-                      "Warning: Using MKL::MKL_SYCL* targets with intel_thread may have potential composability problems on CPU device with other SYCL kernels.")
-  endif()
-endif()
-# C, Fortran API
-set(MKL_THREADING_LIST "sequential" "intel_thread" "tbb_thread")
-set(DEFAULT_MKL_THREADING intel_thread)
-if(PGI_COMPILER)
-  # PGI compiler supports PGI OpenMP threading, additionally
-  list(APPEND MKL_THREADING_LIST pgi_thread)
-  # PGI compiler does not support oneTBB threading
-  list(REMOVE_ITEM MKL_THREADING_LIST tbb_thread)
-  if(WIN32)
-    # PGI 19.10 and 20.1 on Windows, do not support Intel OpenMP threading
-    list(REMOVE_ITEM MKL_THREADING_LIST intel_thread)
-    set(DEFAULT_MKL_THREADING pgi_thread)
-  endif()
-elseif(GNU_C_COMPILER OR GNU_Fortran_COMPILER OR CLANG_COMPILER)
-  list(APPEND MKL_THREADING_LIST gnu_thread)
-else()
-  # Intel and Microsoft compilers
-  # Nothing to do, only for completeness
-endif()
-define_param(MKL_THREADING DEFAULT_MKL_THREADING MKL_THREADING_LIST)
-check_required_vars(MKL_THREADING)
-
-# Define MKL_MPI
-if(USE_MPI)
-  set(DEFAULT_MKL_MPI intelmpi)
-  if(UNIX)
-    if(APPLE)
-      # Override defaults for OSX
-      set(DEFAULT_MKL_MPI mpich)
-      set(MKL_MPI_LIST mpich)
-    else()
-      set(MKL_MPI_LIST intelmpi openmpi mpich mpich2)
-    endif()
-  else()
-    # Windows
-    set(MKL_MPI_LIST intelmpi mshpc msmpi)
-  endif()
-  define_param(MKL_MPI DEFAULT_MKL_MPI MKL_MPI_LIST)
-  # MSMPI is now called MSHPC. MSMPI option exists for backward compatibility.
-  if(MKL_MPI STREQUAL "mshpc")
-    set(MKL_MPI msmpi)
-  endif()
-  check_required_vars(MKL_MPI)
-endif()
-
-# Provides a list of IMPORTED targets for the project
-if(NOT DEFINED MKL_IMPORTED_TARGETS)
-  set(MKL_IMPORTED_TARGETS "")
-endif()
-
-# Clear temporary variables
-set(MKL_C_COPT "")
-set(MKL_F_COPT "")
-set(MKL_SDL_COPT "")
-set(MKL_CXX_COPT "")
-set(MKL_SYCL_COPT "")
-set(MKL_SYCL_LOPT "")
-set(MKL_OFFLOAD_COPT "")
-set(MKL_OFFLOAD_LOPT "")
-
-set(MKL_SUPP_LINK "")        # Other link options. Usually at the end of the link-line.
-set(MKL_SYCL_SUPP_LINK "")
-set(MKL_LINK_LINE "")
-set(MKL_SYCL_LINK_LINE "")
-set(MKL_ENV_PATH "")         # Temporary variable to work with PATH
-set(MKL_ENV "")              # Exported environment variables
-
-# Modify PATH variable to make it CMake-friendly
-set(OLD_PATH $ENV{PATH})
-string(REPLACE ";" "\;" OLD_PATH "${OLD_PATH}")
-
-# Compiler options
-if(GNU_C_COMPILER OR GNU_Fortran_COMPILER)
-  if(MKL_ARCH STREQUAL "ia32")
-    list(APPEND MKL_C_COPT   -m32)
-    list(APPEND MKL_CXX_COPT -m32)
-    list(APPEND MKL_F_COPT   -m32)
-  else()
-    list(APPEND MKL_C_COPT   -m64)
-    list(APPEND MKL_CXX_COPT -m64)
-    list(APPEND MKL_F_COPT   -m64)
-  endif()
-endif()
-
-# Additonal compiler & linker options
-if(SYCL_COMPILER)
-  list(APPEND MKL_SYCL_COPT "-fsycl")
-  list(APPEND MKL_SYCL_LOPT "-fsycl")
-  if(MKL_SYCL_LINK STREQUAL "static")
-    list(APPEND MKL_SYCL_LOPT "-fsycl-device-code-split=per_kernel")
-  endif()
-endif()
-if(ENABLE_OMP_OFFLOAD)
-  if(MKL_LINK STREQUAL "static")
-    list(APPEND MKL_OFFLOAD_LOPT "-fsycl-device-code-split=per_kernel")
-  endif()
-endif()
-
-# For OpenMP Offload
-if(ENABLE_OMP_OFFLOAD)
-  if(WIN32)
-    if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1")
-      if("Fortran" IN_LIST CURR_LANGS)
-        list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -DONEMKL_USE_OPENMP_VERSION=202011)
-      else()
-        list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -Qopenmp-version:51 -DONEMKL_USE_OPENMP_VERSION=202011)
-      endif()
-    else()
-      list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64)
-    endif()
-    # -MD and -MDd are manually added here because offload functionality uses DPC++ runtime.
-    if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
-      list(APPEND MKL_OFFLOAD_COPT -MDd)
-    else()
-      list(APPEND MKL_OFFLOAD_COPT -MD)
-    endif()
-    list(APPEND MKL_OFFLOAD_LOPT -Qiopenmp -Qopenmp-targets:spir64 -fsycl)
-    set(SKIP_LIBPATH ON)
-  else()
-    if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1")
-      if("Fortran" IN_LIST CURR_LANGS)
-        list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -DONEMKL_USE_OPENMP_VERSION=202011)
-      else()
-        list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -fopenmp-version=51 -DONEMKL_USE_OPENMP_VERSION=202011)
-      endif()
-    else ()
-      list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64)
-    endif()
-    list(APPEND MKL_OFFLOAD_LOPT -fiopenmp -fopenmp-targets=spir64 -fsycl)
-    if(APPLE)
-      list(APPEND MKL_SUPP_LINK -lc++)
-    else()
-      list(APPEND MKL_SUPP_LINK -lstdc++)
-    endif()
-  endif()
-endif()
-
-# For selected Interface
-if(SYCL_COMPILER)
-  list(INSERT MKL_SYCL_COPT 0 "-DMKL_ILP64")
-endif()
-
-if(MKL_INTERFACE_FULL)
-  if(MKL_ARCH STREQUAL "ia32")
-    if(GNU_Fortran_COMPILER)
-      set(MKL_SDL_IFACE_ENV "GNU")
-    endif()
-  else()
-    if(GNU_Fortran_COMPILER)
-      set(MKL_SDL_IFACE_ENV "GNU,${MKL_INTERFACE}")
-    else()
-      set(MKL_SDL_IFACE_ENV "${MKL_INTERFACE}")
-    endif()
-    if(MKL_INTERFACE STREQUAL "ilp64")
-      if("Fortran" IN_LIST CURR_LANGS)
-        if(INTEL_COMPILER)
-          if(WIN32)
-            list(APPEND MKL_F_COPT "-4I8")
-          else()
-            list(APPEND MKL_F_COPT "-i8")
-          endif()
-        elseif(GNU_Fortran_COMPILER)
-          list(APPEND MKL_F_COPT "-fdefault-integer-8")
-        elseif(PGI_COMPILER)
-          list(APPEND MKL_F_COPT "-i8")
-        endif()
-      endif()
-      list(INSERT MKL_C_COPT 0 "-DMKL_ILP64")
-      list(INSERT MKL_SDL_COPT 0 "-DMKL_ILP64")
-      list(INSERT MKL_CXX_COPT 0 "-DMKL_ILP64")
-      list(INSERT MKL_OFFLOAD_COPT 0 "-DMKL_ILP64")
-    else()
-      # lp64
-    endif()
-  endif()
-  if(MKL_SDL_IFACE_ENV)
-    string(TOUPPER ${MKL_SDL_IFACE_ENV} MKL_SDL_IFACE_ENV)
-  endif()
-endif() # MKL_INTERFACE_FULL
-
-# All oneMKL Libraries
-if(SYCL_COMPILER)
-  set(MKL_SYCL_IFACE_LIB mkl_${MKL_SYCL_INTERFACE_FULL})
-  if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo" AND MKL_SYCL_THREADING STREQUAL "tbb_thread")
-    set(MKL_SYCL_THREAD mkl_tbb_threadd)
-  else()
-    set(MKL_SYCL_THREAD mkl_${MKL_SYCL_THREADING})
-  endif()
-endif()
-set(MKL_SYCL)
-set(MKL_SYCL_LIBS)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_blas)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_lapack)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_dft)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_sparse)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_data_fitting)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_rng)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_stats)
-list(APPEND MKL_SYCL_LIBS mkl_sycl_vm)
-if(NOT MKL_LINK STREQUAL "static")
-  if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
-    list(TRANSFORM MKL_SYCL_LIBS APPEND "d")
-  endif()
-  list(APPEND MKL_SYCL ${MKL_SYCL_LIBS})
-  # List for tracking incomplete onemKL package
-  set(MISSED_MKL_SYCL_LIBS)
-else()
-  if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
-    set(MKL_SYCL         mkl_sycld)
-  else()
-    set(MKL_SYCL         mkl_sycl)
-  endif()
-endif()
-
-set(MKL_IFACE_LIB     mkl_${MKL_INTERFACE_FULL})
-set(MKL_CORE          mkl_core)
-if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo" AND MKL_THREADING STREQUAL "tbb_thread")
-  set(MKL_THREAD        mkl_tbb_threadd)
-else()
-  set(MKL_THREAD        mkl_${MKL_THREADING})
-endif()
-set(MKL_SDL           mkl_rt)
-if(MKL_ARCH STREQUAL "ia32")
-  set(MKL_BLAS95      mkl_blas95)
-  set(MKL_LAPACK95    mkl_lapack95)
-else()
-  set(MKL_BLAS95      mkl_blas95_${MKL_INTERFACE})
-  set(MKL_LAPACK95    mkl_lapack95_${MKL_INTERFACE})
-endif()
-# BLACS
-set(MKL_BLACS mkl_blacs_${MKL_MPI}_${MKL_INTERFACE})
-if(UNIX AND NOT APPLE AND MKL_MPI MATCHES "mpich")
-  # MPICH is compatible with INTELMPI Wrappers on Linux
-  set(MKL_BLACS mkl_blacs_intelmpi_${MKL_INTERFACE})
-endif()
-if(WIN32)
-  if(MKL_MPI STREQUAL "msmpi")
-    if("Fortran" IN_LIST CURR_LANGS)
-      list(APPEND MKL_SUPP_LINK "msmpifec.lib")
-    endif()
-    # MSMPI and MSHPC are supported with the same BLACS library
-    set(MKL_BLACS mkl_blacs_msmpi_${MKL_INTERFACE})
-    if(NOT MKL_LINK STREQUAL "static")
-      set(MKL_BLACS mkl_blacs_${MKL_INTERFACE})
-      set(MKL_BLACS_ENV MSMPI)
-    endif()
-  elseif(MKL_MPI STREQUAL "intelmpi" AND NOT MKL_LINK STREQUAL "static")
-    set(MKL_BLACS mkl_blacs_${MKL_INTERFACE})
-    set(MKL_BLACS_ENV INTELMPI)
-  endif()
-endif()
-# CDFT & SCALAPACK
-set(MKL_CDFT      mkl_cdft_core)
-set(MKL_SCALAPACK mkl_scalapack_${MKL_INTERFACE})
-
-
-if(UNIX AND NOT APPLE)
-  if(MKL_LINK STREQUAL "static" OR MKL_SYCL_LINK STREQUAL "static")
-    set(START_GROUP "-Wl,--start-group")
-    set(END_GROUP "-Wl,--end-group")
-    if(SYCL_COMPILER)
-      set(SYCL_EXPORT_DYNAMIC "-Wl,-export-dynamic")
-    endif()
-    if(ENABLE_OMP_OFFLOAD)
-      set(EXPORT_DYNAMIC "-Wl,-export-dynamic")
-    endif()
-  endif()
-  if(MKL_LINK STREQUAL "dynamic")
-    set(MKL_RPATH "-Wl,-rpath=$<TARGET_FILE_DIR:MKL::${MKL_CORE}>")
-    if((GNU_Fortran_COMPILER OR PGI_COMPILER) AND "Fortran" IN_LIST CURR_LANGS)
-      set(NO_AS_NEEDED -Wl,--no-as-needed)
-    endif()
-  endif()
-  if(MKL_SYCL_LINK STREQUAL "dynamic")
-    set(MKL_SYCL_RPATH "-Wl,-rpath=$<TARGET_FILE_DIR:MKL::${MKL_CORE}>")
-  endif()
-  if(MKL_LINK STREQUAL "sdl")
-    set(MKL_RPATH "-Wl,-rpath=$<TARGET_FILE_DIR:MKL::${MKL_SDL}>")
-  endif()
-endif()
-
-# Create a list of requested libraries, based on input options (MKL_LIBRARIES)
-# Create full link-line in MKL_LINK_LINE
-if(SYCL_COMPILER)
-  list(APPEND MKL_SYCL_LIBRARIES ${MKL_SYCL} ${MKL_SYCL_IFACE_LIB} ${MKL_SYCL_THREAD} ${MKL_CORE})
-  list(TRANSFORM MKL_SYCL PREPEND MKL:: OUTPUT_VARIABLE MKL_SYCL_T)
-  list(APPEND MKL_SYCL_LINK_LINE ${MKL_SYCL_LOPT} ${SYCL_EXPORT_DYNAMIC} ${NO_AS_NEEDED} ${MKL_SYCL_RPATH}
-       ${MKL_SYCL_T} ${START_GROUP} MKL::${MKL_SYCL_IFACE_LIB} MKL::${MKL_SYCL_THREAD} MKL::${MKL_CORE} ${END_GROUP})
-endif()
-list(APPEND MKL_LINK_LINE $<IF:$<BOOL:${ENABLE_OMP_OFFLOAD}>,${MKL_OFFLOAD_LOPT},>
-     ${EXPORT_DYNAMIC} ${NO_AS_NEEDED} ${MKL_RPATH})
-if(ENABLE_BLAS95)
-  list(APPEND MKL_LIBRARIES ${MKL_BLAS95})
-  list(APPEND MKL_LINK_LINE MKL::${MKL_BLAS95})
-endif()
-if(ENABLE_LAPACK95)
-  list(APPEND MKL_LIBRARIES ${MKL_LAPACK95})
-  list(APPEND MKL_LINK_LINE MKL::${MKL_LAPACK95})
-endif()
-if(ENABLE_SCALAPACK)
-  list(APPEND MKL_LIBRARIES ${MKL_SCALAPACK})
-  list(APPEND MKL_LINK_LINE MKL::${MKL_SCALAPACK})
-endif()
-if(ENABLE_OMP_OFFLOAD AND NOT MKL_LINK STREQUAL "sdl")
-  list(APPEND MKL_LIBRARIES ${MKL_SYCL})
-  list(TRANSFORM MKL_SYCL PREPEND MKL:: OUTPUT_VARIABLE MKL_SYCL_T)
-  list(APPEND MKL_LINK_LINE ${MKL_SYCL_T})
-endif()
-list(APPEND MKL_LINK_LINE ${START_GROUP})
-if(ENABLE_CDFT)
-  list(APPEND MKL_LIBRARIES ${MKL_CDFT})
-  list(APPEND MKL_LINK_LINE MKL::${MKL_CDFT})
-endif()
-if(MKL_LINK STREQUAL "sdl")
-  list(APPEND MKL_LIBRARIES ${MKL_SDL})
-  list(APPEND MKL_LINK_LINE MKL::${MKL_SDL})
-else()
-  list(APPEND MKL_LIBRARIES ${MKL_IFACE_LIB} ${MKL_THREAD} ${MKL_CORE})
-  list(APPEND MKL_LINK_LINE MKL::${MKL_IFACE_LIB} MKL::${MKL_THREAD} MKL::${MKL_CORE})
-endif()
-if(USE_MPI)
-  list(APPEND MKL_LIBRARIES ${MKL_BLACS})
-  list(APPEND MKL_LINK_LINE MKL::${MKL_BLACS})
-endif()
-list(APPEND MKL_LINK_LINE ${END_GROUP})
-
-# Find all requested libraries
-list(APPEND MKL_REQUESTED_LIBRARIES ${MKL_LIBRARIES})
-if(SYCL_COMPILER)
-  # If SYCL_COMPILER is still ON, MKL_SYCL_ARCH, MKL_SYCL_LINK, and MKL_SYCL_IFACE_LIB are the same as MKL_ARCH, MKL_LINK, and MKL_IFACE_LIB.
-  # Hence we can combine the libraries and find them in the following for loop.
-  # Note that MKL_SYCL_THREADING and MKL_THREADING could be different because of the default value.
-  list(APPEND MKL_REQUESTED_LIBRARIES ${MKL_SYCL_LIBRARIES})
-  list(REMOVE_DUPLICATES MKL_REQUESTED_LIBRARIES)
-endif()
-foreach(lib ${MKL_REQUESTED_LIBRARIES})
-  unset(${lib}_file CACHE)
-  if(MKL_LINK STREQUAL "static" AND NOT ${lib} STREQUAL ${MKL_SDL})
-    find_library(${lib}_file ${LIB_PREFIX}${lib}${LIB_EXT}
-                  PATHS ${MKL_ROOT}
-                  PATH_SUFFIXES "lib${MKL_ARCH_DIR}"
-                  NO_DEFAULT_PATH)
-    add_library(MKL::${lib} STATIC IMPORTED)
-  else()
-    find_library(${lib}_file NAMES ${LIB_PREFIX}${lib}${DLL_EXT} ${lib}
-                  PATHS ${MKL_ROOT}
-                  PATH_SUFFIXES "lib${MKL_ARCH_DIR}"
-                  NO_DEFAULT_PATH)
-    add_library(MKL::${lib} SHARED IMPORTED)
-  endif()
-  if(NOT MKL_LINK STREQUAL "static" AND ${lib} MATCHES "mkl_sycl" AND ${${lib}_file} STREQUAL "${lib}_file-NOTFOUND")
-    list(APPEND MISSED_MKL_SYCL_LIBS ${lib})
-    set(MKL_SYCL_DOMAIN "")
-    string(REGEX REPLACE "mkl_sycl_" "" MKL_SYCL_DOMAIN ${lib})
-    if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
-      string(REGEX REPLACE "d$" "" MKL_SYCL_DOMAIN ${MKL_SYCL_DOMAIN})
-    endif()
-    string(TOUPPER ${MKL_SYCL_DOMAIN} MKL_SYCL_DOMAIN)
-    mkl_message(WARNING "Could NOT find MKL ${lib} for target MKL::MKL_SYCL::${MKL_SYCL_DOMAIN}")
-  else()
-    check_required_vars(${lib}_file)
-    mkl_message(STATUS "Found ${${lib}_file}")
-  endif()
-  # CMP0111, implemented in CMake 3.20+ requires a shared library target on Windows
-  # to be defined with IMPLIB and LOCATION property.
-  # It also requires a static library target to be defined with LOCATION property.
-  # Setting the policy to OLD usage, using cmake_policy() does not work as of 3.20.0, hence the if-else below.
-  if(WIN32 AND NOT MKL_LINK STREQUAL "static")
-    set_target_properties(MKL::${lib} PROPERTIES IMPORTED_IMPLIB "${${lib}_file}")
-    # Find corresponding DLL
-    set(MKL_DLL_GLOB ${lib}.*.dll)
-    file(GLOB MKL_DLL_FILE "${MKL_ROOT}/bin${MKL_ARCH_DIR}/${MKL_DLL_GLOB}"
-        # Legacy oneAPI layout support below
-        "${MKL_ROOT}/redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
-        "${MKL_ROOT}/../redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
-        "${MKL_ROOT}/../redist/${MKL_ARCH}/mkl/${MKL_DLL_GLOB}"
-        # Support for Conda directory layout
-        "${MKL_ROOT}/bin/${MKL_DLL_GLOB}"
-    )
-    if(NOT ${lib} STREQUAL ${MKL_IFACE_LIB} AND NOT ${lib} STREQUAL ${MKL_BLAS95} AND NOT ${lib} STREQUAL ${MKL_LAPACK95})  # Windows IFACE libs are static only
-      list(LENGTH MKL_DLL_FILE MKL_DLL_FILE_LEN)
-      if(MKL_DLL_FILE_LEN)
-        # in case multiple versions of the same dll are found, select the highest version
-        list(SORT MKL_DLL_FILE)
-        list(REVERSE MKL_DLL_FILE)
-        list(GET MKL_DLL_FILE 0 MKL_DLL_FILE)
-
-        mkl_message(STATUS "Found DLL: ${MKL_DLL_FILE}")
-        set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${MKL_DLL_FILE}")
-      else()
-        if(${lib} MATCHES "mkl_sycl" AND ${${lib}_file} STREQUAL "${lib}_file-NOTFOUND")
-          mkl_message(WARNING "Could NOT find ${MKL_DLL_GLOB} for target MKL::MKL_SYCL::${MKL_SYCL_DOMAIN}")
-        else()
-          mkl_message(FATAL_ERROR "${MKL_DLL_GLOB} not found")
-        endif()
-      endif()
-    endif()
-  else()
-    set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${${lib}_file}")
-  endif()
-  list(APPEND MKL_IMPORTED_TARGETS MKL::${lib})
-endforeach()
-
-# Threading selection
-if(MKL_THREADING STREQUAL "tbb_thread" OR MKL_SYCL_THREADING STREQUAL "tbb_thread")
-  find_package(TBB CONFIG COMPONENTS tbb)
-  if(TARGET TBB::tbb)
-    if(MKL_THREADING STREQUAL "tbb_thread")
-      set(MKL_THREAD_LIB $<TARGET_LINKER_FILE:TBB::tbb>)
-      set(MKL_SDL_THREAD_ENV "TBB")
-    endif()
-    if(MKL_SYCL_THREADING STREQUAL "tbb_thread")
-      set(MKL_SYCL_THREAD_LIB $<TARGET_LINKER_FILE:TBB::tbb>)
-    endif()
-    get_property(TBB_LIB TARGET TBB::tbb PROPERTY IMPORTED_LOCATION_RELEASE)
-    get_filename_component(TBB_LIB_DIR ${TBB_LIB} DIRECTORY)
-  else()
-    if(UNIX)
-      set(TBB_LIBNAME libtbb.so)
-    else()
-      set(TBB_LIBNAME tbb.lib)
-    endif()
-    find_path(TBB_LIB_DIR ${TBB_LIBNAME}
-        HINTS $ENV{TBBROOT} $ENV{MKLROOT} ${MKL_ROOT} ${TBB_ROOT}
-        PATH_SUFFIXES "lib" "lib/intel64/gcc4.4" "lib/intel64/gcc4.8"
-                 "../tbb/lib/intel64/gcc4.4" "../tbb/lib/intel64/gcc4.8"
-                 "../../tbb/latest/lib/intel64/gcc4.8"
-                 "../tbb/lib/intel64/vc14" "lib/intel64/vc14"
-    )
-    find_library(TBB_LIBRARIES NAMES tbb
-        HINTS $ENV{TBBROOT} $ENV{MKLROOT} ${MKL_ROOT} ${TBB_ROOT}
-        PATH_SUFFIXES "lib" "lib/intel64/gcc4.4" "lib/intel64/gcc4.8"
-                 "../tbb/lib/intel64/gcc4.4" "../tbb/lib/intel64/gcc4.8"
-                 "../../tbb/latest/lib/intel64/gcc4.8"
-                 "../tbb/lib/intel64/vc14" "lib/intel64/vc14"
-    )
-    include(FindPackageHandleStandardArgs)
-    find_package_handle_standard_args(MKL REQUIRED_VARS TBB_LIBRARIES)
-  endif()
-  if(UNIX)
-    if(CMAKE_SKIP_BUILD_RPATH)
-      set(TBB_LINK "-L${TBB_LIB_DIR} -ltbb")
-    else()
-      set(TBB_LINK "-Wl,-rpath,${TBB_LIB_DIR} -L${TBB_LIB_DIR} -ltbb")
-    endif()
-    if(MKL_THREADING STREQUAL "tbb_thread")
-      list(APPEND MKL_SUPP_LINK ${TBB_LINK})
-      if(APPLE)
-        list(APPEND MKL_SUPP_LINK -lc++)
-      else()
-        list(APPEND MKL_SUPP_LINK -lstdc++)
-      endif()
-    endif()
-    if(MKL_SYCL_THREADING STREQUAL "tbb_thread")
-      list(APPEND MKL_SYCL_SUPP_LINK ${TBB_LINK})
-    endif()
-  endif()
-  if(WIN32 OR APPLE)
-    set(MKL_ENV_PATH ${TBB_LIB_DIR})
-  endif()
-endif()
-if(NOT MKL_THREADING STREQUAL "tbb_thread" AND MKL_THREADING MATCHES "_thread")
-  if(MKL_THREADING STREQUAL "pgi_thread")
-    list(APPEND MKL_SUPP_LINK -mp -pgf90libs)
-    set(MKL_SDL_THREAD_ENV "PGI")
-  elseif(MKL_THREADING STREQUAL "gnu_thread")
-    list(APPEND MKL_SUPP_LINK -lgomp)
-    set(MKL_SDL_THREAD_ENV "GNU")
-  else()
-    # intel_thread
-    if(UNIX)
-      set(MKL_OMP_LIB iomp5)
-      set(LIB_EXT ".so")
-      if(APPLE)
-        set(LIB_EXT ".dylib")
-      endif()
-    else()
-      set(MKL_OMP_LIB libiomp5md)
-    endif()
-    set(MKL_SDL_THREAD_ENV "INTEL")
-    set(OMP_LIBNAME ${LIB_PREFIX}${MKL_OMP_LIB}${LIB_EXT})
-
-    find_library(OMP_LIBRARY ${OMP_LIBNAME}
-      HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} $ENV{CMPLR_ROOT}
-      PATH_SUFFIXES "lib" "lib/${MKL_ARCH}"
-             "lib/${MKL_ARCH}_lin" "lib/${MKL_ARCH}_win"
-             "linux/compiler/lib/${MKL_ARCH}"
-             "linux/compiler/lib/${MKL_ARCH}_lin"
-             "windows/compiler/lib/${MKL_ARCH}"
-             "windows/compiler/lib/${MKL_ARCH}_win"
-             "../compiler/lib/${MKL_ARCH}_lin" "../compiler/lib/${MKL_ARCH}_win"
-             "../compiler/lib/${MKL_ARCH}" "../compiler/lib" "compiler/lib"
-             "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}"
-             "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}_lin"
-             "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}"
-             "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}_win"
-             "../../compiler/latest/mac/compiler/lib"
-      NO_DEFAULT_PATH)
-    if(WIN32)
-      set(OMP_DLLNAME ${LIB_PREFIX}${MKL_OMP_LIB}.dll)
-      find_path(OMP_DLL_DIR ${OMP_DLLNAME}
-        HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} $ENV{CMPLR_ROOT}
-        PATH_SUFFIXES "bin"
-              # Legacy layout support for oneMKL
-              "redist/${MKL_ARCH}"
-              "redist/${MKL_ARCH}_win" "redist/${MKL_ARCH}_win/compiler"
-              "../redist/${MKL_ARCH}/compiler" "../compiler/lib"
-              "../../compiler/latest/windows/redist/${MKL_ARCH}_win"
-              "../../compiler/latest/windows/redist/${MKL_ARCH}_win/compiler"
-              "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win"
-              "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win/compiler"
-        NO_DEFAULT_PATH)
-      check_required_vars(OMP_DLL_DIR)
-      set(MKL_ENV_PATH "${OMP_DLL_DIR}")
-    endif()
-
-    if(WIN32 AND SKIP_LIBPATH)
-      # Only for Intel OpenMP Offload
-      set(OMP_LINK "libiomp5md.lib")
-    else()
-      set(OMP_LINK "${OMP_LIBRARY}")
-      if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
-        # Disable PGI OpenMP runtime for correct work of Intel OpenMP runtime
-        list(APPEND MKL_SUPP_LINK -nomp)
-      endif()
-    endif()
-    check_required_vars(OMP_LIBRARY OMP_LINK)
-    mkl_message(STATUS "Found ${OMP_LIBRARY}")
-    if(MKL_SYCL_THREADING STREQUAL "intel_thread")
-      set(MKL_SYCL_THREAD_LIB ${OMP_LINK})
-    endif()
-    set(MKL_THREAD_LIB ${OMP_LINK})
-  endif()
-elseif(MKL_THREADING STREQUAL "sequential")
-  # Sequential threading
-  set(MKL_SDL_THREAD_ENV "SEQUENTIAL")
-endif() # MKL_THREADING
-
-if(UNIX)
-  if(SYCL_COMPILER)
-    list(APPEND MKL_SYCL_SUPP_LINK -lm -ldl -lpthread)
-  endif()
-  list(APPEND MKL_SUPP_LINK -lm -ldl -lpthread)
-endif()
-
-if(SYCL_COMPILER OR ENABLE_OMP_OFFLOAD)
-  if(WIN32)
-    # Detect sycl library version
-    if(NOT DEFINED SYCL_LIB_VER_CACHE)
-      set(SYCL_LIB_VER "")
-      find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${LIB_EXT}
-        HINTS $ENV{LIB} $ENV{CMPLR_ROOT}
-        PATH_SUFFIXES "windows/lib" "../lib${MKL_ARCH_DIR}")
-      if(NOT SYCL_LIB_DIR)
-        foreach(ver RANGE 6 99)
-          find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${ver}${LIB_EXT}
-            HINTS $ENV{LIB} $ENV{CMPLR_ROOT}
-            PATH_SUFFIXES "windows/lib" "../lib${MKL_ARCH_DIR}")
-          if(SYCL_LIB_DIR)
-            set(SYCL_LIB_VER ${ver})
-            break()
-          endif()
-        endforeach()
-      endif()
-      set(SYCL_LIB_VER_CACHE ${SYCL_LIB_VER} CACHE STRING "")
-    endif()
-
-    if(SYCL_COMPILER)
-      if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
-        list(APPEND MKL_SYCL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}d${LINK_SUFFIX})
-      else()
-        list(APPEND MKL_SYCL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}${LINK_SUFFIX})
-      endif()
-    endif()
-    if(ENABLE_OMP_OFFLOAD)
-      if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
-        list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}d${LINK_SUFFIX})
-      else()
-        list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}${LINK_SUFFIX})
-      endif()
-    endif()
-  else()
-    if(SYCL_COMPILER)
-      list(APPEND MKL_SYCL_SUPP_LINK ${LINK_PREFIX}sycl${LINK_SUFFIX})
-    endif()
-    if(ENABLE_OMP_OFFLOAD)
-      list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${LINK_SUFFIX})
-    endif()
-  endif()
-  if(SYCL_COMPILER)
-    list(APPEND MKL_SYCL_SUPP_LINK ${LINK_PREFIX}OpenCL${LINK_SUFFIX})
-  endif()
-  if(ENABLE_OMP_OFFLOAD)
-    list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}OpenCL${LINK_SUFFIX})
-  endif()
-endif()
-
-# Setup link types based on input options
-set(LINK_TYPES "")
-
-if(SYCL_COMPILER OR ENABLE_OMP_OFFLOAD)
-# Remove missed mkl_sycl libraries in case of incomplete oneMKL package
-  if(MISSED_MKL_SYCL_LIBS)
-    list(REMOVE_ITEM MKL_SYCL_LIBS ${MISSED_MKL_SYCL_LIBS})
-    list(TRANSFORM MISSED_MKL_SYCL_LIBS PREPEND MKL:: OUTPUT_VARIABLE MISSED_MKL_SYCL_TARGETS)
-    list(REMOVE_ITEM MKL_SYCL_LINK_LINE ${MISSED_MKL_SYCL_TARGETS})
-    list(REMOVE_ITEM MKL_LINK_LINE ${MISSED_MKL_SYCL_TARGETS})
-  endif()
-endif()
-
-if(SYCL_COMPILER)
-  if(NOT TARGET MKL::MKL_SYCL)
-    add_library(MKL::MKL_SYCL INTERFACE IMPORTED GLOBAL)
-    add_library(MKL::MKL_DPCPP ALIAS MKL::MKL_SYCL)
-    add_dependencies(MKL::MKL_SYCL MKL_SYCL_MESSAGE)
-  endif()
-  target_compile_options(MKL::MKL_SYCL INTERFACE $<$<COMPILE_LANGUAGE:CXX>:${MKL_SYCL_COPT}>)
-  target_link_libraries(MKL::MKL_SYCL INTERFACE ${MKL_SYCL_LINK_LINE} ${MKL_SYCL_THREAD_LIB} ${MKL_SYCL_SUPP_LINK})
-  list(APPEND LINK_TYPES MKL::MKL_SYCL)
-  foreach(lib ${MKL_SYCL_LIBS})
-    set(MKL_SYCL_DOMAIN "")
-    string(REGEX REPLACE "mkl_sycl_" "" MKL_SYCL_DOMAIN ${lib})
-    if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
-      string(REGEX REPLACE "d$" "" MKL_SYCL_DOMAIN ${MKL_SYCL_DOMAIN})
-    endif()
-    string(TOUPPER ${MKL_SYCL_DOMAIN} MKL_SYCL_DOMAIN)
-    add_library(MKL::MKL_SYCL::${MKL_SYCL_DOMAIN} INTERFACE IMPORTED GLOBAL)
-    add_dependencies(MKL::MKL_SYCL::${MKL_SYCL_DOMAIN} MKL_SYCL_MESSAGE)
-    # Only dynamic link has domain specific libraries
-    # Domain specific targets still use mkl_sycl for static
-    # STREQUAL "${lib}_file-NOTFOUND"
-    if(MKL_LINK STREQUAL "static")
-      target_link_libraries(MKL::MKL_SYCL::${MKL_SYCL_DOMAIN} INTERFACE ${MKL_SYCL_LINK_LINE} ${MKL_SYCL_THREAD_LIB} ${MKL_SYCL_SUPP_LINK})
-    else()
-      list(TRANSFORM MKL_SYCL_LINK_LINE REPLACE ".*mkl_sycl.*" "TBD")
-      list(REMOVE_DUPLICATES MKL_SYCL_LINK_LINE)
-      list(TRANSFORM MKL_SYCL_LINK_LINE REPLACE "TBD" "MKL::${lib}")
-      target_link_libraries(MKL::MKL_SYCL::${MKL_SYCL_DOMAIN} INTERFACE ${MKL_SYCL_LINK_LINE} ${MKL_SYCL_THREAD_LIB} ${MKL_SYCL_SUPP_LINK})
-    endif()
-    list(APPEND LINK_TYPES MKL::MKL_SYCL::${MKL_SYCL_DOMAIN})
-  endforeach(lib) # MKL_SYCL_LIBS
-endif()
-# Single target for all C, Fortran link-lines
-if(NOT TARGET MKL::MKL)
-  add_library(MKL::MKL INTERFACE IMPORTED GLOBAL)
-endif()
-target_compile_options(MKL::MKL INTERFACE
-    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,C>:${MKL_C_COPT}>
-    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,Fortran>:${MKL_F_COPT}>
-    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,CXX>:${MKL_CXX_COPT}>
-    $<IF:$<BOOL:${ENABLE_OMP_OFFLOAD}>,${MKL_OFFLOAD_COPT},>)
-target_link_libraries(MKL::MKL INTERFACE ${MKL_LINK_LINE} ${MKL_THREAD_LIB} ${MKL_SUPP_LINK})
-list(APPEND LINK_TYPES MKL::MKL)
-
-foreach(link ${LINK_TYPES})
-  # Set properties on all INTERFACE targets
-  target_include_directories(${link} BEFORE INTERFACE "${MKL_INCLUDE}")
-  list(APPEND MKL_IMPORTED_TARGETS ${link})
-endforeach(link) # LINK_TYPES
-# oneMKL could be added to implicit directories when it's defined in CPATH
-# In order to avoid dependency on CPATH, remove oneMKL from implicit directories
-if(CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES)
-  list(REMOVE_ITEM CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES "${MKL_INCLUDE}")
-endif()
-if(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES)
-  list(REMOVE_ITEM CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "${MKL_INCLUDE}")
-endif()
-
-if(MKL_LINK STREQUAL "sdl")
-  list(APPEND MKL_ENV "MKL_INTERFACE_LAYER=${MKL_SDL_IFACE_ENV}" "MKL_THREADING_LAYER=${MKL_SDL_THREAD_ENV}")
-endif()
-if(WIN32 AND NOT MKL_LINK STREQUAL "static")
-  list(APPEND MKL_ENV "MKL_BLACS_MPI=${MKL_BLACS_ENV}")
-endif()
-
-# Add oneMKL dynamic libraries if RPATH is not defined on Unix
-if(UNIX AND CMAKE_SKIP_BUILD_RPATH)
-  if(MKL_LINK STREQUAL "sdl")
-    set(MKL_LIB_DIR $<TARGET_FILE_DIR:MKL::${MKL_SDL}>)
-  else()
-    set(MKL_LIB_DIR $<TARGET_FILE_DIR:MKL::${MKL_CORE}>)
-  endif()
-  if(APPLE)
-    list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{DYLD_LIBRARY_PATH}")
-  else()
-    list(APPEND MKL_ENV "LD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{LD_LIBRARY_PATH}")
-  endif()
-endif()
-
-# Add oneMKL dynamic libraries to PATH on Windows
-if(WIN32 AND NOT MKL_LINK STREQUAL "static")
-  get_filename_component(MKL_DLL_DIR ${MKL_DLL_FILE} DIRECTORY)
-  set(MKL_ENV_PATH "${MKL_DLL_DIR}\;${MKL_ENV_PATH}")
-endif()
-
-if(MKL_ENV_PATH)
-  list(APPEND MKL_ENV "PATH=${MKL_ENV_PATH}\;${OLD_PATH}")
-  if(APPLE)
-    list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_ENV_PATH}\:${OLD_PATH}")
-  endif()
-endif()
-
-unset(MKL_DLL_FILE)
-
-endif() # MKL_LIBRARIES
diff --git a/cmake/mkl/MKLConfigVersion.cmake b/cmake/mkl/MKLConfigVersion.cmake
deleted file mode 100755
index 996cd550f..000000000
--- a/cmake/mkl/MKLConfigVersion.cmake
+++ /dev/null
@@ -1,59 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(PACKAGE_VERSION "2023.2.0")
-
-if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
-  set(PACKAGE_VERSION_COMPATIBLE FALSE)
-else()
-
-  if("2023.2.0" MATCHES "^([0-9]+)\\.")
-    set(CVF_VERSION_MAJOR "${CMAKE_MATCH_1}")
-  else()
-    set(CVF_VERSION_MAJOR "2024.0.0")
-  endif()
-
-  if(PACKAGE_FIND_VERSION_MAJOR STREQUAL CVF_VERSION_MAJOR)
-    set(PACKAGE_VERSION_COMPATIBLE TRUE)
-  else()
-    set(PACKAGE_VERSION_COMPATIBLE FALSE)
-  endif()
-
-  if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
-      set(PACKAGE_VERSION_EXACT TRUE)
-  endif()
-endif()
-
-
-
-if("FALSE")
-  return()
-endif()
-
-
-if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "" STREQUAL "")
-  return()
-endif()
-
-
-if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "")
-  math(EXPR installedBits " * 8")
-  set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)")
-  set(PACKAGE_VERSION_UNSUITABLE TRUE)
-endif()
diff --git a/cmake/oneMKLConfig.cmake b/cmake/oneMKLConfig.cmake
deleted file mode 100644
index 5baf9024b..000000000
--- a/cmake/oneMKLConfig.cmake
+++ /dev/null
@@ -1,30 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR})
-include(CMakeFindDependencyMacro)
-
-#find_dependency(MKL REQUIRED)
-# try to search for SYCLConfig first to find compiler. If it's not present, use local FindCompiler.cmake
-find_package(SYCL QUIET)
-if(NOT ${SYCL_FOUND})
-  find_package(Compiler REQUIRED)
-endif()
-
-include("${CMAKE_CURRENT_LIST_DIR}/oneMKLTargets.cmake")
diff --git a/deps/googletest/CMakeLists.txt b/deps/googletest/CMakeLists.txt
deleted file mode 100644
index f6a528753..000000000
--- a/deps/googletest/CMakeLists.txt
+++ /dev/null
@@ -1,325 +0,0 @@
-########################################################################
-# CMake build script for Google Test.
-#
-# To run the tests for Google Test itself on Linux, use 'make test' or
-# ctest.  You can select which tests to run using 'ctest -R regex'.
-# For more options, run 'ctest --help'.
-
-# When other libraries are using a shared version of runtime libraries,
-# Google Test also has to use one.
-option(
-  gtest_force_shared_crt
-  "Use shared (DLL) run-time lib even when Google Test is built as static lib."
-  OFF)
-
-option(gtest_build_tests "Build all of gtest's own tests." OFF)
-
-option(gtest_build_samples "Build gtest's sample programs." OFF)
-
-option(gtest_disable_pthreads "Disable uses of pthreads in gtest." OFF)
-
-option(
-  gtest_hide_internal_symbols
-  "Build gtest with internal symbols hidden in shared libraries."
-  OFF)
-
-# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build().
-include(cmake/hermetic_build.cmake OPTIONAL)
-
-if (COMMAND pre_project_set_up_hermetic_build)
-  pre_project_set_up_hermetic_build()
-endif()
-
-########################################################################
-#
-# Project-wide settings
-
-# Name of the project.
-#
-# CMake files in this project can refer to the root source directory
-# as ${gtest_SOURCE_DIR} and to the root binary directory as
-# ${gtest_BINARY_DIR}.
-# Language "C" is required for find_package(Threads).
-
-# Project version:
-
-if (CMAKE_VERSION VERSION_LESS 3.0)
-  project(gtest CXX C)
-  set(PROJECT_VERSION ${GOOGLETEST_VERSION})
-else()
-  cmake_policy(SET CMP0048 NEW)
-  project(gtest VERSION 1.8.1 LANGUAGES CXX C)
-endif()
-cmake_minimum_required(VERSION 2.6.4)
-
-if (POLICY CMP0063) # Visibility
-  cmake_policy(SET CMP0063 NEW)
-endif (POLICY CMP0063)
-
-if (COMMAND set_up_hermetic_build)
-  set_up_hermetic_build()
-endif()
-
-# These commands only run if this is the main project
-if(CMAKE_PROJECT_NAME STREQUAL "gtest" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution")
-
-  # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to
-  # make it prominent in the GUI.
-  option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF)
-
-else()
-
-  mark_as_advanced(
-    gtest_force_shared_crt
-    gtest_build_tests
-    gtest_build_samples
-    gtest_disable_pthreads
-    gtest_hide_internal_symbols)
-
-endif()
-
-
-if (gtest_hide_internal_symbols)
-  set(CMAKE_CXX_VISIBILITY_PRESET hidden)
-  set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
-endif()
-
-# Define helper functions and macros used by Google Test.
-include(cmake/internal_utils.cmake)
-
-config_compiler_and_linker()  # Defined in internal_utils.cmake.
-
-# Create the CMake package file descriptors.
-if (INSTALL_GTEST)
-  include(CMakePackageConfigHelpers)
-  set(cmake_package_name GTest)
-  set(targets_export_name ${cmake_package_name}Targets CACHE INTERNAL "")
-  set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated" CACHE INTERNAL "")
-  set(cmake_files_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${cmake_package_name}")
-  set(version_file "${generated_dir}/${cmake_package_name}ConfigVersion.cmake")
-  write_basic_package_version_file(${version_file} COMPATIBILITY AnyNewerVersion)
-  install(EXPORT ${targets_export_name}
-    NAMESPACE ${cmake_package_name}::
-    DESTINATION ${cmake_files_install_dir})
-  set(config_file "${generated_dir}/${cmake_package_name}Config.cmake")
-  configure_package_config_file("${gtest_SOURCE_DIR}/cmake/Config.cmake.in"
-    "${config_file}" INSTALL_DESTINATION ${cmake_files_install_dir})
-  install(FILES ${version_file} ${config_file}
-    DESTINATION ${cmake_files_install_dir})
-endif()
-
-# Where Google Test's .h files can be found.
-set(gtest_build_include_dirs
-  "${gtest_SOURCE_DIR}/include"
-  "${gtest_SOURCE_DIR}")
-include_directories(${gtest_build_include_dirs})
-
-########################################################################
-#
-# Defines the gtest & gtest_main libraries.  User tests should link
-# with one of them.
-
-# Google Test libraries.  We build them using more strict warnings than what
-# are used for other targets, to ensure that gtest can be compiled by a user
-# aggressive about warnings.
-cxx_library(gtest "${cxx_strict}" src/gtest-all.cc)
-cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc)
-# If the CMake version supports it, attach header directory information
-# to the targets for when we are part of a parent build (ie being pulled
-# in via add_subdirectory() rather than being a standalone build).
-if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
-  target_include_directories(gtest SYSTEM INTERFACE
-    "$<BUILD_INTERFACE:${gtest_build_include_dirs}>"
-    "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
-  target_include_directories(gtest_main SYSTEM INTERFACE
-    "$<BUILD_INTERFACE:${gtest_build_include_dirs}>"
-    "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>")
-endif()
-target_link_libraries(gtest_main PUBLIC gtest)
-
-########################################################################
-#
-# Install rules
-install_project(gtest gtest_main)
-
-########################################################################
-#
-# Samples on how to link user tests with gtest or gtest_main.
-#
-# They are not built by default.  To build them, set the
-# gtest_build_samples option to ON.  You can do it by running ccmake
-# or specifying the -Dgtest_build_samples=ON flag when running cmake.
-
-if (gtest_build_samples)
-  cxx_executable(sample1_unittest samples gtest_main samples/sample1.cc)
-  cxx_executable(sample2_unittest samples gtest_main samples/sample2.cc)
-  cxx_executable(sample3_unittest samples gtest_main)
-  cxx_executable(sample4_unittest samples gtest_main samples/sample4.cc)
-  cxx_executable(sample5_unittest samples gtest_main samples/sample1.cc)
-  cxx_executable(sample6_unittest samples gtest_main)
-  cxx_executable(sample7_unittest samples gtest_main)
-  cxx_executable(sample8_unittest samples gtest_main)
-  cxx_executable(sample9_unittest samples gtest)
-  cxx_executable(sample10_unittest samples gtest)
-endif()
-
-########################################################################
-#
-# Google Test's own tests.
-#
-# You can skip this section if you aren't interested in testing
-# Google Test itself.
-#
-# The tests are not built by default.  To build them, set the
-# gtest_build_tests option to ON.  You can do it by running ccmake
-# or specifying the -Dgtest_build_tests=ON flag when running cmake.
-
-if (gtest_build_tests)
-  # This must be set in the root directory for the tests to be run by
-  # 'make test' or ctest.
-  enable_testing()
-
-  if (WIN32)
-    file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/RunTest.ps1"
-         CONTENT
-"$project_bin = \"${CMAKE_BINARY_DIR}/bin/$<CONFIG>\"
-$env:Path = \"$project_bin;$env:Path\"
-& $args")
-  elseif (MINGW)
-    file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/RunTest.ps1"
-         CONTENT
-"$project_bin = (cygpath --windows ${CMAKE_BINARY_DIR}/bin)
-$env:Path = \"$project_bin;$env:Path\"
-& $args")
-  endif()
-
-  ############################################################
-  # C++ tests built with standard compiler flags.
-
-  cxx_test(googletest-death-test-test gtest_main)
-  cxx_test(gtest_environment_test gtest)
-  cxx_test(googletest-filepath-test gtest_main)
-  cxx_test(googletest-listener-test gtest_main)
-  cxx_test(gtest_main_unittest gtest_main)
-  cxx_test(googletest-message-test gtest_main)
-  cxx_test(gtest_no_test_unittest gtest)
-  cxx_test(googletest-options-test gtest_main)
-  cxx_test(googletest-param-test-test gtest
-    test/googletest-param-test2-test.cc)
-  cxx_test(googletest-port-test gtest_main)
-  cxx_test(gtest_pred_impl_unittest gtest_main)
-  cxx_test(gtest_premature_exit_test gtest
-    test/gtest_premature_exit_test.cc)
-  cxx_test(googletest-printers-test gtest_main)
-  cxx_test(gtest_prod_test gtest_main
-    test/production.cc)
-  cxx_test(gtest_repeat_test gtest)
-  cxx_test(gtest_sole_header_test gtest_main)
-  cxx_test(gtest_stress_test gtest)
-  cxx_test(googletest-test-part-test gtest_main)
-  cxx_test(gtest_throw_on_failure_ex_test gtest)
-  cxx_test(gtest-typed-test_test gtest_main
-    test/gtest-typed-test2_test.cc)
-  cxx_test(gtest_unittest gtest_main)
-  cxx_test(gtest-unittest-api_test gtest)
-  cxx_test(gtest_skip_in_environment_setup_test gtest_main)
-  cxx_test(gtest_skip_test gtest_main)
-
-  ############################################################
-  # C++ tests built with non-standard compiler flags.
-
-  # MSVC 7.1 does not support STL with exceptions disabled.
-  if (NOT MSVC OR MSVC_VERSION GREATER 1310)
-    cxx_library(gtest_no_exception "${cxx_no_exception}"
-      src/gtest-all.cc)
-    cxx_library(gtest_main_no_exception "${cxx_no_exception}"
-      src/gtest-all.cc src/gtest_main.cc)
-  endif()
-  cxx_library(gtest_main_no_rtti "${cxx_no_rtti}"
-    src/gtest-all.cc src/gtest_main.cc)
-
-  cxx_test_with_flags(gtest-death-test_ex_nocatch_test
-    "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=0"
-    gtest test/googletest-death-test_ex_test.cc)
-  cxx_test_with_flags(gtest-death-test_ex_catch_test
-    "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=1"
-    gtest test/googletest-death-test_ex_test.cc)
-
-  cxx_test_with_flags(gtest_no_rtti_unittest "${cxx_no_rtti}"
-    gtest_main_no_rtti test/gtest_unittest.cc)
-
-  cxx_shared_library(gtest_dll "${cxx_default}"
-    src/gtest-all.cc src/gtest_main.cc)
-
-  cxx_executable_with_flags(gtest_dll_test_ "${cxx_default}"
-    gtest_dll test/gtest_all_test.cc)
-  set_target_properties(gtest_dll_test_
-                        PROPERTIES
-                        COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
-
-  ############################################################
-  # Python tests.
-
-  cxx_executable(googletest-break-on-failure-unittest_ test gtest)
-  py_test(googletest-break-on-failure-unittest)
-
-  py_test(gtest_skip_environment_check_output_test)
-
-  # Visual Studio .NET 2003 does not support STL with exceptions disabled.
-  if (NOT MSVC OR MSVC_VERSION GREATER 1310)  # 1310 is Visual Studio .NET 2003
-    cxx_executable_with_flags(
-      googletest-catch-exceptions-no-ex-test_
-      "${cxx_no_exception}"
-      gtest_main_no_exception
-      test/googletest-catch-exceptions-test_.cc)
-  endif()
-
-  cxx_executable_with_flags(
-    googletest-catch-exceptions-ex-test_
-    "${cxx_exception}"
-    gtest_main
-    test/googletest-catch-exceptions-test_.cc)
-  py_test(googletest-catch-exceptions-test)
-
-  cxx_executable(googletest-color-test_ test gtest)
-  py_test(googletest-color-test)
-
-  cxx_executable(googletest-env-var-test_ test gtest)
-  py_test(googletest-env-var-test)
-
-  cxx_executable(googletest-filter-unittest_ test gtest)
-  py_test(googletest-filter-unittest)
-
-  cxx_executable(gtest_help_test_ test gtest_main)
-  py_test(gtest_help_test)
-
-  cxx_executable(googletest-list-tests-unittest_ test gtest)
-  py_test(googletest-list-tests-unittest)
-
-  cxx_executable(googletest-output-test_ test gtest)
-  py_test(googletest-output-test --no_stacktrace_support)
-
-  cxx_executable(googletest-shuffle-test_ test gtest)
-  py_test(googletest-shuffle-test)
-
-  # MSVC 7.1 does not support STL with exceptions disabled.
-  if (NOT MSVC OR MSVC_VERSION GREATER 1310)
-    cxx_executable(googletest-throw-on-failure-test_ test gtest_no_exception)
-    set_target_properties(googletest-throw-on-failure-test_
-      PROPERTIES
-      COMPILE_FLAGS "${cxx_no_exception}")
-    py_test(googletest-throw-on-failure-test)
-  endif()
-
-  cxx_executable(googletest-uninitialized-test_ test gtest)
-  py_test(googletest-uninitialized-test)
-
-  cxx_executable(gtest_xml_outfile1_test_ test gtest_main)
-  cxx_executable(gtest_xml_outfile2_test_ test gtest_main)
-  py_test(gtest_xml_outfiles_test)
-  py_test(googletest-json-outfiles-test)
-
-  cxx_executable(gtest_xml_output_unittest_ test gtest)
-  py_test(gtest_xml_output_unittest --no_stacktrace_support)
-  py_test(googletest-json-output-unittest --no_stacktrace_support)
-endif()
diff --git a/deps/googletest/CONTRIBUTORS b/deps/googletest/CONTRIBUTORS
deleted file mode 100644
index feae2fc04..000000000
--- a/deps/googletest/CONTRIBUTORS
+++ /dev/null
@@ -1,37 +0,0 @@
-# This file contains a list of people who've made non-trivial
-# contribution to the Google C++ Testing Framework project.  People
-# who commit code to the project are encouraged to add their names
-# here.  Please keep the list sorted by first names.
-
-Ajay Joshi <jaj@google.com>
-Balázs Dán <balazs.dan@gmail.com>
-Bharat Mediratta <bharat@menalto.com>
-Chandler Carruth <chandlerc@google.com>
-Chris Prince <cprince@google.com>
-Chris Taylor <taylorc@google.com>
-Dan Egnor <egnor@google.com>
-Eric Roman <eroman@chromium.org>
-Hady Zalek <hady.zalek@gmail.com>
-Jeffrey Yasskin <jyasskin@google.com>
-Jói Sigurðsson <joi@google.com>
-Keir Mierle <mierle@gmail.com>
-Keith Ray <keith.ray@gmail.com>
-Kenton Varda <kenton@google.com>
-Manuel Klimek <klimek@google.com>
-Markus Heule <markus.heule@gmail.com>
-Mika Raento <mikie@iki.fi>
-Miklós Fazekas <mfazekas@szemafor.com>
-Pasi Valminen <pasi.valminen@gmail.com>
-Patrick Hanna <phanna@google.com>
-Patrick Riley <pfr@google.com>
-Peter Kaminski <piotrk@google.com>
-Preston Jackson <preston.a.jackson@gmail.com>
-Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
-Russ Cox <rsc@google.com>
-Russ Rufer <russ@pentad.com>
-Sean Mcafee <eefacm@gmail.com>
-Sigurður Ásgeirsson <siggi@google.com>
-Tracy Bialik <tracy@pentad.com>
-Vadim Berman <vadimb@google.com>
-Vlad Losev <vladl@google.com>
-Zhanyong Wan <wan@google.com>
diff --git a/deps/googletest/LICENSE b/deps/googletest/LICENSE
deleted file mode 100644
index 1941a11f8..000000000
--- a/deps/googletest/LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright 2008, Google Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-    * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-    * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/deps/googletest/cmake/Config.cmake.in b/deps/googletest/cmake/Config.cmake.in
deleted file mode 100644
index 12be4498b..000000000
--- a/deps/googletest/cmake/Config.cmake.in
+++ /dev/null
@@ -1,9 +0,0 @@
-@PACKAGE_INIT@
-include(CMakeFindDependencyMacro)
-if (@GTEST_HAS_PTHREAD@)
-  set(THREADS_PREFER_PTHREAD_FLAG @THREADS_PREFER_PTHREAD_FLAG@)
-  find_dependency(Threads)
-endif()
-
-include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
-check_required_components("@project_name@")
diff --git a/deps/googletest/cmake/gtest.pc.in b/deps/googletest/cmake/gtest.pc.in
deleted file mode 100644
index 9aae29e26..000000000
--- a/deps/googletest/cmake/gtest.pc.in
+++ /dev/null
@@ -1,10 +0,0 @@
-prefix=${pcfiledir}/../..
-libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
-includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
-
-Name: gtest
-Description: GoogleTest (without main() function)
-Version: @PROJECT_VERSION@
-URL: https://github.com/google/googletest
-Libs: -L${libdir} -lgtest @CMAKE_THREAD_LIBS_INIT@
-Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/deps/googletest/cmake/gtest_main.pc.in b/deps/googletest/cmake/gtest_main.pc.in
deleted file mode 100644
index 915f2973a..000000000
--- a/deps/googletest/cmake/gtest_main.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=${pcfiledir}/../..
-libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
-includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
-
-Name: gtest_main
-Description: GoogleTest (with main() function)
-Version: @PROJECT_VERSION@
-URL: https://github.com/google/googletest
-Requires: gtest
-Libs: -L${libdir} -lgtest_main @CMAKE_THREAD_LIBS_INIT@
-Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/deps/googletest/cmake/internal_utils.cmake b/deps/googletest/cmake/internal_utils.cmake
deleted file mode 100644
index b7b4fdc88..000000000
--- a/deps/googletest/cmake/internal_utils.cmake
+++ /dev/null
@@ -1,360 +0,0 @@
-# Defines functions and macros useful for building Google Test and
-# Google Mock.
-#
-# Note:
-#
-# - This file will be run twice when building Google Mock (once via
-#   Google Test's CMakeLists.txt, and once via Google Mock's).
-#   Therefore it shouldn't have any side effects other than defining
-#   the functions and macros.
-#
-# - The functions/macros defined in this file may depend on Google
-#   Test and Google Mock's option() definitions, and thus must be
-#   called *after* the options have been defined.
-
-# Tweaks CMake's default compiler/linker settings to suit Google Test's needs.
-#
-# This must be a macro(), as inside a function string() can only
-# update variables in the function scope.
-macro(fix_default_compiler_settings_)
-  if (MSVC)
-    # For MSVC, CMake sets certain flags to defaults we want to override.
-    # This replacement code is taken from sample in the CMake Wiki at
-    # https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace.
-    foreach (flag_var
-             CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-             CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
-             CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-             CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if (NOT BUILD_SHARED_LIBS AND NOT gtest_force_shared_crt)
-        # When Google Test is built as a shared library, it should also use
-        # shared runtime libraries.  Otherwise, it may end up with multiple
-        # copies of runtime library data in different modules, resulting in
-        # hard-to-find crashes. When it is built as a static library, it is
-        # preferable to use CRT as static libraries, as we don't have to rely
-        # on CRT DLLs being available. CMake always defaults to using shared
-        # CRT libraries, so we override that default here.
-        string(REGEX REPLACE "/M[TD]+[d]*" "" ${flag_var} "${${flag_var}}")
-      endif()
-
-      # We prefer more strict warning checking for building Google Test.
-      # Replaces /W3 with /W4 in defaults.
-      string(REPLACE "/W3" "/W4" ${flag_var} "${${flag_var}}")
-
-      # Prevent D9025 warning for targets that have exception handling
-      # turned off (/EHs-c- flag). Where required, exceptions are explicitly
-      # re-enabled using the cxx_exception_flags variable.
-      string(REPLACE "/EHsc" "" ${flag_var} "${${flag_var}}")
-    endforeach()
-  endif()
-endmacro()
-
-# Defines the compiler/linker flags used to build Google Test and
-# Google Mock.  You can tweak these definitions to suit your need.  A
-# variable's value is empty before it's explicitly assigned to.
-macro(config_compiler_and_linker)
-  # Note: pthreads on MinGW is not supported, even if available
-  # instead, we use windows threading primitives
-  unset(GTEST_HAS_PTHREAD)
-  if (NOT gtest_disable_pthreads AND NOT MINGW)
-    # Defines CMAKE_USE_PTHREADS_INIT and CMAKE_THREAD_LIBS_INIT.
-    find_package(Threads)
-    if (CMAKE_USE_PTHREADS_INIT)
-      set(GTEST_HAS_PTHREAD ON)
-    endif()
-  endif()
-
-  fix_default_compiler_settings_()
-  if (MSVC)
-    # Newlines inside flags variables break CMake's NMake generator.
-    # TODO(vladl@google.com): Add -RTCs and -RTCu to debug builds.
-    set(cxx_base_flags "-GS -W4 -WX -wd4251 -wd4275 -nologo -J -Zi")
-    set(cxx_base_flags "${cxx_base_flags} -D_UNICODE -DUNICODE -DWIN32 -D_WIN32")
-    set(cxx_base_flags "${cxx_base_flags} -DSTRICT -DWIN32_LEAN_AND_MEAN")
-    set(cxx_exception_flags "-EHsc -D_HAS_EXCEPTIONS=1 -DGTEST_HAS_SEH=0")
-    set(cxx_no_exception_flags "-EHs-c- -D_HAS_EXCEPTIONS=0")
-    set(cxx_no_rtti_flags "-GR-")
-    # Suppress "unreachable code" warning
-    # http://stackoverflow.com/questions/3232669 explains the issue.
-    set(cxx_base_flags "${cxx_base_flags} -wd4702")
-  elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-# TODO: add Werror back when LLVM compiler stop generate warnings on CUDA 10.2
-#    set(cxx_base_flags "-Wall -Wshadow -Werror -Wconversion")
-    set(cxx_base_flags "-Wall -Wshadow -Wconversion")
-    set(cxx_exception_flags "-fexceptions")
-    set(cxx_no_exception_flags "-fno-exceptions")
-    set(cxx_strict_flags "-W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Winline -Wredundant-decls")
-    set(cxx_no_rtti_flags "-fno-rtti")
-  elseif (CMAKE_COMPILER_IS_GNUCXX)
-    set(cxx_base_flags "-Wall -Wshadow -Werror")
-    if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0)
-      set(cxx_base_flags "${cxx_base_flags} -Wno-error=dangling-else")
-    endif()
-    set(cxx_exception_flags "-fexceptions")
-    set(cxx_no_exception_flags "-fno-exceptions")
-    # Until version 4.3.2, GCC doesn't define a macro to indicate
-    # whether RTTI is enabled.  Therefore we define GTEST_HAS_RTTI
-    # explicitly.
-    set(cxx_no_rtti_flags "-fno-rtti -DGTEST_HAS_RTTI=0")
-    set(cxx_strict_flags
-      "-Wextra -Wno-unused-parameter -Wno-missing-field-initializers")
-  elseif (CMAKE_CXX_COMPILER_ID STREQUAL "SunPro")
-    set(cxx_exception_flags "-features=except")
-    # Sun Pro doesn't provide macros to indicate whether exceptions and
-    # RTTI are enabled, so we define GTEST_HAS_* explicitly.
-    set(cxx_no_exception_flags "-features=no%except -DGTEST_HAS_EXCEPTIONS=0")
-    set(cxx_no_rtti_flags "-features=no%rtti -DGTEST_HAS_RTTI=0")
-  elseif (CMAKE_CXX_COMPILER_ID STREQUAL "VisualAge" OR
-      CMAKE_CXX_COMPILER_ID STREQUAL "XL")
-    # CMake 2.8 changes Visual Age's compiler ID to "XL".
-    set(cxx_exception_flags "-qeh")
-    set(cxx_no_exception_flags "-qnoeh")
-    # Until version 9.0, Visual Age doesn't define a macro to indicate
-    # whether RTTI is enabled.  Therefore we define GTEST_HAS_RTTI
-    # explicitly.
-    set(cxx_no_rtti_flags "-qnortti -DGTEST_HAS_RTTI=0")
-  elseif (CMAKE_CXX_COMPILER_ID STREQUAL "HP")
-    set(cxx_base_flags "-AA -mt")
-    set(cxx_exception_flags "-DGTEST_HAS_EXCEPTIONS=1")
-    set(cxx_no_exception_flags "+noeh -DGTEST_HAS_EXCEPTIONS=0")
-    # RTTI can not be disabled in HP aCC compiler.
-    set(cxx_no_rtti_flags "")
-  endif()
-
-  # The pthreads library is available and allowed?
-  if (DEFINED GTEST_HAS_PTHREAD)
-    set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=1")
-  else()
-    set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=0")
-  endif()
-  set(cxx_base_flags "${cxx_base_flags} ${GTEST_HAS_PTHREAD_MACRO}")
-
-  # For building gtest's own tests and samples.
-  set(cxx_exception "${cxx_base_flags} ${cxx_exception_flags}")
-  set(cxx_no_exception
-    "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_no_exception_flags}")
-  if(NOT WIN32)
-  set(cxx_default "${cxx_exception}")
-  else()
-  set(cxx_default "${cxx_exception} /MD")
-  endif()
-  set(cxx_no_rtti "${cxx_default} ${cxx_no_rtti_flags}")
-
-  # For building the gtest libraries.
-  set(cxx_strict "${cxx_default} ${cxx_strict_flags}")
-endmacro()
-
-# Defines the gtest & gtest_main libraries.  User tests should link
-# with one of them.
-function(cxx_library_with_type name type cxx_flags)
-  # type can be either STATIC or SHARED to denote a static or shared library.
-  # ARGN refers to additional arguments after 'cxx_flags'.
-  add_library(${name} ${type} ${ARGN})
-  set_target_properties(${name}
-    PROPERTIES
-    COMPILE_FLAGS "${cxx_flags}")
-  # Generate debug library name with a postfix.
-  set_target_properties(${name}
-    PROPERTIES
-    DEBUG_POSTFIX "d")
-  # Set the output directory for build artifacts
-  set_target_properties(${name}
-    PROPERTIES
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
-  # make PDBs match library name
-  get_target_property(pdb_debug_postfix ${name} DEBUG_POSTFIX)
-  set_target_properties(${name}
-    PROPERTIES
-    PDB_NAME "${name}"
-    PDB_NAME_DEBUG "${name}${pdb_debug_postfix}"
-    COMPILE_PDB_NAME "${name}"
-    COMPILE_PDB_NAME_DEBUG "${name}${pdb_debug_postfix}")
-
-  if (BUILD_SHARED_LIBS OR type STREQUAL "SHARED")
-    set_target_properties(${name}
-      PROPERTIES
-      COMPILE_DEFINITIONS "GTEST_CREATE_SHARED_LIBRARY=1")
-    if (NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
-      target_compile_definitions(${name} INTERFACE
-        $<INSTALL_INTERFACE:GTEST_LINKED_AS_SHARED_LIBRARY=1>)
-    endif()
-  endif()
-  if (DEFINED GTEST_HAS_PTHREAD)
-    if ("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
-      set(threads_spec ${CMAKE_THREAD_LIBS_INIT})
-    else()
-      set(threads_spec Threads::Threads)
-    endif()
-    target_link_libraries(${name} PUBLIC ${threads_spec})
-  endif()
-endfunction()
-
-########################################################################
-#
-# Helper functions for creating build targets.
-
-function(cxx_shared_library name cxx_flags)
-  cxx_library_with_type(${name} SHARED "${cxx_flags}" ${ARGN})
-endfunction()
-
-function(cxx_library name cxx_flags)
-  cxx_library_with_type(${name} "" "${cxx_flags}" ${ARGN})
-endfunction()
-
-# cxx_executable_with_flags(name cxx_flags libs srcs...)
-#
-# creates a named C++ executable that depends on the given libraries and
-# is built from the given source files with the given compiler flags.
-function(cxx_executable_with_flags name cxx_flags libs)
-  add_executable(${name} ${ARGN})
-  if (MSVC)
-    # BigObj required for tests.
-    set(cxx_flags "${cxx_flags} -bigobj")
-  endif()
-  if (cxx_flags)
-    set_target_properties(${name}
-      PROPERTIES
-      COMPILE_FLAGS "${cxx_flags}")
-  endif()
-  if (BUILD_SHARED_LIBS)
-    set_target_properties(${name}
-      PROPERTIES
-      COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
-  endif()
-  # To support mixing linking in static and dynamic libraries, link each
-  # library in with an extra call to target_link_libraries.
-  foreach (lib "${libs}")
-    target_link_libraries(${name} ${lib})
-  endforeach()
-endfunction()
-
-# cxx_executable(name dir lib srcs...)
-#
-# creates a named target that depends on the given libs and is built
-# from the given source files.  dir/name.cc is implicitly included in
-# the source file list.
-function(cxx_executable name dir libs)
-  cxx_executable_with_flags(
-    ${name} "${cxx_default}" "${libs}" "${dir}/${name}.cc" ${ARGN})
-endfunction()
-
-# Sets PYTHONINTERP_FOUND and PYTHON_EXECUTABLE.
-find_package(PythonInterp)
-
-# cxx_test_with_flags(name cxx_flags libs srcs...)
-#
-# creates a named C++ test that depends on the given libs and is built
-# from the given source files with the given compiler flags.
-function(cxx_test_with_flags name cxx_flags libs)
-  cxx_executable_with_flags(${name} "${cxx_flags}" "${libs}" ${ARGN})
-  if (WIN32 OR MINGW)
-    add_test(NAME ${name}
-      COMMAND "powershell" "-Command" "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/RunTest.ps1" "$<TARGET_FILE:${name}>")
-  else()
-    add_test(NAME ${name}
-      COMMAND "$<TARGET_FILE:${name}>")
-  endif()
-endfunction()
-
-# cxx_test(name libs srcs...)
-#
-# creates a named test target that depends on the given libs and is
-# built from the given source files.  Unlike cxx_test_with_flags,
-# test/name.cc is already implicitly included in the source file list.
-function(cxx_test name libs)
-  cxx_test_with_flags("${name}" "${cxx_default}" "${libs}"
-    "test/${name}.cc" ${ARGN})
-endfunction()
-
-# py_test(name)
-#
-# creates a Python test with the given name whose main module is in
-# test/name.py.  It does nothing if Python is not installed.
-function(py_test name)
-  if (PYTHONINTERP_FOUND)
-    if ("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" VERSION_GREATER 3.1)
-      if (CMAKE_CONFIGURATION_TYPES)
-        # Multi-configuration build generators as for Visual Studio save
-        # output in a subdirectory of CMAKE_CURRENT_BINARY_DIR (Debug,
-        # Release etc.), so we have to provide it here.
-        if (WIN32 OR MINGW)
-          add_test(NAME ${name}
-            COMMAND powershell -Command ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/RunTest.ps1
-              ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-              --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG> ${ARGN})
-        else()
-          add_test(NAME ${name}
-            COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-              --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG> ${ARGN})
-        endif()
-      else (CMAKE_CONFIGURATION_TYPES)
-        # Single-configuration build generators like Makefile generators
-        # don't have subdirs below CMAKE_CURRENT_BINARY_DIR.
-        if (WIN32 OR MINGW)
-          add_test(NAME ${name}
-            COMMAND powershell -Command ${CMAKE_CURRENT_BINARY_DIR}/RunTest.ps1
-              ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-              --build_dir=${CMAKE_CURRENT_BINARY_DIR} ${ARGN})
-        else()
-          add_test(NAME ${name}
-            COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-              --build_dir=${CMAKE_CURRENT_BINARY_DIR} ${ARGN})
-        endif()
-      endif (CMAKE_CONFIGURATION_TYPES)
-    else()
-      # ${CMAKE_CURRENT_BINARY_DIR} is known at configuration time, so we can
-      # directly bind it from cmake. ${CTEST_CONFIGURATION_TYPE} is known
-      # only at ctest runtime (by calling ctest -c <Configuration>), so
-      # we have to escape $ to delay variable substitution here.
-      if (WIN32 OR MINGW)
-        add_test(NAME ${name}
-          COMMAND powershell -Command ${CMAKE_CURRENT_BINARY_DIR}/RunTest.ps1
-            ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-            --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE} ${ARGN})
-      else()
-        add_test(NAME ${name}
-          COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
-            --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE} ${ARGN})
-      endif()
-    endif()
-  endif(PYTHONINTERP_FOUND)
-endfunction()
-
-# install_project(targets...)
-#
-# Installs the specified targets and configures the associated pkgconfig files.
-function(install_project)
-  if(INSTALL_GTEST)
-    install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/"
-      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
-    # Install the project targets.
-    install(TARGETS ${ARGN}
-      EXPORT ${targets_export_name}
-      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
-      ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-      LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
-    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-      # Install PDBs
-      foreach(t ${ARGN})
-        get_target_property(t_pdb_name ${t} COMPILE_PDB_NAME)
-        get_target_property(t_pdb_name_debug ${t} COMPILE_PDB_NAME_DEBUG)
-        get_target_property(t_pdb_output_directory ${t} PDB_OUTPUT_DIRECTORY)
-        install(FILES
-          "${t_pdb_output_directory}/\${CMAKE_INSTALL_CONFIG_NAME}/$<$<CONFIG:Debug>:${t_pdb_name_debug}>$<$<NOT:$<CONFIG:Debug>>:${t_pdb_name}>.pdb"
-          DESTINATION ${CMAKE_INSTALL_LIBDIR}
-          OPTIONAL)
-      endforeach()
-    endif()
-    # Configure and install pkgconfig files.
-    foreach(t ${ARGN})
-      set(configured_pc "${generated_dir}/${t}.pc")
-      configure_file("${PROJECT_SOURCE_DIR}/cmake/${t}.pc.in"
-        "${configured_pc}" @ONLY)
-      install(FILES "${configured_pc}"
-        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
-    endforeach()
-  endif()
-endfunction()
diff --git a/deps/googletest/cmake/libgtest.la.in b/deps/googletest/cmake/libgtest.la.in
deleted file mode 100644
index 840c83885..000000000
--- a/deps/googletest/cmake/libgtest.la.in
+++ /dev/null
@@ -1,21 +0,0 @@
-# libgtest.la - a libtool library file
-# Generated by libtool (GNU libtool) 2.4.6
-
-# Please DO NOT delete this file!
-# It is necessary for linking the library.
-
-# Names of this library.
-library_names='libgtest.so'
-
-# Is this an already installed library?
-installed=yes
-
-# Should we warn about portability when linking against -modules?
-shouldnotlink=no
-
-# Files to dlopen/dlpreopen
-dlopen=''
-dlpreopen=''
-
-# Directory that this library needs to be installed in:
-libdir='@CMAKE_INSTALL_FULL_LIBDIR@'
diff --git a/deps/googletest/include/gtest/gtest-death-test.h b/deps/googletest/include/gtest/gtest-death-test.h
deleted file mode 100644
index 0eb5b279f..000000000
--- a/deps/googletest/include/gtest/gtest-death-test.h
+++ /dev/null
@@ -1,343 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This header file defines the public API for death tests.  It is
-// #included by gtest.h so a user doesn't need to include this
-// directly.
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-
-#include "gtest/internal/gtest-death-test-internal.h"
-
-namespace testing {
-
-// This flag controls the style of death tests.  Valid values are "threadsafe",
-// meaning that the death test child process will re-execute the test binary
-// from the start, running only a single death test, or "fast",
-// meaning that the child process will execute the test logic immediately
-// after forking.
-GTEST_DECLARE_string_(death_test_style);
-
-#if GTEST_HAS_DEATH_TEST
-
-namespace internal {
-
-// Returns a Boolean value indicating whether the caller is currently
-// executing in the context of the death test child process.  Tools such as
-// Valgrind heap checkers may need this to modify their behavior in death
-// tests.  IMPORTANT: This is an internal utility.  Using it may break the
-// implementation of death tests.  User code MUST NOT use it.
-GTEST_API_ bool InDeathTestChild();
-
-}  // namespace internal
-
-// The following macros are useful for writing death tests.
-
-// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
-// executed:
-//
-//   1. It generates a warning if there is more than one active
-//   thread.  This is because it's safe to fork() or clone() only
-//   when there is a single thread.
-//
-//   2. The parent process clone()s a sub-process and runs the death
-//   test in it; the sub-process exits with code 0 at the end of the
-//   death test, if it hasn't exited already.
-//
-//   3. The parent process waits for the sub-process to terminate.
-//
-//   4. The parent process checks the exit code and error message of
-//   the sub-process.
-//
-// Examples:
-//
-//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
-//   for (int i = 0; i < 5; i++) {
-//     EXPECT_DEATH(server.ProcessRequest(i),
-//                  "Invalid request .* in ProcessRequest()")
-//                  << "Failed to die on request " << i;
-//   }
-//
-//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
-//
-//   bool KilledBySIGHUP(int exit_code) {
-//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
-//   }
-//
-//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
-//
-// On the regular expressions used in death tests:
-//
-//   GOOGLETEST_CM0005 DO NOT DELETE
-//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
-//   which uses the POSIX extended regex syntax.
-//
-//   On other platforms (e.g. Windows or Mac), we only support a simple regex
-//   syntax implemented as part of Google Test.  This limited
-//   implementation should be enough most of the time when writing
-//   death tests; though it lacks many features you can find in PCRE
-//   or POSIX extended regex syntax.  For example, we don't support
-//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
-//   repetition count ("x{5,7}"), among others.
-//
-//   Below is the syntax that we do support.  We chose it to be a
-//   subset of both PCRE and POSIX extended regex, so it's easy to
-//   learn wherever you come from.  In the following: 'A' denotes a
-//   literal character, period (.), or a single \\ escape sequence;
-//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
-//   natural numbers.
-//
-//     c     matches any literal character c
-//     \\d   matches any decimal digit
-//     \\D   matches any character that's not a decimal digit
-//     \\f   matches \f
-//     \\n   matches \n
-//     \\r   matches \r
-//     \\s   matches any ASCII whitespace, including \n
-//     \\S   matches any character that's not a whitespace
-//     \\t   matches \t
-//     \\v   matches \v
-//     \\w   matches any letter, _, or decimal digit
-//     \\W   matches any character that \\w doesn't match
-//     \\c   matches any literal character c, which must be a punctuation
-//     .     matches any single character except \n
-//     A?    matches 0 or 1 occurrences of A
-//     A*    matches 0 or many occurrences of A
-//     A+    matches 1 or many occurrences of A
-//     ^     matches the beginning of a string (not that of each line)
-//     $     matches the end of a string (not that of each line)
-//     xy    matches x followed by y
-//
-//   If you accidentally use PCRE or POSIX extended regex features
-//   not implemented by us, you will get a run-time failure.  In that
-//   case, please try to rewrite your regular expression within the
-//   above syntax.
-//
-//   This implementation is *not* meant to be as highly tuned or robust
-//   as a compiled regex library, but should perform well enough for a
-//   death test, which already incurs significant overhead by launching
-//   a child process.
-//
-// Known caveats:
-//
-//   A "threadsafe" style death test obtains the path to the test
-//   program from argv[0] and re-executes it in the sub-process.  For
-//   simplicity, the current implementation doesn't search the PATH
-//   when launching the sub-process.  This means that the user must
-//   invoke the test program via a path that contains at least one
-//   path separator (e.g. path/to/foo_test and
-//   /absolute/path/to/bar_test are fine, but foo_test is not).  This
-//   is rarely a problem as people usually don't put the test binary
-//   directory in PATH.
-//
-
-// Asserts that a given statement causes the program to exit, with an
-// integer exit status that satisfies predicate, and emitting error output
-// that matches regex.
-# define ASSERT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
-
-// Like ASSERT_EXIT, but continues on to successive tests in the
-// test suite, if any:
-# define EXPECT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
-
-// Asserts that a given statement causes the program to exit, either by
-// explicitly exiting with a nonzero exit code or being killed by a
-// signal, and emitting error output that matches regex.
-# define ASSERT_DEATH(statement, regex) \
-    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
-
-// Like ASSERT_DEATH, but continues on to successive tests in the
-// test suite, if any:
-# define EXPECT_DEATH(statement, regex) \
-    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
-
-// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
-
-// Tests that an exit code describes a normal exit with a given exit code.
-class GTEST_API_ ExitedWithCode {
- public:
-  explicit ExitedWithCode(int exit_code);
-  bool operator()(int exit_status) const;
- private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ExitedWithCode& other);
-
-  const int exit_code_;
-};
-
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
-// Tests that an exit code describes an exit due to termination by a
-// given signal.
-// GOOGLETEST_CM0006 DO NOT DELETE
-class GTEST_API_ KilledBySignal {
- public:
-  explicit KilledBySignal(int signum);
-  bool operator()(int exit_status) const;
- private:
-  const int signum_;
-};
-# endif  // !GTEST_OS_WINDOWS
-
-// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
-// The death testing framework causes this to have interesting semantics,
-// since the sideeffects of the call are only visible in opt mode, and not
-// in debug mode.
-//
-// In practice, this can be used to test functions that utilize the
-// LOG(DFATAL) macro using the following style:
-//
-// int DieInDebugOr12(int* sideeffect) {
-//   if (sideeffect) {
-//     *sideeffect = 12;
-//   }
-//   LOG(DFATAL) << "death";
-//   return 12;
-// }
-//
-// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) {
-//   int sideeffect = 0;
-//   // Only asserts in dbg.
-//   EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
-//
-// #ifdef NDEBUG
-//   // opt-mode has sideeffect visible.
-//   EXPECT_EQ(12, sideeffect);
-// #else
-//   // dbg-mode no visible sideeffect.
-//   EXPECT_EQ(0, sideeffect);
-// #endif
-// }
-//
-// This will assert that DieInDebugReturn12InOpt() crashes in debug
-// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
-// appropriate fallback value (12 in this case) in opt mode. If you
-// need to test that a function has appropriate side-effects in opt
-// mode, include assertions against the side-effects.  A general
-// pattern for this is:
-//
-// EXPECT_DEBUG_DEATH({
-//   // Side-effects here will have an effect after this statement in
-//   // opt mode, but none in debug mode.
-//   EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
-// }, "death");
-//
-# ifdef NDEBUG
-
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
-  GTEST_EXECUTE_STATEMENT_(statement, regex)
-
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
-  GTEST_EXECUTE_STATEMENT_(statement, regex)
-
-# else
-
-#  define EXPECT_DEBUG_DEATH(statement, regex) \
-  EXPECT_DEATH(statement, regex)
-
-#  define ASSERT_DEBUG_DEATH(statement, regex) \
-  ASSERT_DEATH(statement, regex)
-
-# endif  // NDEBUG for EXPECT_DEBUG_DEATH
-#endif  // GTEST_HAS_DEATH_TEST
-
-// This macro is used for implementing macros such as
-// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
-// death tests are not supported. Those macros must compile on such systems
-// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
-// systems that support death tests. This allows one to write such a macro
-// on a system that does not support death tests and be sure that it will
-// compile on a death-test supporting system. It is exposed publicly so that
-// systems that have death-tests with stricter requirements than
-// GTEST_HAS_DEATH_TEST can write their own equivalent of
-// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED.
-//
-// Parameters:
-//   statement -  A statement that a macro such as EXPECT_DEATH would test
-//                for program termination. This macro has to make sure this
-//                statement is compiled but not executed, to ensure that
-//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
-//                parameter iff EXPECT_DEATH compiles with it.
-//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
-//                the output of statement.  This parameter has to be
-//                compiled but not evaluated by this macro, to ensure that
-//                this macro only accepts expressions that a macro such as
-//                EXPECT_DEATH would accept.
-//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
-//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
-//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
-//                compile inside functions where ASSERT_DEATH doesn't
-//                compile.
-//
-//  The branch that has an always false condition is used to ensure that
-//  statement and regex are compiled (and thus syntactically correct) but
-//  never executed. The unreachable code macro protects the terminator
-//  statement from generating an 'unreachable code' warning in case
-//  statement unconditionally returns or throws. The Message constructor at
-//  the end allows the syntax of streaming additional messages into the
-//  macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::AlwaysTrue()) { \
-      GTEST_LOG_(WARNING) \
-          << "Death tests are not supported on this platform.\n" \
-          << "Statement '" #statement "' cannot be verified."; \
-    } else if (::testing::internal::AlwaysFalse()) { \
-      ::testing::internal::RE::PartialMatch(".*", (regex)); \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-      terminator; \
-    } else \
-      ::testing::Message()
-
-// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
-// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
-// death tests are supported; otherwise they just issue a warning.  This is
-// useful when you are combining death test assertions with normal test
-// assertions in one test.
-#if GTEST_HAS_DEATH_TEST
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    EXPECT_DEATH(statement, regex)
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    ASSERT_DEATH(statement, regex)
-#else
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
-    GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
-#endif
-
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/deps/googletest/include/gtest/gtest-matchers.h b/deps/googletest/include/gtest/gtest-matchers.h
deleted file mode 100644
index 6e73ba143..000000000
--- a/deps/googletest/include/gtest/gtest-matchers.h
+++ /dev/null
@@ -1,748 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This file implements just enough of the matcher interface to allow
-// EXPECT_DEATH and friends to accept a matcher argument.
-
-// IWYU pragma: private, include "testing/base/public/gunit.h"
-// IWYU pragma: friend third_party/googletest/googlemock/.*
-// IWYU pragma: friend third_party/googletest/googletest/.*
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
-
-#include <memory>
-#include <ostream>
-#include <string>
-
-#include "gtest/gtest-printers.h"
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
-
-// MSVC warning C5046 is new as of VS2017 version 15.8.
-#if defined(_MSC_VER) && _MSC_VER >= 1915
-#define GTEST_MAYBE_5046_ 5046
-#else
-#define GTEST_MAYBE_5046_
-#endif
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(
-    4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by
-                              clients of class B */
-    /* Symbol involving type with internal linkage not defined */)
-
-namespace testing {
-
-// To implement a matcher Foo for type T, define:
-//   1. a class FooMatcherImpl that implements the
-//      MatcherInterface<T> interface, and
-//   2. a factory function that creates a Matcher<T> object from a
-//      FooMatcherImpl*.
-//
-// The two-level delegation design makes it possible to allow a user
-// to write "v" instead of "Eq(v)" where a Matcher is expected, which
-// is impossible if we pass matchers by pointers.  It also eases
-// ownership management as Matcher objects can now be copied like
-// plain values.
-
-// MatchResultListener is an abstract class.  Its << operator can be
-// used by a matcher to explain why a value matches or doesn't match.
-//
-class MatchResultListener {
- public:
-  // Creates a listener object with the given underlying ostream.  The
-  // listener does not own the ostream, and does not dereference it
-  // in the constructor or destructor.
-  explicit MatchResultListener(::std::ostream* os) : stream_(os) {}
-  virtual ~MatchResultListener() = 0;  // Makes this class abstract.
-
-  // Streams x to the underlying ostream; does nothing if the ostream
-  // is NULL.
-  template <typename T>
-  MatchResultListener& operator<<(const T& x) {
-    if (stream_ != nullptr) *stream_ << x;
-    return *this;
-  }
-
-  // Returns the underlying ostream.
-  ::std::ostream* stream() { return stream_; }
-
-  // Returns true iff the listener is interested in an explanation of
-  // the match result.  A matcher's MatchAndExplain() method can use
-  // this information to avoid generating the explanation when no one
-  // intends to hear it.
-  bool IsInterested() const { return stream_ != nullptr; }
-
- private:
-  ::std::ostream* const stream_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
-};
-
-inline MatchResultListener::~MatchResultListener() {
-}
-
-// An instance of a subclass of this knows how to describe itself as a
-// matcher.
-class MatcherDescriberInterface {
- public:
-  virtual ~MatcherDescriberInterface() {}
-
-  // Describes this matcher to an ostream.  The function should print
-  // a verb phrase that describes the property a value matching this
-  // matcher should have.  The subject of the verb phrase is the value
-  // being matched.  For example, the DescribeTo() method of the Gt(7)
-  // matcher prints "is greater than 7".
-  virtual void DescribeTo(::std::ostream* os) const = 0;
-
-  // Describes the negation of this matcher to an ostream.  For
-  // example, if the description of this matcher is "is greater than
-  // 7", the negated description could be "is not greater than 7".
-  // You are not required to override this when implementing
-  // MatcherInterface, but it is highly advised so that your matcher
-  // can produce good error messages.
-  virtual void DescribeNegationTo(::std::ostream* os) const {
-    *os << "not (";
-    DescribeTo(os);
-    *os << ")";
-  }
-};
-
-// The implementation of a matcher.
-template <typename T>
-class MatcherInterface : public MatcherDescriberInterface {
- public:
-  // Returns true iff the matcher matches x; also explains the match
-  // result to 'listener' if necessary (see the next paragraph), in
-  // the form of a non-restrictive relative clause ("which ...",
-  // "whose ...", etc) that describes x.  For example, the
-  // MatchAndExplain() method of the Pointee(...) matcher should
-  // generate an explanation like "which points to ...".
-  //
-  // Implementations of MatchAndExplain() should add an explanation of
-  // the match result *if and only if* they can provide additional
-  // information that's not already present (or not obvious) in the
-  // print-out of x and the matcher's description.  Whether the match
-  // succeeds is not a factor in deciding whether an explanation is
-  // needed, as sometimes the caller needs to print a failure message
-  // when the match succeeds (e.g. when the matcher is used inside
-  // Not()).
-  //
-  // For example, a "has at least 10 elements" matcher should explain
-  // what the actual element count is, regardless of the match result,
-  // as it is useful information to the reader; on the other hand, an
-  // "is empty" matcher probably only needs to explain what the actual
-  // size is when the match fails, as it's redundant to say that the
-  // size is 0 when the value is already known to be empty.
-  //
-  // You should override this method when defining a new matcher.
-  //
-  // It's the responsibility of the caller (Google Test) to guarantee
-  // that 'listener' is not NULL.  This helps to simplify a matcher's
-  // implementation when it doesn't care about the performance, as it
-  // can talk to 'listener' without checking its validity first.
-  // However, in order to implement dummy listeners efficiently,
-  // listener->stream() may be NULL.
-  virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0;
-
-  // Inherits these methods from MatcherDescriberInterface:
-  //   virtual void DescribeTo(::std::ostream* os) const = 0;
-  //   virtual void DescribeNegationTo(::std::ostream* os) const;
-};
-
-namespace internal {
-
-// Converts a MatcherInterface<T> to a MatcherInterface<const T&>.
-template <typename T>
-class MatcherInterfaceAdapter : public MatcherInterface<const T&> {
- public:
-  explicit MatcherInterfaceAdapter(const MatcherInterface<T>* impl)
-      : impl_(impl) {}
-  ~MatcherInterfaceAdapter() override { delete impl_; }
-
-  void DescribeTo(::std::ostream* os) const override { impl_->DescribeTo(os); }
-
-  void DescribeNegationTo(::std::ostream* os) const override {
-    impl_->DescribeNegationTo(os);
-  }
-
-  bool MatchAndExplain(const T& x,
-                       MatchResultListener* listener) const override {
-    return impl_->MatchAndExplain(x, listener);
-  }
-
- private:
-  const MatcherInterface<T>* const impl_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter);
-};
-
-struct AnyEq {
-  template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a == b; }
-};
-struct AnyNe {
-  template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a != b; }
-};
-struct AnyLt {
-  template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a < b; }
-};
-struct AnyGt {
-  template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a > b; }
-};
-struct AnyLe {
-  template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a <= b; }
-};
-struct AnyGe {
-  template <typename A, typename B>
-  bool operator()(const A& a, const B& b) const { return a >= b; }
-};
-
-// A match result listener that ignores the explanation.
-class DummyMatchResultListener : public MatchResultListener {
- public:
-  DummyMatchResultListener() : MatchResultListener(nullptr) {}
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
-};
-
-// A match result listener that forwards the explanation to a given
-// ostream.  The difference between this and MatchResultListener is
-// that the former is concrete.
-class StreamMatchResultListener : public MatchResultListener {
- public:
-  explicit StreamMatchResultListener(::std::ostream* os)
-      : MatchResultListener(os) {}
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
-};
-
-// An internal class for implementing Matcher<T>, which will derive
-// from it.  We put functionalities common to all Matcher<T>
-// specializations here to avoid code duplication.
-template <typename T>
-class MatcherBase {
- public:
-  // Returns true iff the matcher matches x; also explains the match
-  // result to 'listener'.
-  bool MatchAndExplain(const T& x, MatchResultListener* listener) const {
-    return impl_->MatchAndExplain(x, listener);
-  }
-
-  // Returns true iff this matcher matches x.
-  bool Matches(const T& x) const {
-    DummyMatchResultListener dummy;
-    return MatchAndExplain(x, &dummy);
-  }
-
-  // Describes this matcher to an ostream.
-  void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); }
-
-  // Describes the negation of this matcher to an ostream.
-  void DescribeNegationTo(::std::ostream* os) const {
-    impl_->DescribeNegationTo(os);
-  }
-
-  // Explains why x matches, or doesn't match, the matcher.
-  void ExplainMatchResultTo(const T& x, ::std::ostream* os) const {
-    StreamMatchResultListener listener(os);
-    MatchAndExplain(x, &listener);
-  }
-
-  // Returns the describer for this matcher object; retains ownership
-  // of the describer, which is only guaranteed to be alive when
-  // this matcher object is alive.
-  const MatcherDescriberInterface* GetDescriber() const {
-    return impl_.get();
-  }
-
- protected:
-  MatcherBase() {}
-
-  // Constructs a matcher from its implementation.
-  explicit MatcherBase(const MatcherInterface<const T&>* impl) : impl_(impl) {}
-
-  template <typename U>
-  explicit MatcherBase(
-      const MatcherInterface<U>* impl,
-      typename internal::EnableIf<
-          !internal::IsSame<U, const U&>::value>::type* = nullptr)
-      : impl_(new internal::MatcherInterfaceAdapter<U>(impl)) {}
-
-  MatcherBase(const MatcherBase&) = default;
-  MatcherBase& operator=(const MatcherBase&) = default;
-  MatcherBase(MatcherBase&&) = default;
-  MatcherBase& operator=(MatcherBase&&) = default;
-
-  virtual ~MatcherBase() {}
-
- private:
-  std::shared_ptr<const MatcherInterface<const T&>> impl_;
-};
-
-}  // namespace internal
-
-// A Matcher<T> is a copyable and IMMUTABLE (except by assignment)
-// object that can check whether a value of type T matches.  The
-// implementation of Matcher<T> is just a std::shared_ptr to const
-// MatcherInterface<T>.  Don't inherit from Matcher!
-template <typename T>
-class Matcher : public internal::MatcherBase<T> {
- public:
-  // Constructs a null matcher.  Needed for storing Matcher objects in STL
-  // containers.  A default-constructed matcher is not yet initialized.  You
-  // cannot use it until a valid value has been assigned to it.
-  explicit Matcher() {}  // NOLINT
-
-  // Constructs a matcher from its implementation.
-  explicit Matcher(const MatcherInterface<const T&>* impl)
-      : internal::MatcherBase<T>(impl) {}
-
-  template <typename U>
-  explicit Matcher(const MatcherInterface<U>* impl,
-                   typename internal::EnableIf<
-                       !internal::IsSame<U, const U&>::value>::type* = nullptr)
-      : internal::MatcherBase<T>(impl) {}
-
-  // Implicit constructor here allows people to write
-  // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
-  Matcher(T value);  // NOLINT
-};
-
-// The following two specializations allow the user to write str
-// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string
-// matcher is expected.
-template <>
-class GTEST_API_ Matcher<const std::string&>
-    : public internal::MatcherBase<const std::string&> {
- public:
-  Matcher() {}
-
-  explicit Matcher(const MatcherInterface<const std::string&>* impl)
-      : internal::MatcherBase<const std::string&>(impl) {}
-
-  // Allows the user to write str instead of Eq(str) sometimes, where
-  // str is a std::string object.
-  Matcher(const std::string& s);  // NOLINT
-
-  // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char* s);  // NOLINT
-};
-
-template <>
-class GTEST_API_ Matcher<std::string>
-    : public internal::MatcherBase<std::string> {
- public:
-  Matcher() {}
-
-  explicit Matcher(const MatcherInterface<const std::string&>* impl)
-      : internal::MatcherBase<std::string>(impl) {}
-  explicit Matcher(const MatcherInterface<std::string>* impl)
-      : internal::MatcherBase<std::string>(impl) {}
-
-  // Allows the user to write str instead of Eq(str) sometimes, where
-  // str is a string object.
-  Matcher(const std::string& s);  // NOLINT
-
-  // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char* s);  // NOLINT
-};
-
-#if GTEST_HAS_ABSL
-// The following two specializations allow the user to write str
-// instead of Eq(str) and "foo" instead of Eq("foo") when a absl::string_view
-// matcher is expected.
-template <>
-class GTEST_API_ Matcher<const absl::string_view&>
-    : public internal::MatcherBase<const absl::string_view&> {
- public:
-  Matcher() {}
-
-  explicit Matcher(const MatcherInterface<const absl::string_view&>* impl)
-      : internal::MatcherBase<const absl::string_view&>(impl) {}
-
-  // Allows the user to write str instead of Eq(str) sometimes, where
-  // str is a std::string object.
-  Matcher(const std::string& s);  // NOLINT
-
-  // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char* s);  // NOLINT
-
-  // Allows the user to pass absl::string_views directly.
-  Matcher(absl::string_view s);  // NOLINT
-};
-
-template <>
-class GTEST_API_ Matcher<absl::string_view>
-    : public internal::MatcherBase<absl::string_view> {
- public:
-  Matcher() {}
-
-  explicit Matcher(const MatcherInterface<const absl::string_view&>* impl)
-      : internal::MatcherBase<absl::string_view>(impl) {}
-  explicit Matcher(const MatcherInterface<absl::string_view>* impl)
-      : internal::MatcherBase<absl::string_view>(impl) {}
-
-  // Allows the user to write str instead of Eq(str) sometimes, where
-  // str is a std::string object.
-  Matcher(const std::string& s);  // NOLINT
-
-  // Allows the user to write "foo" instead of Eq("foo") sometimes.
-  Matcher(const char* s);  // NOLINT
-
-  // Allows the user to pass absl::string_views directly.
-  Matcher(absl::string_view s);  // NOLINT
-};
-#endif  // GTEST_HAS_ABSL
-
-// Prints a matcher in a human-readable format.
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const Matcher<T>& matcher) {
-  matcher.DescribeTo(&os);
-  return os;
-}
-
-// The PolymorphicMatcher class template makes it easy to implement a
-// polymorphic matcher (i.e. a matcher that can match values of more
-// than one type, e.g. Eq(n) and NotNull()).
-//
-// To define a polymorphic matcher, a user should provide an Impl
-// class that has a DescribeTo() method and a DescribeNegationTo()
-// method, and define a member function (or member function template)
-//
-//   bool MatchAndExplain(const Value& value,
-//                        MatchResultListener* listener) const;
-//
-// See the definition of NotNull() for a complete example.
-template <class Impl>
-class PolymorphicMatcher {
- public:
-  explicit PolymorphicMatcher(const Impl& an_impl) : impl_(an_impl) {}
-
-  // Returns a mutable reference to the underlying matcher
-  // implementation object.
-  Impl& mutable_impl() { return impl_; }
-
-  // Returns an immutable reference to the underlying matcher
-  // implementation object.
-  const Impl& impl() const { return impl_; }
-
-  template <typename T>
-  operator Matcher<T>() const {
-    return Matcher<T>(new MonomorphicImpl<const T&>(impl_));
-  }
-
- private:
-  template <typename T>
-  class MonomorphicImpl : public MatcherInterface<T> {
-   public:
-    explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {}
-
-    virtual void DescribeTo(::std::ostream* os) const { impl_.DescribeTo(os); }
-
-    virtual void DescribeNegationTo(::std::ostream* os) const {
-      impl_.DescribeNegationTo(os);
-    }
-
-    virtual bool MatchAndExplain(T x, MatchResultListener* listener) const {
-      return impl_.MatchAndExplain(x, listener);
-    }
-
-   private:
-    const Impl impl_;
-  };
-
-  Impl impl_;
-};
-
-// Creates a matcher from its implementation.
-// DEPRECATED: Especially in the generic code, prefer:
-//   Matcher<T>(new MyMatcherImpl<const T&>(...));
-//
-// MakeMatcher may create a Matcher that accepts its argument by value, which
-// leads to unnecessary copies & lack of support for non-copyable types.
-template <typename T>
-inline Matcher<T> MakeMatcher(const MatcherInterface<T>* impl) {
-  return Matcher<T>(impl);
-}
-
-// Creates a polymorphic matcher from its implementation.  This is
-// easier to use than the PolymorphicMatcher<Impl> constructor as it
-// doesn't require you to explicitly write the template argument, e.g.
-//
-//   MakePolymorphicMatcher(foo);
-// vs
-//   PolymorphicMatcher<TypeOfFoo>(foo);
-template <class Impl>
-inline PolymorphicMatcher<Impl> MakePolymorphicMatcher(const Impl& impl) {
-  return PolymorphicMatcher<Impl>(impl);
-}
-
-namespace internal {
-// Implements a matcher that compares a given value with a
-// pre-supplied value using one of the ==, <=, <, etc, operators.  The
-// two values being compared don't have to have the same type.
-//
-// The matcher defined here is polymorphic (for example, Eq(5) can be
-// used to match an int, a short, a double, etc).  Therefore we use
-// a template type conversion operator in the implementation.
-//
-// The following template definition assumes that the Rhs parameter is
-// a "bare" type (i.e. neither 'const T' nor 'T&').
-template <typename D, typename Rhs, typename Op>
-class ComparisonBase {
- public:
-  explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {}
-  template <typename Lhs>
-  operator Matcher<Lhs>() const {
-    return Matcher<Lhs>(new Impl<const Lhs&>(rhs_));
-  }
-
- private:
-  template <typename T>
-  static const T& Unwrap(const T& v) { return v; }
-  template <typename T>
-  static const T& Unwrap(std::reference_wrapper<T> v) { return v; }
-
-  template <typename Lhs, typename = Rhs>
-  class Impl : public MatcherInterface<Lhs> {
-   public:
-    explicit Impl(const Rhs& rhs) : rhs_(rhs) {}
-    bool MatchAndExplain(Lhs lhs,
-                         MatchResultListener* /* listener */) const override {
-      return Op()(lhs, Unwrap(rhs_));
-    }
-    void DescribeTo(::std::ostream* os) const override {
-      *os << D::Desc() << " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
-    void DescribeNegationTo(::std::ostream* os) const override {
-      *os << D::NegatedDesc() <<  " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
-
-   private:
-    Rhs rhs_;
-  };
-  Rhs rhs_;
-};
-
-template <typename Rhs>
-class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
- public:
-  explicit EqMatcher(const Rhs& rhs)
-      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { }
-  static const char* Desc() { return "is equal to"; }
-  static const char* NegatedDesc() { return "isn't equal to"; }
-};
-template <typename Rhs>
-class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
- public:
-  explicit NeMatcher(const Rhs& rhs)
-      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { }
-  static const char* Desc() { return "isn't equal to"; }
-  static const char* NegatedDesc() { return "is equal to"; }
-};
-template <typename Rhs>
-class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
- public:
-  explicit LtMatcher(const Rhs& rhs)
-      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { }
-  static const char* Desc() { return "is <"; }
-  static const char* NegatedDesc() { return "isn't <"; }
-};
-template <typename Rhs>
-class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
- public:
-  explicit GtMatcher(const Rhs& rhs)
-      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { }
-  static const char* Desc() { return "is >"; }
-  static const char* NegatedDesc() { return "isn't >"; }
-};
-template <typename Rhs>
-class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
- public:
-  explicit LeMatcher(const Rhs& rhs)
-      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { }
-  static const char* Desc() { return "is <="; }
-  static const char* NegatedDesc() { return "isn't <="; }
-};
-template <typename Rhs>
-class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
- public:
-  explicit GeMatcher(const Rhs& rhs)
-      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { }
-  static const char* Desc() { return "is >="; }
-  static const char* NegatedDesc() { return "isn't >="; }
-};
-
-// Implements polymorphic matchers MatchesRegex(regex) and
-// ContainsRegex(regex), which can be used as a Matcher<T> as long as
-// T can be converted to a string.
-class MatchesRegexMatcher {
- public:
-  MatchesRegexMatcher(const RE* regex, bool full_match)
-      : regex_(regex), full_match_(full_match) {}
-
-#if GTEST_HAS_ABSL
-  bool MatchAndExplain(const absl::string_view& s,
-                       MatchResultListener* listener) const {
-    return MatchAndExplain(std::string(s), listener);
-  }
-#endif  // GTEST_HAS_ABSL
-
-  // Accepts pointer types, particularly:
-  //   const char*
-  //   char*
-  //   const wchar_t*
-  //   wchar_t*
-  template <typename CharType>
-  bool MatchAndExplain(CharType* s, MatchResultListener* listener) const {
-    return s != nullptr && MatchAndExplain(std::string(s), listener);
-  }
-
-  // Matches anything that can convert to std::string.
-  //
-  // This is a template, not just a plain function with const std::string&,
-  // because absl::string_view has some interfering non-explicit constructors.
-  template <class MatcheeStringType>
-  bool MatchAndExplain(const MatcheeStringType& s,
-                       MatchResultListener* /* listener */) const {
-    const std::string& s2(s);
-    return full_match_ ? RE::FullMatch(s2, *regex_)
-                       : RE::PartialMatch(s2, *regex_);
-  }
-
-  void DescribeTo(::std::ostream* os) const {
-    *os << (full_match_ ? "matches" : "contains") << " regular expression ";
-    UniversalPrinter<std::string>::Print(regex_->pattern(), os);
-  }
-
-  void DescribeNegationTo(::std::ostream* os) const {
-    *os << "doesn't " << (full_match_ ? "match" : "contain")
-        << " regular expression ";
-    UniversalPrinter<std::string>::Print(regex_->pattern(), os);
-  }
-
- private:
-  const std::shared_ptr<const RE> regex_;
-  const bool full_match_;
-};
-}  // namespace internal
-
-// Matches a string that fully matches regular expression 'regex'.
-// The matcher takes ownership of 'regex'.
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
-    const internal::RE* regex) {
-  return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true));
-}
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
-    const std::string& regex) {
-  return MatchesRegex(new internal::RE(regex));
-}
-
-// Matches a string that contains regular expression 'regex'.
-// The matcher takes ownership of 'regex'.
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
-    const internal::RE* regex) {
-  return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false));
-}
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
-    const std::string& regex) {
-  return ContainsRegex(new internal::RE(regex));
-}
-
-// Creates a polymorphic matcher that matches anything equal to x.
-// Note: if the parameter of Eq() were declared as const T&, Eq("foo")
-// wouldn't compile.
-template <typename T>
-inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); }
-
-// Constructs a Matcher<T> from a 'value' of type T.  The constructed
-// matcher matches any value that's equal to 'value'.
-template <typename T>
-Matcher<T>::Matcher(T value) { *this = Eq(value); }
-
-// Creates a monomorphic matcher that matches anything with type Lhs
-// and equal to rhs.  A user may need to use this instead of Eq(...)
-// in order to resolve an overloading ambiguity.
-//
-// TypedEq<T>(x) is just a convenient short-hand for Matcher<T>(Eq(x))
-// or Matcher<T>(x), but more readable than the latter.
-//
-// We could define similar monomorphic matchers for other comparison
-// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do
-// it yet as those are used much less than Eq() in practice.  A user
-// can always write Matcher<T>(Lt(5)) to be explicit about the type,
-// for example.
-template <typename Lhs, typename Rhs>
-inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); }
-
-// Creates a polymorphic matcher that matches anything >= x.
-template <typename Rhs>
-inline internal::GeMatcher<Rhs> Ge(Rhs x) {
-  return internal::GeMatcher<Rhs>(x);
-}
-
-// Creates a polymorphic matcher that matches anything > x.
-template <typename Rhs>
-inline internal::GtMatcher<Rhs> Gt(Rhs x) {
-  return internal::GtMatcher<Rhs>(x);
-}
-
-// Creates a polymorphic matcher that matches anything <= x.
-template <typename Rhs>
-inline internal::LeMatcher<Rhs> Le(Rhs x) {
-  return internal::LeMatcher<Rhs>(x);
-}
-
-// Creates a polymorphic matcher that matches anything < x.
-template <typename Rhs>
-inline internal::LtMatcher<Rhs> Lt(Rhs x) {
-  return internal::LtMatcher<Rhs>(x);
-}
-
-// Creates a polymorphic matcher that matches anything != x.
-template <typename Rhs>
-inline internal::NeMatcher<Rhs> Ne(Rhs x) {
-  return internal::NeMatcher<Rhs>(x);
-}
-}  // namespace testing
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251 5046
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
diff --git a/deps/googletest/include/gtest/gtest-message.h b/deps/googletest/include/gtest/gtest-message.h
deleted file mode 100644
index 4a80e11e6..000000000
--- a/deps/googletest/include/gtest/gtest-message.h
+++ /dev/null
@@ -1,218 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This header file defines the Message class.
-//
-// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
-// leave some internal implementation details in this header file.
-// They are clearly marked by comments like this:
-//
-//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-//
-// Such code is NOT meant to be used by a user directly, and is subject
-// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
-// program!
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-
-#include <limits>
-#include <memory>
-
-#include "gtest/internal/gtest-port.h"
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-// Ensures that there is at least one operator<< in the global namespace.
-// See Message& operator<<(...) below for why.
-void operator<<(const testing::internal::Secret&, int);
-
-namespace testing {
-
-// The Message class works like an ostream repeater.
-//
-// Typical usage:
-//
-//   1. You stream a bunch of values to a Message object.
-//      It will remember the text in a stringstream.
-//   2. Then you stream the Message object to an ostream.
-//      This causes the text in the Message to be streamed
-//      to the ostream.
-//
-// For example;
-//
-//   testing::Message foo;
-//   foo << 1 << " != " << 2;
-//   std::cout << foo;
-//
-// will print "1 != 2".
-//
-// Message is not intended to be inherited from.  In particular, its
-// destructor is not virtual.
-//
-// Note that stringstream behaves differently in gcc and in MSVC.  You
-// can stream a NULL char pointer to it in the former, but not in the
-// latter (it causes an access violation if you do).  The Message
-// class hides this difference by treating a NULL char pointer as
-// "(null)".
-class GTEST_API_ Message {
- private:
-  // The type of basic IO manipulators (endl, ends, and flush) for
-  // narrow streams.
-  typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
-
- public:
-  // Constructs an empty Message.
-  Message();
-
-  // Copy constructor.
-  Message(const Message& msg) : ss_(new ::std::stringstream) {  // NOLINT
-    *ss_ << msg.GetString();
-  }
-
-  // Constructs a Message from a C-string.
-  explicit Message(const char* str) : ss_(new ::std::stringstream) {
-    *ss_ << str;
-  }
-
-  // Streams a non-pointer value to this object.
-  template <typename T>
-  inline Message& operator <<(const T& val) {
-    // Some libraries overload << for STL containers.  These
-    // overloads are defined in the global namespace instead of ::std.
-    //
-    // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
-    // overloads are visible in either the std namespace or the global
-    // namespace, but not other namespaces, including the testing
-    // namespace which Google Test's Message class is in.
-    //
-    // To allow STL containers (and other types that has a << operator
-    // defined in the global namespace) to be used in Google Test
-    // assertions, testing::Message must access the custom << operator
-    // from the global namespace.  With this using declaration,
-    // overloads of << defined in the global namespace and those
-    // visible via Koenig lookup are both exposed in this function.
-    using ::operator <<;
-    *ss_ << val;
-    return *this;
-  }
-
-  // Streams a pointer value to this object.
-  //
-  // This function is an overload of the previous one.  When you
-  // stream a pointer to a Message, this definition will be used as it
-  // is more specialized.  (The C++ Standard, section
-  // [temp.func.order].)  If you stream a non-pointer, then the
-  // previous definition will be used.
-  //
-  // The reason for this overload is that streaming a NULL pointer to
-  // ostream is undefined behavior.  Depending on the compiler, you
-  // may get "0", "(nil)", "(null)", or an access violation.  To
-  // ensure consistent result across compilers, we always treat NULL
-  // as "(null)".
-  template <typename T>
-  inline Message& operator <<(T* const& pointer) {  // NOLINT
-    if (pointer == nullptr) {
-      *ss_ << "(null)";
-    } else {
-      *ss_ << pointer;
-    }
-    return *this;
-  }
-
-  // Since the basic IO manipulators are overloaded for both narrow
-  // and wide streams, we have to provide this specialized definition
-  // of operator <<, even though its body is the same as the
-  // templatized version above.  Without this definition, streaming
-  // endl or other basic IO manipulators to Message will confuse the
-  // compiler.
-  Message& operator <<(BasicNarrowIoManip val) {
-    *ss_ << val;
-    return *this;
-  }
-
-  // Instead of 1/0, we want to see true/false for bool values.
-  Message& operator <<(bool b) {
-    return *this << (b ? "true" : "false");
-  }
-
-  // These two overloads allow streaming a wide C string to a Message
-  // using the UTF-8 encoding.
-  Message& operator <<(const wchar_t* wide_c_str);
-  Message& operator <<(wchar_t* wide_c_str);
-
-#if GTEST_HAS_STD_WSTRING
-  // Converts the given wide string to a narrow string using the UTF-8
-  // encoding, and streams the result to this Message object.
-  Message& operator <<(const ::std::wstring& wstr);
-#endif  // GTEST_HAS_STD_WSTRING
-
-  // Gets the text streamed to this object so far as an std::string.
-  // Each '\0' character in the buffer is replaced with "\\0".
-  //
-  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-  std::string GetString() const;
-
- private:
-  // We'll hold the text streamed to this object here.
-  const std::unique_ptr< ::std::stringstream> ss_;
-
-  // We declare (but don't implement) this to prevent the compiler
-  // from implementing the assignment operator.
-  void operator=(const Message&);
-};
-
-// Streams a Message to an ostream.
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
-  return os << sb.GetString();
-}
-
-namespace internal {
-
-// Converts a streamable value to an std::string.  A NULL pointer is
-// converted to "(null)".  When the input value is a ::string,
-// ::std::string, ::wstring, or ::std::wstring object, each NUL
-// character in it is replaced with "\\0".
-template <typename T>
-std::string StreamableToString(const T& streamable) {
-  return (Message() << streamable).GetString();
-}
-
-}  // namespace internal
-}  // namespace testing
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/deps/googletest/include/gtest/gtest-param-test.h b/deps/googletest/include/gtest/gtest-param-test.h
deleted file mode 100644
index d7c9dd8c7..000000000
--- a/deps/googletest/include/gtest/gtest-param-test.h
+++ /dev/null
@@ -1,503 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Macros and functions for implementing parameterized tests
-// in Google C++ Testing and Mocking Framework (Google Test)
-//
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-
-
-// Value-parameterized tests allow you to test your code with different
-// parameters without writing multiple copies of the same test.
-//
-// Here is how you use value-parameterized tests:
-
-#if 0
-
-// To write value-parameterized tests, first you should define a fixture
-// class. It is usually derived from testing::TestWithParam<T> (see below for
-// another inheritance scheme that's sometimes useful in more complicated
-// class hierarchies), where the type of your parameter values.
-// TestWithParam<T> is itself derived from testing::Test. T can be any
-// copyable type. If it's a raw pointer, you are responsible for managing the
-// lifespan of the pointed values.
-
-class FooTest : public ::testing::TestWithParam<const char*> {
-  // You can implement all the usual class fixture members here.
-};
-
-// Then, use the TEST_P macro to define as many parameterized tests
-// for this fixture as you want. The _P suffix is for "parameterized"
-// or "pattern", whichever you prefer to think.
-
-TEST_P(FooTest, DoesBlah) {
-  // Inside a test, access the test parameter with the GetParam() method
-  // of the TestWithParam<T> class:
-  EXPECT_TRUE(foo.Blah(GetParam()));
-  ...
-}
-
-TEST_P(FooTest, HasBlahBlah) {
-  ...
-}
-
-// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test
-// case with any set of parameters you want. Google Test defines a number
-// of functions for generating test parameters. They return what we call
-// (surprise!) parameter generators. Here is a summary of them, which
-// are all in the testing namespace:
-//
-//
-//  Range(begin, end [, step]) - Yields values {begin, begin+step,
-//                               begin+step+step, ...}. The values do not
-//                               include end. step defaults to 1.
-//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
-//  ValuesIn(container)        - Yields values from a C-style array, an STL
-//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
-//  Bool()                     - Yields sequence {false, true}.
-//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
-//                               for the math savvy) of the values generated
-//                               by the N generators.
-//
-// For more details, see comments at the definitions of these functions below
-// in this file.
-//
-// The following statement will instantiate tests from the FooTest test suite
-// each with parameter values "meeny", "miny", and "moe".
-
-INSTANTIATE_TEST_SUITE_P(InstantiationName,
-                         FooTest,
-                         Values("meeny", "miny", "moe"));
-
-// To distinguish different instances of the pattern, (yes, you
-// can instantiate it more than once) the first argument to the
-// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the
-// actual test suite name. Remember to pick unique prefixes for different
-// instantiations. The tests from the instantiation above will have
-// these names:
-//
-//    * InstantiationName/FooTest.DoesBlah/0 for "meeny"
-//    * InstantiationName/FooTest.DoesBlah/1 for "miny"
-//    * InstantiationName/FooTest.DoesBlah/2 for "moe"
-//    * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
-//    * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
-//    * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
-//
-// You can use these names in --gtest_filter.
-//
-// This statement will instantiate all tests from FooTest again, each
-// with parameter values "cat" and "dog":
-
-const char* pets[] = {"cat", "dog"};
-INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
-
-// The tests from the instantiation above will have these names:
-//
-//    * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
-//    * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
-//    * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
-//    * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
-//
-// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests
-// in the given test suite, whether their definitions come before or
-// AFTER the INSTANTIATE_TEST_SUITE_P statement.
-//
-// Please also note that generator expressions (including parameters to the
-// generators) are evaluated in InitGoogleTest(), after main() has started.
-// This allows the user on one hand, to adjust generator parameters in order
-// to dynamically determine a set of tests to run and on the other hand,
-// give the user a chance to inspect the generated tests with Google Test
-// reflection API before RUN_ALL_TESTS() is executed.
-//
-// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
-// for more examples.
-//
-// In the future, we plan to publish the API for defining new parameter
-// generators. But for now this interface remains part of the internal
-// implementation and is subject to change.
-//
-//
-// A parameterized test fixture must be derived from testing::Test and from
-// testing::WithParamInterface<T>, where T is the type of the parameter
-// values. Inheriting from TestWithParam<T> satisfies that requirement because
-// TestWithParam<T> inherits from both Test and WithParamInterface. In more
-// complicated hierarchies, however, it is occasionally useful to inherit
-// separately from Test and WithParamInterface. For example:
-
-class BaseTest : public ::testing::Test {
-  // You can inherit all the usual members for a non-parameterized test
-  // fixture here.
-};
-
-class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
-  // The usual test fixture members go here too.
-};
-
-TEST_F(BaseTest, HasFoo) {
-  // This is an ordinary non-parameterized test.
-}
-
-TEST_P(DerivedTest, DoesBlah) {
-  // GetParam works just the same here as if you inherit from TestWithParam.
-  EXPECT_TRUE(foo.Blah(GetParam()));
-}
-
-#endif  // 0
-
-#include <utility>
-
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-param-util.h"
-#include "gtest/internal/gtest-port.h"
-
-namespace testing {
-
-// Functions producing parameter generators.
-//
-// Google Test uses these generators to produce parameters for value-
-// parameterized tests. When a parameterized test suite is instantiated
-// with a particular generator, Google Test creates and runs tests
-// for each element in the sequence produced by the generator.
-//
-// In the following sample, tests from test suite FooTest are instantiated
-// each three times with parameter values 3, 5, and 8:
-//
-// class FooTest : public TestWithParam<int> { ... };
-//
-// TEST_P(FooTest, TestThis) {
-// }
-// TEST_P(FooTest, TestThat) {
-// }
-// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8));
-//
-
-// Range() returns generators providing sequences of values in a range.
-//
-// Synopsis:
-// Range(start, end)
-//   - returns a generator producing a sequence of values {start, start+1,
-//     start+2, ..., }.
-// Range(start, end, step)
-//   - returns a generator producing a sequence of values {start, start+step,
-//     start+step+step, ..., }.
-// Notes:
-//   * The generated sequences never include end. For example, Range(1, 5)
-//     returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
-//     returns a generator producing {1, 3, 5, 7}.
-//   * start and end must have the same type. That type may be any integral or
-//     floating-point type or a user defined type satisfying these conditions:
-//     * It must be assignable (have operator=() defined).
-//     * It must have operator+() (operator+(int-compatible type) for
-//       two-operand version).
-//     * It must have operator<() defined.
-//     Elements in the resulting sequences will also have that type.
-//   * Condition start < end must be satisfied in order for resulting sequences
-//     to contain any elements.
-//
-template <typename T, typename IncrementT>
-internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
-  return internal::ParamGenerator<T>(
-      new internal::RangeGenerator<T, IncrementT>(start, end, step));
-}
-
-template <typename T>
-internal::ParamGenerator<T> Range(T start, T end) {
-  return Range(start, end, 1);
-}
-
-// ValuesIn() function allows generation of tests with parameters coming from
-// a container.
-//
-// Synopsis:
-// ValuesIn(const T (&array)[N])
-//   - returns a generator producing sequences with elements from
-//     a C-style array.
-// ValuesIn(const Container& container)
-//   - returns a generator producing sequences with elements from
-//     an STL-style container.
-// ValuesIn(Iterator begin, Iterator end)
-//   - returns a generator producing sequences with elements from
-//     a range [begin, end) defined by a pair of STL-style iterators. These
-//     iterators can also be plain C pointers.
-//
-// Please note that ValuesIn copies the values from the containers
-// passed in and keeps them to generate tests in RUN_ALL_TESTS().
-//
-// Examples:
-//
-// This instantiates tests from test suite StringTest
-// each with C-string values of "foo", "bar", and "baz":
-//
-// const char* strings[] = {"foo", "bar", "baz"};
-// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings));
-//
-// This instantiates tests from test suite StlStringTest
-// each with STL strings with values "a" and "b":
-//
-// ::std::vector< ::std::string> GetParameterStrings() {
-//   ::std::vector< ::std::string> v;
-//   v.push_back("a");
-//   v.push_back("b");
-//   return v;
-// }
-//
-// INSTANTIATE_TEST_SUITE_P(CharSequence,
-//                          StlStringTest,
-//                          ValuesIn(GetParameterStrings()));
-//
-//
-// This will also instantiate tests from CharTest
-// each with parameter values 'a' and 'b':
-//
-// ::std::list<char> GetParameterChars() {
-//   ::std::list<char> list;
-//   list.push_back('a');
-//   list.push_back('b');
-//   return list;
-// }
-// ::std::list<char> l = GetParameterChars();
-// INSTANTIATE_TEST_SUITE_P(CharSequence2,
-//                          CharTest,
-//                          ValuesIn(l.begin(), l.end()));
-//
-template <typename ForwardIterator>
-internal::ParamGenerator<
-  typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
-ValuesIn(ForwardIterator begin, ForwardIterator end) {
-  typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
-      ::value_type ParamType;
-  return internal::ParamGenerator<ParamType>(
-      new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
-}
-
-template <typename T, size_t N>
-internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
-  return ValuesIn(array, array + N);
-}
-
-template <class Container>
-internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container) {
-  return ValuesIn(container.begin(), container.end());
-}
-
-// Values() allows generating tests from explicitly specified list of
-// parameters.
-//
-// Synopsis:
-// Values(T v1, T v2, ..., T vN)
-//   - returns a generator producing sequences with elements v1, v2, ..., vN.
-//
-// For example, this instantiates tests from test suite BarTest each
-// with values "one", "two", and "three":
-//
-// INSTANTIATE_TEST_SUITE_P(NumSequence,
-//                          BarTest,
-//                          Values("one", "two", "three"));
-//
-// This instantiates tests from test suite BazTest each with values 1, 2, 3.5.
-// The exact type of values will depend on the type of parameter in BazTest.
-//
-// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
-//
-//
-template <typename... T>
-internal::ValueArray<T...> Values(T... v) {
-  return internal::ValueArray<T...>(std::move(v)...);
-}
-
-// Bool() allows generating tests with parameters in a set of (false, true).
-//
-// Synopsis:
-// Bool()
-//   - returns a generator producing sequences with elements {false, true}.
-//
-// It is useful when testing code that depends on Boolean flags. Combinations
-// of multiple flags can be tested when several Bool()'s are combined using
-// Combine() function.
-//
-// In the following example all tests in the test suite FlagDependentTest
-// will be instantiated twice with parameters false and true.
-//
-// class FlagDependentTest : public testing::TestWithParam<bool> {
-//   virtual void SetUp() {
-//     external_flag = GetParam();
-//   }
-// }
-// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
-//
-inline internal::ParamGenerator<bool> Bool() {
-  return Values(false, true);
-}
-
-// Combine() allows the user to combine two or more sequences to produce
-// values of a Cartesian product of those sequences' elements.
-//
-// Synopsis:
-// Combine(gen1, gen2, ..., genN)
-//   - returns a generator producing sequences with elements coming from
-//     the Cartesian product of elements from the sequences generated by
-//     gen1, gen2, ..., genN. The sequence elements will have a type of
-//     std::tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
-//     of elements from sequences produces by gen1, gen2, ..., genN.
-//
-// Combine can have up to 10 arguments.
-//
-// Example:
-//
-// This will instantiate tests in test suite AnimalTest each one with
-// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
-// tuple("dog", BLACK), and tuple("dog", WHITE):
-//
-// enum Color { BLACK, GRAY, WHITE };
-// class AnimalTest
-//     : public testing::TestWithParam<std::tuple<const char*, Color> > {...};
-//
-// TEST_P(AnimalTest, AnimalLooksNice) {...}
-//
-// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest,
-//                          Combine(Values("cat", "dog"),
-//                                  Values(BLACK, WHITE)));
-//
-// This will instantiate tests in FlagDependentTest with all variations of two
-// Boolean flags:
-//
-// class FlagDependentTest
-//     : public testing::TestWithParam<std::tuple<bool, bool> > {
-//   virtual void SetUp() {
-//     // Assigns external_flag_1 and external_flag_2 values from the tuple.
-//     std::tie(external_flag_1, external_flag_2) = GetParam();
-//   }
-// };
-//
-// TEST_P(FlagDependentTest, TestFeature1) {
-//   // Test your code using external_flag_1 and external_flag_2 here.
-// }
-// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest,
-//                          Combine(Bool(), Bool()));
-//
-template <typename... Generator>
-internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
-  return internal::CartesianProductHolder<Generator...>(g...);
-}
-
-#define TEST_P(test_suite_name, test_name)                                     \
-  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                     \
-      : public test_suite_name {                                               \
-   public:                                                                     \
-    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                    \
-    virtual void TestBody();                                                   \
-                                                                               \
-   private:                                                                    \
-    static int AddToRegistry() {                                               \
-      ::testing::UnitTest::GetInstance()                                       \
-          ->parameterized_test_registry()                                      \
-          .GetTestSuitePatternHolder<test_suite_name>(                         \
-              #test_suite_name,                                                \
-              ::testing::internal::CodeLocation(__FILE__, __LINE__))           \
-          ->AddTestPattern(                                                    \
-              GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name),  \
-              new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
-                  test_suite_name, test_name)>());                             \
-      return 0;                                                                \
-    }                                                                          \
-    static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,    \
-                                                           test_name));        \
-  };                                                                           \
-  int GTEST_TEST_CLASS_NAME_(test_suite_name,                                  \
-                             test_name)::gtest_registering_dummy_ =            \
-      GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry();     \
-  void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
-
-// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify
-// generator and an optional function or functor that generates custom test name
-// suffixes based on the test parameters. Such a function or functor should
-// accept one argument of type testing::TestParamInfo<class ParamType>, and
-// return std::string.
-//
-// testing::PrintToStringParamName is a builtin test suffix generator that
-// returns the value of testing::PrintToString(GetParam()).
-//
-// Note: test names must be non-empty, unique, and may only contain ASCII
-// alphanumeric characters or underscore. Because PrintToString adds quotes
-// to std::string and C strings, it won't work for these types.
-
-#define GTEST_EXPAND_(arg) arg
-#define GTEST_GET_FIRST_(first, ...) first
-#define GTEST_GET_SECOND_(first, second, ...) second
-
-#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...)                \
-  static ::testing::internal::ParamGenerator<test_suite_name::ParamType>      \
-      gtest_##prefix##test_suite_name##_EvalGenerator_() {                    \
-    return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_));        \
-  }                                                                           \
-  static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_(   \
-      const ::testing::TestParamInfo<test_suite_name::ParamType>& info) {     \
-    if (::testing::internal::AlwaysFalse()) {                                 \
-      ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_(      \
-          __VA_ARGS__,                                                        \
-          ::testing::internal::DefaultParamName<test_suite_name::ParamType>,  \
-          DUMMY_PARAM_)));                                                    \
-      auto t = std::make_tuple(__VA_ARGS__);                                  \
-      static_assert(std::tuple_size<decltype(t)>::value <= 2,                 \
-                    "Too Many Args!");                                        \
-    }                                                                         \
-    return ((GTEST_EXPAND_(GTEST_GET_SECOND_(                                 \
-        __VA_ARGS__,                                                          \
-        ::testing::internal::DefaultParamName<test_suite_name::ParamType>,    \
-        DUMMY_PARAM_))))(info);                                               \
-  }                                                                           \
-  static int gtest_##prefix##test_suite_name##_dummy_                         \
-      GTEST_ATTRIBUTE_UNUSED_ =                                               \
-          ::testing::UnitTest::GetInstance()                                  \
-              ->parameterized_test_registry()                                 \
-              .GetTestSuitePatternHolder<test_suite_name>(                    \
-                  #test_suite_name,                                           \
-                  ::testing::internal::CodeLocation(__FILE__, __LINE__))      \
-              ->AddTestSuiteInstantiation(                                    \
-                  #prefix, &gtest_##prefix##test_suite_name##_EvalGenerator_, \
-                  &gtest_##prefix##test_suite_name##_EvalGenerateName_,       \
-                  __FILE__, __LINE__)
-
-// Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define INSTANTIATE_TEST_CASE_P                                            \
-  static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \
-                "");                                                       \
-  INSTANTIATE_TEST_SUITE_P
-#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/deps/googletest/include/gtest/gtest-printers.h b/deps/googletest/include/gtest/gtest-printers.h
deleted file mode 100644
index b4833c6eb..000000000
--- a/deps/googletest/include/gtest/gtest-printers.h
+++ /dev/null
@@ -1,927 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-// Google Test - The Google C++ Testing and Mocking Framework
-//
-// This file implements a universal value printer that can print a
-// value of any type T:
-//
-//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
-//
-// A user can teach this function how to print a class type T by
-// defining either operator<<() or PrintTo() in the namespace that
-// defines T.  More specifically, the FIRST defined function in the
-// following list will be used (assuming T is defined in namespace
-// foo):
-//
-//   1. foo::PrintTo(const T&, ostream*)
-//   2. operator<<(ostream&, const T&) defined in either foo or the
-//      global namespace.
-//
-// However if T is an STL-style container then it is printed element-wise
-// unless foo::PrintTo(const T&, ostream*) is defined. Note that
-// operator<<() is ignored for container types.
-//
-// If none of the above is defined, it will print the debug string of
-// the value if it is a protocol buffer, or print the raw bytes in the
-// value otherwise.
-//
-// To aid debugging: when T is a reference type, the address of the
-// value is also printed; when T is a (const) char pointer, both the
-// pointer value and the NUL-terminated string it points to are
-// printed.
-//
-// We also provide some convenient wrappers:
-//
-//   // Prints a value to a string.  For a (const or not) char
-//   // pointer, the NUL-terminated string (but not the pointer) is
-//   // printed.
-//   std::string ::testing::PrintToString(const T& value);
-//
-//   // Prints a value tersely: for a reference type, the referenced
-//   // value (but not the address) is printed; for a (const or not) char
-//   // pointer, the NUL-terminated string (but not the pointer) is
-//   // printed.
-//   void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
-//
-//   // Prints value using the type inferred by the compiler.  The difference
-//   // from UniversalTersePrint() is that this function prints both the
-//   // pointer and the NUL-terminated string for a (const or not) char pointer.
-//   void ::testing::internal::UniversalPrint(const T& value, ostream*);
-//
-//   // Prints the fields of a tuple tersely to a string vector, one
-//   // element for each field. Tuple support must be enabled in
-//   // gtest-port.h.
-//   std::vector<string> UniversalTersePrintTupleFieldsToStrings(
-//       const Tuple& value);
-//
-// Known limitation:
-//
-// The print primitives print the elements of an STL-style container
-// using the compiler-inferred type of *iter where iter is a
-// const_iterator of the container.  When const_iterator is an input
-// iterator but not a forward iterator, this inferred type may not
-// match value_type, and the print output may be incorrect.  In
-// practice, this is rarely a problem as for most containers
-// const_iterator is a forward iterator.  We'll fix this if there's an
-// actual need for it.  Note that this fix cannot rely on value_type
-// being defined as many user-defined container types don't have
-// value_type.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-
-#include <functional>
-#include <ostream>  // NOLINT
-#include <sstream>
-#include <string>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
-
-#if GTEST_HAS_ABSL
-#include "absl/strings/string_view.h"
-#include "absl/types/optional.h"
-#include "absl/types/variant.h"
-#endif  // GTEST_HAS_ABSL
-
-namespace testing {
-
-// Definitions in the 'internal' and 'internal2' name spaces are
-// subject to change without notice.  DO NOT USE THEM IN USER CODE!
-namespace internal2 {
-
-// Prints the given number of bytes in the given object to the given
-// ostream.
-GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
-                                     size_t count,
-                                     ::std::ostream* os);
-
-// For selecting which printer to use when a given type has neither <<
-// nor PrintTo().
-enum TypeKind {
-  kProtobuf,              // a protobuf type
-  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
-                          // (e.g. a named or unnamed enum type)
-#if GTEST_HAS_ABSL
-  kConvertibleToStringView,  // a type implicitly convertible to
-                             // absl::string_view
-#endif
-  kOtherType  // anything else
-};
-
-// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
-// by the universal printer to print a value of type T when neither
-// operator<< nor PrintTo() is defined for T, where kTypeKind is the
-// "kind" of T as defined by enum TypeKind.
-template <typename T, TypeKind kTypeKind>
-class TypeWithoutFormatter {
- public:
-  // This default version is called when kTypeKind is kOtherType.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    PrintBytesInObjectTo(static_cast<const unsigned char*>(
-                             reinterpret_cast<const void*>(&value)),
-                         sizeof(value), os);
-  }
-};
-
-// We print a protobuf using its ShortDebugString() when the string
-// doesn't exceed this many characters; otherwise we print it using
-// DebugString() for better readability.
-const size_t kProtobufOneLinerMaxLength = 50;
-
-template <typename T>
-class TypeWithoutFormatter<T, kProtobuf> {
- public:
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    std::string pretty_str = value.ShortDebugString();
-    if (pretty_str.length() > kProtobufOneLinerMaxLength) {
-      pretty_str = "\n" + value.DebugString();
-    }
-    *os << ("<" + pretty_str + ">");
-  }
-};
-
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToInteger> {
- public:
-  // Since T has no << operator or PrintTo() but can be implicitly
-  // converted to BiggestInt, we print it as a BiggestInt.
-  //
-  // Most likely T is an enum type (either named or unnamed), in which
-  // case printing it as an integer is the desired behavior.  In case
-  // T is not an enum, printing it as an integer is the best we can do
-  // given that it has no user-defined printer.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    const internal::BiggestInt kBigInt = value;
-    *os << kBigInt;
-  }
-};
-
-#if GTEST_HAS_ABSL
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToStringView> {
- public:
-  // Since T has neither operator<< nor PrintTo() but can be implicitly
-  // converted to absl::string_view, we print it as a absl::string_view.
-  //
-  // Note: the implementation is further below, as it depends on
-  // internal::PrintTo symbol which is defined later in the file.
-  static void PrintValue(const T& value, ::std::ostream* os);
-};
-#endif
-
-// Prints the given value to the given ostream.  If the value is a
-// protocol message, its debug string is printed; if it's an enum or
-// of a type implicitly convertible to BiggestInt, it's printed as an
-// integer; otherwise the bytes in the value are printed.  This is
-// what UniversalPrinter<T>::Print() does when it knows nothing about
-// type T and T has neither << operator nor PrintTo().
-//
-// A user can override this behavior for a class type Foo by defining
-// a << operator in the namespace where Foo is defined.
-//
-// We put this operator in namespace 'internal2' instead of 'internal'
-// to simplify the implementation, as much code in 'internal' needs to
-// use << in STL, which would conflict with our own << were it defined
-// in 'internal'.
-//
-// Note that this operator<< takes a generic std::basic_ostream<Char,
-// CharTraits> type instead of the more restricted std::ostream.  If
-// we define it to take an std::ostream instead, we'll get an
-// "ambiguous overloads" compiler error when trying to print a type
-// Foo that supports streaming to std::basic_ostream<Char,
-// CharTraits>, as the compiler cannot tell whether
-// operator<<(std::ostream&, const T&) or
-// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
-// specific.
-template <typename Char, typename CharTraits, typename T>
-::std::basic_ostream<Char, CharTraits>& operator<<(
-    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
-  TypeWithoutFormatter<T, (internal::IsAProtocolMessage<T>::value
-                               ? kProtobuf
-                               : std::is_convertible<
-                                     const T&, internal::BiggestInt>::value
-                                     ? kConvertibleToInteger
-                                     :
-#if GTEST_HAS_ABSL
-                                     std::is_convertible<
-                                         const T&, absl::string_view>::value
-                                         ? kConvertibleToStringView
-                                         :
-#endif
-                                         kOtherType)>::PrintValue(x, &os);
-  return os;
-}
-
-}  // namespace internal2
-}  // namespace testing
-
-// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
-// magic needed for implementing UniversalPrinter won't work.
-namespace testing_internal {
-
-// Used to print a value that is not an STL-style container when the
-// user doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
-  // With the following statement, during unqualified name lookup,
-  // testing::internal2::operator<< appears as if it was declared in
-  // the nearest enclosing namespace that contains both
-  // ::testing_internal and ::testing::internal2, i.e. the global
-  // namespace.  For more details, refer to the C++ Standard section
-  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
-  // testing::internal2::operator<< in case T doesn't come with a <<
-  // operator.
-  //
-  // We cannot write 'using ::testing::internal2::operator<<;', which
-  // gcc 3.3 fails to compile due to a compiler bug.
-  using namespace ::testing::internal2;  // NOLINT
-
-  // Assuming T is defined in namespace foo, in the next statement,
-  // the compiler will consider all of:
-  //
-  //   1. foo::operator<< (thanks to Koenig look-up),
-  //   2. ::operator<< (as the current namespace is enclosed in ::),
-  //   3. testing::internal2::operator<< (thanks to the using statement above).
-  //
-  // The operator<< whose type matches T best will be picked.
-  //
-  // We deliberately allow #2 to be a candidate, as sometimes it's
-  // impossible to define #1 (e.g. when foo is ::std, defining
-  // anything in it is undefined behavior unless you are a compiler
-  // vendor.).
-  *os << value;
-}
-
-}  // namespace testing_internal
-
-namespace testing {
-namespace internal {
-
-// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
-// value of type ToPrint that is an operand of a comparison assertion
-// (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
-// the comparison, and is used to help determine the best way to
-// format the value.  In particular, when the value is a C string
-// (char pointer) and the other operand is an STL string object, we
-// want to format the C string as a string, since we know it is
-// compared by value with the string object.  If the value is a char
-// pointer but the other operand is not an STL string object, we don't
-// know whether the pointer is supposed to point to a NUL-terminated
-// string, and thus want to print it as a pointer to be safe.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-
-// The default case.
-template <typename ToPrint, typename OtherOperand>
-class FormatForComparison {
- public:
-  static ::std::string Format(const ToPrint& value) {
-    return ::testing::PrintToString(value);
-  }
-};
-
-// Array.
-template <typename ToPrint, size_t N, typename OtherOperand>
-class FormatForComparison<ToPrint[N], OtherOperand> {
- public:
-  static ::std::string Format(const ToPrint* value) {
-    return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
-  }
-};
-
-// By default, print C string as pointers to be safe, as we don't know
-// whether they actually point to a NUL-terminated string.
-
-#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType)                \
-  template <typename OtherOperand>                                      \
-  class FormatForComparison<CharType*, OtherOperand> {                  \
-   public:                                                              \
-    static ::std::string Format(CharType* value) {                      \
-      return ::testing::PrintToString(static_cast<const void*>(value)); \
-    }                                                                   \
-  }
-
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
-GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
-
-#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
-
-// If a C string is compared with an STL string object, we know it's meant
-// to point to a NUL-terminated string, and thus can print it as a string.
-
-#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
-  template <>                                                           \
-  class FormatForComparison<CharType*, OtherStringType> {               \
-   public:                                                              \
-    static ::std::string Format(CharType* value) {                      \
-      return ::testing::PrintToString(value);                           \
-    }                                                                   \
-  }
-
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
-
-#if GTEST_HAS_STD_WSTRING
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
-GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
-#endif
-
-#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
-
-// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc)
-// operand to be used in a failure message.  The type (but not value)
-// of the other operand may affect the format.  This allows us to
-// print a char* as a raw pointer when it is compared against another
-// char* or void*, and print it as a C string when it is compared
-// against an std::string object, for example.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(
-    const T1& value, const T2& /* other_operand */) {
-  return FormatForComparison<T1, T2>::Format(value);
-}
-
-// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
-// value to the given ostream.  The caller must ensure that
-// 'ostream_ptr' is not NULL, or the behavior is undefined.
-//
-// We define UniversalPrinter as a class template (as opposed to a
-// function template), as we need to partially specialize it for
-// reference types, which cannot be done with function templates.
-template <typename T>
-class UniversalPrinter;
-
-template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os);
-
-enum DefaultPrinterType {
-  kPrintContainer,
-  kPrintPointer,
-  kPrintFunctionPointer,
-  kPrintOther,
-};
-template <DefaultPrinterType type> struct WrapPrinterType {};
-
-// Used to print an STL-style container when the user doesn't define
-// a PrintTo() for it.
-template <typename C>
-void DefaultPrintTo(WrapPrinterType<kPrintContainer> /* dummy */,
-                    const C& container, ::std::ostream* os) {
-  const size_t kMaxCount = 32;  // The maximum number of elements to print.
-  *os << '{';
-  size_t count = 0;
-  for (typename C::const_iterator it = container.begin();
-       it != container.end(); ++it, ++count) {
-    if (count > 0) {
-      *os << ',';
-      if (count == kMaxCount) {  // Enough has been printed.
-        *os << " ...";
-        break;
-      }
-    }
-    *os << ' ';
-    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
-    // handle *it being a native array.
-    internal::UniversalPrint(*it, os);
-  }
-
-  if (count > 0) {
-    *os << ' ';
-  }
-  *os << '}';
-}
-
-// Used to print a pointer that is neither a char pointer nor a member
-// pointer, when the user doesn't define PrintTo() for it.  (A member
-// variable pointer or member function pointer doesn't really point to
-// a location in the address space.  Their representation is
-// implementation-defined.  Therefore they will be printed as raw
-// bytes.)
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintPointer> /* dummy */,
-                    T* p, ::std::ostream* os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is not a function type.  We just call << to print p,
-    // relying on ADL to pick up user-defined << for their pointer
-    // types, if any.
-    *os << p;
-  }
-}
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintFunctionPointer> /* dummy */,
-                    T* p, ::std::ostream* os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is a function type, so '*os << p' doesn't do what we want
-    // (it just prints p as bool).  We want to print p as a const
-    // void*.
-    *os << reinterpret_cast<const void*>(p);
-  }
-}
-
-// Used to print a non-container, non-pointer value when the user
-// doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */,
-                    const T& value, ::std::ostream* os) {
-  ::testing_internal::DefaultPrintNonContainerTo(value, os);
-}
-
-// Prints the given value using the << operator if it has one;
-// otherwise prints the bytes in it.  This is what
-// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
-// or overloaded for type T.
-//
-// A user can override this behavior for a class type Foo by defining
-// an overload of PrintTo() in the namespace where Foo is defined.  We
-// give the user this option as sometimes defining a << operator for
-// Foo is not desirable (e.g. the coding style may prevent doing it,
-// or there is already a << operator but it doesn't do what the user
-// wants).
-template <typename T>
-void PrintTo(const T& value, ::std::ostream* os) {
-  // DefaultPrintTo() is overloaded.  The type of its first argument
-  // determines which version will be picked.
-  //
-  // Note that we check for container types here, prior to we check
-  // for protocol message types in our operator<<.  The rationale is:
-  //
-  // For protocol messages, we want to give people a chance to
-  // override Google Mock's format by defining a PrintTo() or
-  // operator<<.  For STL containers, other formats can be
-  // incompatible with Google Mock's format for the container
-  // elements; therefore we check for container types here to ensure
-  // that our format is used.
-  //
-  // Note that MSVC and clang-cl do allow an implicit conversion from
-  // pointer-to-function to pointer-to-object, but clang-cl warns on it.
-  // So don't use ImplicitlyConvertible if it can be helped since it will
-  // cause this warning, and use a separate overload of DefaultPrintTo for
-  // function pointers so that the `*os << p` in the object pointer overload
-  // doesn't cause that warning either.
-  DefaultPrintTo(
-      WrapPrinterType <
-                  (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
-              !IsRecursiveContainer<T>::value
-          ? kPrintContainer
-          : !std::is_pointer<T>::value
-                ? kPrintOther
-                : std::is_function<typename std::remove_pointer<T>::type>::value
-                      ? kPrintFunctionPointer
-                      : kPrintPointer > (),
-      value, os);
-}
-
-// The following list of PrintTo() overloads tells
-// UniversalPrinter<T>::Print() how to print standard types (built-in
-// types, strings, plain arrays, and pointers).
-
-// Overloads for various char types.
-GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
-GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
-inline void PrintTo(char c, ::std::ostream* os) {
-  // When printing a plain char, we always treat it as unsigned.  This
-  // way, the output won't be affected by whether the compiler thinks
-  // char is signed or not.
-  PrintTo(static_cast<unsigned char>(c), os);
-}
-
-// Overloads for other simple built-in types.
-inline void PrintTo(bool x, ::std::ostream* os) {
-  *os << (x ? "true" : "false");
-}
-
-// Overload for wchar_t type.
-// Prints a wchar_t as a symbol if it is printable or as its internal
-// code otherwise and also as its decimal code (except for L'\0').
-// The L'\0' char is printed as "L'\\0'". The decimal code is printed
-// as signed integer when wchar_t is implemented by the compiler
-// as a signed type and is printed as an unsigned integer when wchar_t
-// is implemented as an unsigned type.
-GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
-
-// Overloads for C strings.
-GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
-inline void PrintTo(char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const char*>(s), os);
-}
-
-// signed/unsigned char is often used for representing binary data, so
-// we print pointers to it as void* to be safe.
-inline void PrintTo(const signed char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-inline void PrintTo(signed char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-inline void PrintTo(unsigned char* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const void*>(s), os);
-}
-
-// MSVC can be configured to define wchar_t as a typedef of unsigned
-// short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
-// type.  When wchar_t is a typedef, defining an overload for const
-// wchar_t* would cause unsigned short* be printed as a wide string,
-// possibly causing invalid memory accesses.
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
-// Overloads for wide C strings
-GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
-inline void PrintTo(wchar_t* s, ::std::ostream* os) {
-  PrintTo(ImplicitCast_<const wchar_t*>(s), os);
-}
-#endif
-
-// Overload for C arrays.  Multi-dimensional arrays are printed
-// properly.
-
-// Prints the given number of elements in an array, without printing
-// the curly braces.
-template <typename T>
-void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
-  UniversalPrint(a[0], os);
-  for (size_t i = 1; i != count; i++) {
-    *os << ", ";
-    UniversalPrint(a[i], os);
-  }
-}
-
-// Overloads for ::std::string.
-GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
-inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
-  PrintStringTo(s, os);
-}
-
-// Overloads for ::std::wstring.
-#if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
-inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
-  PrintWideStringTo(s, os);
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-#if GTEST_HAS_ABSL
-// Overload for absl::string_view.
-inline void PrintTo(absl::string_view sp, ::std::ostream* os) {
-  PrintTo(::std::string(sp), os);
-}
-#endif  // GTEST_HAS_ABSL
-
-inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
-
-template <typename T>
-void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
-  UniversalPrinter<T&>::Print(ref.get(), os);
-}
-
-// Helper function for printing a tuple.  T must be instantiated with
-// a tuple type.
-template <typename T>
-void PrintTupleTo(const T&, std::integral_constant<size_t, 0>,
-                  ::std::ostream*) {}
-
-template <typename T, size_t I>
-void PrintTupleTo(const T& t, std::integral_constant<size_t, I>,
-                  ::std::ostream* os) {
-  PrintTupleTo(t, std::integral_constant<size_t, I - 1>(), os);
-  GTEST_INTENTIONAL_CONST_COND_PUSH_()
-  if (I > 1) {
-    GTEST_INTENTIONAL_CONST_COND_POP_()
-    *os << ", ";
-  }
-  UniversalPrinter<typename std::tuple_element<I - 1, T>::type>::Print(
-      std::get<I - 1>(t), os);
-}
-
-template <typename... Types>
-void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
-  *os << "(";
-  PrintTupleTo(t, std::integral_constant<size_t, sizeof...(Types)>(), os);
-  *os << ")";
-}
-
-// Overload for std::pair.
-template <typename T1, typename T2>
-void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
-  *os << '(';
-  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
-  // a reference type.  The same for printing value.second.
-  UniversalPrinter<T1>::Print(value.first, os);
-  *os << ", ";
-  UniversalPrinter<T2>::Print(value.second, os);
-  *os << ')';
-}
-
-// Implements printing a non-reference type T by letting the compiler
-// pick the right overload of PrintTo() for T.
-template <typename T>
-class UniversalPrinter {
- public:
-  // MSVC warns about adding const to a function type, so we want to
-  // disable the warning.
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
-
-  // Note: we deliberately don't call this PrintTo(), as that name
-  // conflicts with ::testing::internal::PrintTo in the body of the
-  // function.
-  static void Print(const T& value, ::std::ostream* os) {
-    // By default, ::testing::internal::PrintTo() is used for printing
-    // the value.
-    //
-    // Thanks to Koenig look-up, if T is a class and has its own
-    // PrintTo() function defined in its namespace, that function will
-    // be visible here.  Since it is more specific than the generic ones
-    // in ::testing::internal, it will be picked by the compiler in the
-    // following statement - exactly what we want.
-    PrintTo(value, os);
-  }
-
-  GTEST_DISABLE_MSC_WARNINGS_POP_()
-};
-
-#if GTEST_HAS_ABSL
-
-// Printer for absl::optional
-
-template <typename T>
-class UniversalPrinter<::absl::optional<T>> {
- public:
-  static void Print(const ::absl::optional<T>& value, ::std::ostream* os) {
-    *os << '(';
-    if (!value) {
-      *os << "nullopt";
-    } else {
-      UniversalPrint(*value, os);
-    }
-    *os << ')';
-  }
-};
-
-// Printer for absl::variant
-
-template <typename... T>
-class UniversalPrinter<::absl::variant<T...>> {
- public:
-  static void Print(const ::absl::variant<T...>& value, ::std::ostream* os) {
-    *os << '(';
-    absl::visit(Visitor{os}, value);
-    *os << ')';
-  }
-
- private:
-  struct Visitor {
-    template <typename U>
-    void operator()(const U& u) const {
-      *os << "'" << GetTypeName<U>() << "' with value ";
-      UniversalPrint(u, os);
-    }
-    ::std::ostream* os;
-  };
-};
-
-#endif  // GTEST_HAS_ABSL
-
-// UniversalPrintArray(begin, len, os) prints an array of 'len'
-// elements, starting at address 'begin'.
-template <typename T>
-void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
-  if (len == 0) {
-    *os << "{}";
-  } else {
-    *os << "{ ";
-    const size_t kThreshold = 18;
-    const size_t kChunkSize = 8;
-    // If the array has more than kThreshold elements, we'll have to
-    // omit some details by printing only the first and the last
-    // kChunkSize elements.
-    if (len <= kThreshold) {
-      PrintRawArrayTo(begin, len, os);
-    } else {
-      PrintRawArrayTo(begin, kChunkSize, os);
-      *os << ", ..., ";
-      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
-    }
-    *os << " }";
-  }
-}
-// This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const char* begin, size_t len, ::std::ostream* os);
-
-// This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(
-    const wchar_t* begin, size_t len, ::std::ostream* os);
-
-// Implements printing an array type T[N].
-template <typename T, size_t N>
-class UniversalPrinter<T[N]> {
- public:
-  // Prints the given array, omitting some elements when there are too
-  // many.
-  static void Print(const T (&a)[N], ::std::ostream* os) {
-    UniversalPrintArray(a, N, os);
-  }
-};
-
-// Implements printing a reference type T&.
-template <typename T>
-class UniversalPrinter<T&> {
- public:
-  // MSVC warns about adding const to a function type, so we want to
-  // disable the warning.
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
-
-  static void Print(const T& value, ::std::ostream* os) {
-    // Prints the address of the value.  We use reinterpret_cast here
-    // as static_cast doesn't compile when T is a function type.
-    *os << "@" << reinterpret_cast<const void*>(&value) << " ";
-
-    // Then prints the value itself.
-    UniversalPrint(value, os);
-  }
-
-  GTEST_DISABLE_MSC_WARNINGS_POP_()
-};
-
-// Prints a value tersely: for a reference type, the referenced value
-// (but not the address) is printed; for a (const) char pointer, the
-// NUL-terminated string (but not the pointer) is printed.
-
-template <typename T>
-class UniversalTersePrinter {
- public:
-  static void Print(const T& value, ::std::ostream* os) {
-    UniversalPrint(value, os);
-  }
-};
-template <typename T>
-class UniversalTersePrinter<T&> {
- public:
-  static void Print(const T& value, ::std::ostream* os) {
-    UniversalPrint(value, os);
-  }
-};
-template <typename T, size_t N>
-class UniversalTersePrinter<T[N]> {
- public:
-  static void Print(const T (&value)[N], ::std::ostream* os) {
-    UniversalPrinter<T[N]>::Print(value, os);
-  }
-};
-template <>
-class UniversalTersePrinter<const char*> {
- public:
-  static void Print(const char* str, ::std::ostream* os) {
-    if (str == nullptr) {
-      *os << "NULL";
-    } else {
-      UniversalPrint(std::string(str), os);
-    }
-  }
-};
-template <>
-class UniversalTersePrinter<char*> {
- public:
-  static void Print(char* str, ::std::ostream* os) {
-    UniversalTersePrinter<const char*>::Print(str, os);
-  }
-};
-
-#if GTEST_HAS_STD_WSTRING
-template <>
-class UniversalTersePrinter<const wchar_t*> {
- public:
-  static void Print(const wchar_t* str, ::std::ostream* os) {
-    if (str == nullptr) {
-      *os << "NULL";
-    } else {
-      UniversalPrint(::std::wstring(str), os);
-    }
-  }
-};
-#endif
-
-template <>
-class UniversalTersePrinter<wchar_t*> {
- public:
-  static void Print(wchar_t* str, ::std::ostream* os) {
-    UniversalTersePrinter<const wchar_t*>::Print(str, os);
-  }
-};
-
-template <typename T>
-void UniversalTersePrint(const T& value, ::std::ostream* os) {
-  UniversalTersePrinter<T>::Print(value, os);
-}
-
-// Prints a value using the type inferred by the compiler.  The
-// difference between this and UniversalTersePrint() is that for a
-// (const) char pointer, this prints both the pointer and the
-// NUL-terminated string.
-template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os) {
-  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
-  // UniversalPrinter with T directly.
-  typedef T T1;
-  UniversalPrinter<T1>::Print(value, os);
-}
-
-typedef ::std::vector< ::std::string> Strings;
-
-  // Tersely prints the first N fields of a tuple to a string vector,
-  // one element for each field.
-template <typename Tuple>
-void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
-                               Strings*) {}
-template <typename Tuple, size_t I>
-void TersePrintPrefixToStrings(const Tuple& t,
-                               std::integral_constant<size_t, I>,
-                               Strings* strings) {
-  TersePrintPrefixToStrings(t, std::integral_constant<size_t, I - 1>(),
-                            strings);
-  ::std::stringstream ss;
-  UniversalTersePrint(std::get<I - 1>(t), &ss);
-  strings->push_back(ss.str());
-}
-
-// Prints the fields of a tuple tersely to a string vector, one
-// element for each field.  See the comment before
-// UniversalTersePrint() for how we define "tersely".
-template <typename Tuple>
-Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
-  Strings result;
-  TersePrintPrefixToStrings(
-      value, std::integral_constant<size_t, std::tuple_size<Tuple>::value>(),
-      &result);
-  return result;
-}
-
-}  // namespace internal
-
-#if GTEST_HAS_ABSL
-namespace internal2 {
-template <typename T>
-void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
-    const T& value, ::std::ostream* os) {
-  internal::PrintTo(absl::string_view(value), os);
-}
-}  // namespace internal2
-#endif
-
-template <typename T>
-::std::string PrintToString(const T& value) {
-  ::std::stringstream ss;
-  internal::UniversalTersePrinter<T>::Print(value, &ss);
-  return ss.str();
-}
-
-}  // namespace testing
-
-// Include any custom printer added by the local installation.
-// We must include this header at the end to make sure it can use the
-// declarations from this file.
-#include "gtest/internal/custom/gtest-printers.h"
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/deps/googletest/include/gtest/gtest-spi.h b/deps/googletest/include/gtest/gtest-spi.h
deleted file mode 100644
index aa38870e8..000000000
--- a/deps/googletest/include/gtest/gtest-spi.h
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// Utilities for testing Google Test itself and code that uses Google Test
-// (e.g. frameworks built on top of Google Test).
-
-// GOOGLETEST_CM0004 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-
-#include "gtest/gtest.h"
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-namespace testing {
-
-// This helper class can be used to mock out Google Test failure reporting
-// so that we can test Google Test or code that builds on Google Test.
-//
-// An object of this class appends a TestPartResult object to the
-// TestPartResultArray object given in the constructor whenever a Google Test
-// failure is reported. It can either intercept only failures that are
-// generated in the same thread that created this object or it can intercept
-// all generated failures. The scope of this mock object can be controlled with
-// the second argument to the two arguments constructor.
-class GTEST_API_ ScopedFakeTestPartResultReporter
-    : public TestPartResultReporterInterface {
- public:
-  // The two possible mocking modes of this object.
-  enum InterceptMode {
-    INTERCEPT_ONLY_CURRENT_THREAD,  // Intercepts only thread local failures.
-    INTERCEPT_ALL_THREADS           // Intercepts all failures.
-  };
-
-  // The c'tor sets this object as the test part result reporter used
-  // by Google Test.  The 'result' parameter specifies where to report the
-  // results. This reporter will only catch failures generated in the current
-  // thread. DEPRECATED
-  explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
-
-  // Same as above, but you can choose the interception scope of this object.
-  ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
-                                   TestPartResultArray* result);
-
-  // The d'tor restores the previous test part result reporter.
-  ~ScopedFakeTestPartResultReporter() override;
-
-  // Appends the TestPartResult object to the TestPartResultArray
-  // received in the constructor.
-  //
-  // This method is from the TestPartResultReporterInterface
-  // interface.
-  void ReportTestPartResult(const TestPartResult& result) override;
-
- private:
-  void Init();
-
-  const InterceptMode intercept_mode_;
-  TestPartResultReporterInterface* old_reporter_;
-  TestPartResultArray* const result_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
-};
-
-namespace internal {
-
-// A helper class for implementing EXPECT_FATAL_FAILURE() and
-// EXPECT_NONFATAL_FAILURE().  Its destructor verifies that the given
-// TestPartResultArray contains exactly one failure that has the given
-// type and contains the given substring.  If that's not the case, a
-// non-fatal failure will be generated.
-class GTEST_API_ SingleFailureChecker {
- public:
-  // The constructor remembers the arguments.
-  SingleFailureChecker(const TestPartResultArray* results,
-                       TestPartResult::Type type, const std::string& substr);
-  ~SingleFailureChecker();
- private:
-  const TestPartResultArray* const results_;
-  const TestPartResult::Type type_;
-  const std::string substr_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
-};
-
-}  // namespace internal
-
-}  // namespace testing
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-// A set of macros for testing Google Test assertions or code that's expected
-// to generate Google Test fatal failures.  It verifies that the given
-// statement will cause exactly one fatal Google Test failure with 'substr'
-// being part of the failure message.
-//
-// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
-// affects and considers failures generated in the current thread and
-// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
-//
-// The verification of the assertion is done correctly even when the statement
-// throws an exception or aborts the current function.
-//
-// Known restrictions:
-//   - 'statement' cannot reference local non-static variables or
-//     non-static members of the current object.
-//   - 'statement' cannot return a value.
-//   - You cannot stream a failure message to this macro.
-//
-// Note that even though the implementations of the following two
-// macros are much alike, we cannot refactor them to use a common
-// helper macro, due to some peculiarity in how the preprocessor
-// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
-// gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do { \
-    class GTestExpectFatalFailureHelper {\
-     public:\
-      static void Execute() { statement; }\
-    };\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ALL_THREADS, &gtest_failures);\
-      GTestExpectFatalFailureHelper::Execute();\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-// A macro for testing Google Test assertions or code that's expected to
-// generate Google Test non-fatal failures.  It asserts that the given
-// statement will cause exactly one non-fatal Google Test failure with 'substr'
-// being part of the failure message.
-//
-// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
-// affects and considers failures generated in the current thread and
-// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
-//
-// 'statement' is allowed to reference local variables and members of
-// the current object.
-//
-// The verification of the assertion is done correctly even when the statement
-// throws an exception or aborts the current function.
-//
-// Known restrictions:
-//   - You cannot stream a failure message to this macro.
-//
-// Note that even though the implementations of the following two
-// macros are much alike, we cannot refactor them to use a common
-// helper macro, due to some peculiarity in how the preprocessor
-// works.  If we do that, the code won't compile when the user gives
-// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
-// expands to code containing an unprotected comma.  The
-// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
-// catches that.
-//
-// For the same reason, we have to write
-//   if (::testing::internal::AlwaysTrue()) { statement; }
-// instead of
-//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
-// to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter:: \
-          INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
-  do {\
-    ::testing::TestPartResultArray gtest_failures;\
-    ::testing::internal::SingleFailureChecker gtest_checker(\
-        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
-        (substr));\
-    {\
-      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
-          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
-          &gtest_failures);\
-      if (::testing::internal::AlwaysTrue()) { statement; }\
-    }\
-  } while (::testing::internal::AlwaysFalse())
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/deps/googletest/include/gtest/gtest-test-part.h b/deps/googletest/include/gtest/gtest-test-part.h
deleted file mode 100644
index 1e1cb097a..000000000
--- a/deps/googletest/include/gtest/gtest-test-part.h
+++ /dev/null
@@ -1,184 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-
-#include <iosfwd>
-#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-namespace testing {
-
-// A copyable object representing the result of a test part (i.e. an
-// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()).
-//
-// Don't inherit from TestPartResult as its destructor is not virtual.
-class GTEST_API_ TestPartResult {
- public:
-  // The possible outcomes of a test part (i.e. an assertion or an
-  // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
-  enum Type {
-    kSuccess,          // Succeeded.
-    kNonFatalFailure,  // Failed but the test can continue.
-    kFatalFailure,     // Failed and the test should be terminated.
-    kSkip              // Skipped.
-  };
-
-  // C'tor.  TestPartResult does NOT have a default constructor.
-  // Always use this constructor (with parameters) to create a
-  // TestPartResult object.
-  TestPartResult(Type a_type, const char* a_file_name, int a_line_number,
-                 const char* a_message)
-      : type_(a_type),
-        file_name_(a_file_name == nullptr ? "" : a_file_name),
-        line_number_(a_line_number),
-        summary_(ExtractSummary(a_message)),
-        message_(a_message) {}
-
-  // Gets the outcome of the test part.
-  Type type() const { return type_; }
-
-  // Gets the name of the source file where the test part took place, or
-  // NULL if it's unknown.
-  const char* file_name() const {
-    return file_name_.empty() ? nullptr : file_name_.c_str();
-  }
-
-  // Gets the line in the source file where the test part took place,
-  // or -1 if it's unknown.
-  int line_number() const { return line_number_; }
-
-  // Gets the summary of the failure message.
-  const char* summary() const { return summary_.c_str(); }
-
-  // Gets the message associated with the test part.
-  const char* message() const { return message_.c_str(); }
-
-  // Returns true iff the test part was skipped.
-  bool skipped() const { return type_ == kSkip; }
-
-  // Returns true iff the test part passed.
-  bool passed() const { return type_ == kSuccess; }
-
-  // Returns true iff the test part non-fatally failed.
-  bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
-
-  // Returns true iff the test part fatally failed.
-  bool fatally_failed() const { return type_ == kFatalFailure; }
-
-  // Returns true iff the test part failed.
-  bool failed() const { return fatally_failed() || nonfatally_failed(); }
-
- private:
-  Type type_;
-
-  // Gets the summary of the failure message by omitting the stack
-  // trace in it.
-  static std::string ExtractSummary(const char* message);
-
-  // The name of the source file where the test part took place, or
-  // "" if the source file is unknown.
-  std::string file_name_;
-  // The line in the source file where the test part took place, or -1
-  // if the line number is unknown.
-  int line_number_;
-  std::string summary_;  // The test failure summary.
-  std::string message_;  // The test failure message.
-};
-
-// Prints a TestPartResult object.
-std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
-
-// An array of TestPartResult objects.
-//
-// Don't inherit from TestPartResultArray as its destructor is not
-// virtual.
-class GTEST_API_ TestPartResultArray {
- public:
-  TestPartResultArray() {}
-
-  // Appends the given TestPartResult to the array.
-  void Append(const TestPartResult& result);
-
-  // Returns the TestPartResult at the given index (0-based).
-  const TestPartResult& GetTestPartResult(int index) const;
-
-  // Returns the number of TestPartResult objects in the array.
-  int size() const;
-
- private:
-  std::vector<TestPartResult> array_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
-};
-
-// This interface knows how to report a test part result.
-class GTEST_API_ TestPartResultReporterInterface {
- public:
-  virtual ~TestPartResultReporterInterface() {}
-
-  virtual void ReportTestPartResult(const TestPartResult& result) = 0;
-};
-
-namespace internal {
-
-// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
-// statement generates new fatal failures. To do so it registers itself as the
-// current test part result reporter. Besides checking if fatal failures were
-// reported, it only delegates the reporting to the former result reporter.
-// The original result reporter is restored in the destructor.
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-class GTEST_API_ HasNewFatalFailureHelper
-    : public TestPartResultReporterInterface {
- public:
-  HasNewFatalFailureHelper();
-  ~HasNewFatalFailureHelper() override;
-  void ReportTestPartResult(const TestPartResult& result) override;
-  bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
- private:
-  bool has_new_fatal_failure_;
-  TestPartResultReporterInterface* original_reporter_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
-};
-
-}  // namespace internal
-
-}  // namespace testing
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/deps/googletest/include/gtest/gtest-typed-test.h b/deps/googletest/include/gtest/gtest-typed-test.h
deleted file mode 100644
index b3319f682..000000000
--- a/deps/googletest/include/gtest/gtest-typed-test.h
+++ /dev/null
@@ -1,336 +0,0 @@
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-
-// This header implements typed tests and type-parameterized tests.
-
-// Typed (aka type-driven) tests repeat the same test for types in a
-// list.  You must know which types you want to test with when writing
-// typed tests. Here's how you do it:
-
-#if 0
-
-// First, define a fixture class template.  It should be parameterized
-// by a type.  Remember to derive it from testing::Test.
-template <typename T>
-class FooTest : public testing::Test {
- public:
-  ...
-  typedef std::list<T> List;
-  static T shared_;
-  T value_;
-};
-
-// Next, associate a list of types with the test suite, which will be
-// repeated for each type in the list.  The typedef is necessary for
-// the macro to parse correctly.
-typedef testing::Types<char, int, unsigned int> MyTypes;
-TYPED_TEST_SUITE(FooTest, MyTypes);
-
-// If the type list contains only one type, you can write that type
-// directly without Types<...>:
-//   TYPED_TEST_SUITE(FooTest, int);
-
-// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
-// tests for this test suite as you want.
-TYPED_TEST(FooTest, DoesBlah) {
-  // Inside a test, refer to TypeParam to get the type parameter.
-  // Since we are inside a derived class template, C++ requires use to
-  // visit the members of FooTest via 'this'.
-  TypeParam n = this->value_;
-
-  // To visit static members of the fixture, add the TestFixture::
-  // prefix.
-  n += TestFixture::shared_;
-
-  // To refer to typedefs in the fixture, add the "typename
-  // TestFixture::" prefix.
-  typename TestFixture::List values;
-  values.push_back(n);
-  ...
-}
-
-TYPED_TEST(FooTest, HasPropertyA) { ... }
-
-// TYPED_TEST_SUITE takes an optional third argument which allows to specify a
-// class that generates custom test name suffixes based on the type. This should
-// be a class which has a static template function GetName(int index) returning
-// a string for each type. The provided integer index equals the index of the
-// type in the provided type list. In many cases the index can be ignored.
-//
-// For example:
-//   class MyTypeNames {
-//    public:
-//     template <typename T>
-//     static std::string GetName(int) {
-//       if (std::is_same<T, char>()) return "char";
-//       if (std::is_same<T, int>()) return "int";
-//       if (std::is_same<T, unsigned int>()) return "unsignedInt";
-//     }
-//   };
-//   TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames);
-
-#endif  // 0
-
-// Type-parameterized tests are abstract test patterns parameterized
-// by a type.  Compared with typed tests, type-parameterized tests
-// allow you to define the test pattern without knowing what the type
-// parameters are.  The defined pattern can be instantiated with
-// different types any number of times, in any number of translation
-// units.
-//
-// If you are designing an interface or concept, you can define a
-// suite of type-parameterized tests to verify properties that any
-// valid implementation of the interface/concept should have.  Then,
-// each implementation can easily instantiate the test suite to verify
-// that it conforms to the requirements, without having to write
-// similar tests repeatedly.  Here's an example:
-
-#if 0
-
-// First, define a fixture class template.  It should be parameterized
-// by a type.  Remember to derive it from testing::Test.
-template <typename T>
-class FooTest : public testing::Test {
-  ...
-};
-
-// Next, declare that you will define a type-parameterized test suite
-// (the _P suffix is for "parameterized" or "pattern", whichever you
-// prefer):
-TYPED_TEST_SUITE_P(FooTest);
-
-// Then, use TYPED_TEST_P() to define as many type-parameterized tests
-// for this type-parameterized test suite as you want.
-TYPED_TEST_P(FooTest, DoesBlah) {
-  // Inside a test, refer to TypeParam to get the type parameter.
-  TypeParam n = 0;
-  ...
-}
-
-TYPED_TEST_P(FooTest, HasPropertyA) { ... }
-
-// Now the tricky part: you need to register all test patterns before
-// you can instantiate them.  The first argument of the macro is the
-// test suite name; the rest are the names of the tests in this test
-// case.
-REGISTER_TYPED_TEST_SUITE_P(FooTest,
-                            DoesBlah, HasPropertyA);
-
-// Finally, you are free to instantiate the pattern with the types you
-// want.  If you put the above code in a header file, you can #include
-// it in multiple C++ source files and instantiate it multiple times.
-//
-// To distinguish different instances of the pattern, the first
-// argument to the INSTANTIATE_* macro is a prefix that will be added
-// to the actual test suite name.  Remember to pick unique prefixes for
-// different instances.
-typedef testing::Types<char, int, unsigned int> MyTypes;
-INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
-
-// If the type list contains only one type, you can write that type
-// directly without Types<...>:
-//   INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int);
-//
-// Similar to the optional argument of TYPED_TEST_SUITE above,
-// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to
-// generate custom names.
-//   INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames);
-
-#endif  // 0
-
-#include "gtest/internal/gtest-port.h"
-#include "gtest/internal/gtest-type-util.h"
-
-// Implements typed tests.
-
-#if GTEST_HAS_TYPED_TEST
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Expands to the name of the typedef for the type parameters of the
-// given test suite.
-#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_
-
-// Expands to the name of the typedef for the NameGenerator, responsible for
-// creating the suffixes of the name.
-#define GTEST_NAME_GENERATOR_(TestSuiteName) \
-  gtest_type_params_##TestSuiteName##_NameGenerator
-
-// The 'Types' template argument below must have spaces around it
-// since some compilers may choke on '>>' when passing a template
-// instance (e.g. Types<int>)
-#define TYPED_TEST_SUITE(CaseName, Types, ...)                           \
-  typedef ::testing::internal::TypeList<Types>::type GTEST_TYPE_PARAMS_( \
-      CaseName);                                                         \
-  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type  \
-      GTEST_NAME_GENERATOR_(CaseName)
-
-# define TYPED_TEST(CaseName, TestName)                                       \
-  template <typename gtest_TypeParam_>                                        \
-  class GTEST_TEST_CLASS_NAME_(CaseName, TestName)                            \
-      : public CaseName<gtest_TypeParam_> {                                   \
-   private:                                                                   \
-    typedef CaseName<gtest_TypeParam_> TestFixture;                           \
-    typedef gtest_TypeParam_ TypeParam;                                       \
-    virtual void TestBody();                                                  \
-  };                                                                          \
-  static bool gtest_##CaseName##_##TestName##_registered_                     \
-        GTEST_ATTRIBUTE_UNUSED_ =                                             \
-      ::testing::internal::TypeParameterizedTest<                             \
-          CaseName,                                                           \
-          ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(CaseName,   \
-                                                                  TestName)>, \
-          GTEST_TYPE_PARAMS_(                                                 \
-              CaseName)>::Register("",                                        \
-                                   ::testing::internal::CodeLocation(         \
-                                       __FILE__, __LINE__),                   \
-                                   #CaseName, #TestName, 0,                   \
-                                   ::testing::internal::GenerateNames<        \
-                                       GTEST_NAME_GENERATOR_(CaseName),       \
-                                       GTEST_TYPE_PARAMS_(CaseName)>());      \
-  template <typename gtest_TypeParam_>                                        \
-  void GTEST_TEST_CLASS_NAME_(CaseName,                                       \
-                              TestName)<gtest_TypeParam_>::TestBody()
-
-// Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define TYPED_TEST_CASE                                                \
-  static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \
-  TYPED_TEST_SUITE
-#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-#endif  // GTEST_HAS_TYPED_TEST
-
-// Implements type-parameterized tests.
-
-#if GTEST_HAS_TYPED_TEST_P
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Expands to the namespace name that the type-parameterized tests for
-// the given type-parameterized test suite are defined in.  The exact
-// name of the namespace is subject to change without notice.
-#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Expands to the name of the variable used to remember the names of
-// the defined tests in the given test suite.
-#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \
-  gtest_typed_test_suite_p_state_##TestSuiteName##_
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
-//
-// Expands to the name of the variable used to remember the names of
-// the registered tests in the given test suite.
-#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \
-  gtest_registered_test_names_##TestSuiteName##_
-
-// The variables defined in the type-parameterized test macros are
-// static as typically these macros are used in a .h file that can be
-// #included in multiple translation units linked together.
-#define TYPED_TEST_SUITE_P(SuiteName)              \
-  static ::testing::internal::TypedTestSuitePState \
-      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
-
-// Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define TYPED_TEST_CASE_P                                                 \
-  static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \
-  TYPED_TEST_SUITE_P
-#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-#define TYPED_TEST_P(SuiteName, TestName)                             \
-  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                       \
-    template <typename gtest_TypeParam_>                              \
-    class TestName : public SuiteName<gtest_TypeParam_> {             \
-     private:                                                         \
-      typedef SuiteName<gtest_TypeParam_> TestFixture;                \
-      typedef gtest_TypeParam_ TypeParam;                             \
-      virtual void TestBody();                                        \
-    };                                                                \
-    static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
-        GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(       \
-            __FILE__, __LINE__, #SuiteName, #TestName);               \
-  }                                                                   \
-  template <typename gtest_TypeParam_>                                \
-  void GTEST_SUITE_NAMESPACE_(                                        \
-      SuiteName)::TestName<gtest_TypeParam_>::TestBody()
-
-#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...)                            \
-  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                                \
-    typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
-  }                                                                            \
-  static const char* const GTEST_REGISTERED_TEST_NAMES_(                       \
-      SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                     \
-      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames(    \
-          __FILE__, __LINE__, #__VA_ARGS__)
-
-// Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define REGISTER_TYPED_TEST_CASE_P                                           \
-  static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \
-                "");                                                         \
-  REGISTER_TYPED_TEST_SUITE_P
-#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-// The 'Types' template argument below must have spaces around it
-// since some compilers may choke on '>>' when passing a template
-// instance (e.g. Types<int>)
-#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)       \
-  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =        \
-      ::testing::internal::TypeParameterizedTestSuite<                      \
-          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,    \
-          ::testing::internal::TypeList<Types>::type>::                     \
-          Register(#Prefix,                                                 \
-                   ::testing::internal::CodeLocation(__FILE__, __LINE__),   \
-                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), #SuiteName, \
-                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),                 \
-                   ::testing::internal::GenerateNames<                      \
-                       ::testing::internal::NameGeneratorSelector<          \
-                           __VA_ARGS__>::type,                              \
-                       ::testing::internal::TypeList<Types>::type>())
-
-// Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define INSTANTIATE_TYPED_TEST_CASE_P                                      \
-  static_assert(                                                           \
-      ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \
-  INSTANTIATE_TYPED_TEST_SUITE_P
-#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-#endif  // GTEST_HAS_TYPED_TEST_P
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/deps/googletest/include/gtest/gtest.h b/deps/googletest/include/gtest/gtest.h
deleted file mode 100644
index 76f4098bf..000000000
--- a/deps/googletest/include/gtest/gtest.h
+++ /dev/null
@@ -1,2453 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This header file defines the public API for Google Test.  It should be
-// included by any test program that uses Google Test.
-//
-// IMPORTANT NOTE: Due to limitation of the C++ language, we have to
-// leave some internal implementation details in this header file.
-// They are clearly marked by comments like this:
-//
-//   // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-//
-// Such code is NOT meant to be used by a user directly, and is subject
-// to CHANGE WITHOUT NOTICE.  Therefore DO NOT DEPEND ON IT in a user
-// program!
-//
-// Acknowledgment: Google Test borrowed the idea of automatic test
-// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
-// easyUnit framework.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_H_
-
-#include <cstddef>
-#include <limits>
-#include <memory>
-#include <ostream>
-#include <type_traits>
-#include <vector>
-
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
-#include "gtest/gtest-death-test.h"
-#include "gtest/gtest-matchers.h"
-#include "gtest/gtest-message.h"
-#include "gtest/gtest-param-test.h"
-#include "gtest/gtest-printers.h"
-#include "gtest/gtest_prod.h"
-#include "gtest/gtest-test-part.h"
-#include "gtest/gtest-typed-test.h"
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-namespace testing {
-
-// Silence C4100 (unreferenced formal parameter) and 4805
-// unsafe mix of type 'const int' and type 'const bool'
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4805)
-# pragma warning(disable:4100)
-#endif
-
-
-// Declares the flags.
-
-// This flag temporary enables the disabled tests.
-GTEST_DECLARE_bool_(also_run_disabled_tests);
-
-// This flag brings the debugger on an assertion failure.
-GTEST_DECLARE_bool_(break_on_failure);
-
-// This flag controls whether Google Test catches all test-thrown exceptions
-// and logs them as failures.
-GTEST_DECLARE_bool_(catch_exceptions);
-
-// This flag enables using colors in terminal output. Available values are
-// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
-// to let Google Test decide.
-GTEST_DECLARE_string_(color);
-
-// This flag sets up the filter to select by name using a glob pattern
-// the tests to run. If the filter is not given all tests are executed.
-GTEST_DECLARE_string_(filter);
-
-// This flag controls whether Google Test installs a signal handler that dumps
-// debugging information when fatal signals are raised.
-GTEST_DECLARE_bool_(install_failure_signal_handler);
-
-// This flag causes the Google Test to list tests. None of the tests listed
-// are actually run if the flag is provided.
-GTEST_DECLARE_bool_(list_tests);
-
-// This flag controls whether Google Test emits a detailed XML report to a file
-// in addition to its normal textual output.
-GTEST_DECLARE_string_(output);
-
-// This flags control whether Google Test prints the elapsed time for each
-// test.
-GTEST_DECLARE_bool_(print_time);
-
-// This flags control whether Google Test prints UTF8 characters as text.
-GTEST_DECLARE_bool_(print_utf8);
-
-// This flag specifies the random number seed.
-GTEST_DECLARE_int32_(random_seed);
-
-// This flag sets how many times the tests are repeated. The default value
-// is 1. If the value is -1 the tests are repeating forever.
-GTEST_DECLARE_int32_(repeat);
-
-// This flag controls whether Google Test includes Google Test internal
-// stack frames in failure stack traces.
-GTEST_DECLARE_bool_(show_internal_stack_frames);
-
-// When this flag is specified, tests' order is randomized on every iteration.
-GTEST_DECLARE_bool_(shuffle);
-
-// This flag specifies the maximum number of stack frames to be
-// printed in a failure message.
-GTEST_DECLARE_int32_(stack_trace_depth);
-
-// When this flag is specified, a failed assertion will throw an
-// exception if exceptions are enabled, or exit the program with a
-// non-zero code otherwise. For use with an external test framework.
-GTEST_DECLARE_bool_(throw_on_failure);
-
-// When this flag is set with a "host:port" string, on supported
-// platforms test results are streamed to the specified port on
-// the specified host machine.
-GTEST_DECLARE_string_(stream_result_to);
-
-#if GTEST_USE_OWN_FLAGFILE_FLAG_
-GTEST_DECLARE_string_(flagfile);
-#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
-
-// The upper limit for valid stack trace depths.
-const int kMaxStackTraceDepth = 100;
-
-namespace internal {
-
-class AssertHelper;
-class DefaultGlobalTestPartResultReporter;
-class ExecDeathTest;
-class NoExecDeathTest;
-class FinalSuccessChecker;
-class GTestFlagSaver;
-class StreamingListenerTest;
-class TestResultAccessor;
-class TestEventListenersAccessor;
-class TestEventRepeater;
-class UnitTestRecordPropertyTestHelper;
-class WindowsDeathTest;
-class FuchsiaDeathTest;
-class UnitTestImpl* GetUnitTestImpl();
-void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const std::string& message);
-
-}  // namespace internal
-
-// The friend relationship of some of these classes is cyclic.
-// If we don't forward declare them the compiler might confuse the classes
-// in friendship clauses with same named classes on the scope.
-class Test;
-class TestSuite;
-
-// Old API is still available but deprecated
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-using TestCase = TestSuite;
-#endif
-class TestInfo;
-class UnitTest;
-
-// A class for indicating whether an assertion was successful.  When
-// the assertion wasn't successful, the AssertionResult object
-// remembers a non-empty message that describes how it failed.
-//
-// To create an instance of this class, use one of the factory functions
-// (AssertionSuccess() and AssertionFailure()).
-//
-// This class is useful for two purposes:
-//   1. Defining predicate functions to be used with Boolean test assertions
-//      EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
-//   2. Defining predicate-format functions to be
-//      used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
-//
-// For example, if you define IsEven predicate:
-//
-//   testing::AssertionResult IsEven(int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess();
-//     else
-//       return testing::AssertionFailure() << n << " is odd";
-//   }
-//
-// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
-// will print the message
-//
-//   Value of: IsEven(Fib(5))
-//     Actual: false (5 is odd)
-//   Expected: true
-//
-// instead of a more opaque
-//
-//   Value of: IsEven(Fib(5))
-//     Actual: false
-//   Expected: true
-//
-// in case IsEven is a simple Boolean predicate.
-//
-// If you expect your predicate to be reused and want to support informative
-// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
-// about half as often as positive ones in our tests), supply messages for
-// both success and failure cases:
-//
-//   testing::AssertionResult IsEven(int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess() << n << " is even";
-//     else
-//       return testing::AssertionFailure() << n << " is odd";
-//   }
-//
-// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
-//
-//   Value of: IsEven(Fib(6))
-//     Actual: true (8 is even)
-//   Expected: false
-//
-// NB: Predicates that support negative Boolean assertions have reduced
-// performance in positive ones so be careful not to use them in tests
-// that have lots (tens of thousands) of positive Boolean assertions.
-//
-// To use this class with EXPECT_PRED_FORMAT assertions such as:
-//
-//   // Verifies that Foo() returns an even number.
-//   EXPECT_PRED_FORMAT1(IsEven, Foo());
-//
-// you need to define:
-//
-//   testing::AssertionResult IsEven(const char* expr, int n) {
-//     if ((n % 2) == 0)
-//       return testing::AssertionSuccess();
-//     else
-//       return testing::AssertionFailure()
-//         << "Expected: " << expr << " is even\n  Actual: it's " << n;
-//   }
-//
-// If Foo() returns 5, you will see the following message:
-//
-//   Expected: Foo() is even
-//     Actual: it's 5
-//
-class GTEST_API_ AssertionResult {
- public:
-  // Copy constructor.
-  // Used in EXPECT_TRUE/FALSE(assertion_result).
-  AssertionResult(const AssertionResult& other);
-
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
-#endif
-
-  // Used in the EXPECT_TRUE/FALSE(bool_expression).
-  //
-  // T must be contextually convertible to bool.
-  //
-  // The second parameter prevents this overload from being considered if
-  // the argument is implicitly convertible to AssertionResult. In that case
-  // we want AssertionResult's copy constructor to be used.
-  template <typename T>
-  explicit AssertionResult(
-      const T& success,
-      typename internal::EnableIf<
-          !std::is_convertible<T, AssertionResult>::value>::type*
-      /*enabler*/
-      = nullptr)
-      : success_(success) {}
-
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif
-
-  // Assignment operator.
-  AssertionResult& operator=(AssertionResult other) {
-    swap(other);
-    return *this;
-  }
-
-  // Returns true iff the assertion succeeded.
-  operator bool() const { return success_; }  // NOLINT
-
-  // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-  AssertionResult operator!() const;
-
-  // Returns the text streamed into this AssertionResult. Test assertions
-  // use it when they fail (i.e., the predicate's outcome doesn't match the
-  // assertion's expectation). When nothing has been streamed into the
-  // object, returns an empty string.
-  const char* message() const {
-    return message_.get() != nullptr ? message_->c_str() : "";
-  }
-  // Deprecated; please use message() instead.
-  const char* failure_message() const { return message(); }
-
-  // Streams a custom failure message into this object.
-  template <typename T> AssertionResult& operator<<(const T& value) {
-    AppendMessage(Message() << value);
-    return *this;
-  }
-
-  // Allows streaming basic output manipulators such as endl or flush into
-  // this object.
-  AssertionResult& operator<<(
-      ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
-    AppendMessage(Message() << basic_manipulator);
-    return *this;
-  }
-
- private:
-  // Appends the contents of message to message_.
-  void AppendMessage(const Message& a_message) {
-    if (message_.get() == nullptr) message_.reset(new ::std::string);
-    message_->append(a_message.GetString().c_str());
-  }
-
-  // Swap the contents of this AssertionResult with other.
-  void swap(AssertionResult& other);
-
-  // Stores result of the assertion predicate.
-  bool success_;
-  // Stores the message describing the condition in case the expectation
-  // construct is not satisfied with the predicate's outcome.
-  // Referenced via a pointer to avoid taking too much stack frame space
-  // with test assertions.
-  std::unique_ptr< ::std::string> message_;
-};
-
-// Makes a successful assertion result.
-GTEST_API_ AssertionResult AssertionSuccess();
-
-// Makes a failed assertion result.
-GTEST_API_ AssertionResult AssertionFailure();
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
-
-}  // namespace testing
-
-// Includes the auto-generated header that implements a family of generic
-// predicate assertion macros. This include comes late because it relies on
-// APIs declared above.
-#include "gtest/gtest_pred_impl.h"
-
-namespace testing {
-
-// The abstract class that all tests inherit from.
-//
-// In Google Test, a unit test program contains one or many TestSuites, and
-// each TestSuite contains one or many Tests.
-//
-// When you define a test using the TEST macro, you don't need to
-// explicitly derive from Test - the TEST macro automatically does
-// this for you.
-//
-// The only time you derive from Test is when defining a test fixture
-// to be used in a TEST_F.  For example:
-//
-//   class FooTest : public testing::Test {
-//    protected:
-//     void SetUp() override { ... }
-//     void TearDown() override { ... }
-//     ...
-//   };
-//
-//   TEST_F(FooTest, Bar) { ... }
-//   TEST_F(FooTest, Baz) { ... }
-//
-// Test is not copyable.
-class GTEST_API_ Test {
- public:
-  friend class TestInfo;
-
-  // The d'tor is virtual as we intend to inherit from Test.
-  virtual ~Test();
-
-  // Sets up the stuff shared by all tests in this test case.
-  //
-  // Google Test will call Foo::SetUpTestSuite() before running the first
-  // test in test case Foo.  Hence a sub-class can define its own
-  // SetUpTestSuite() method to shadow the one defined in the super
-  // class.
-  static void SetUpTestSuite() {}
-
-  // Tears down the stuff shared by all tests in this test case.
-  //
-  // Google Test will call Foo::TearDownTestSuite() after running the last
-  // test in test case Foo.  Hence a sub-class can define its own
-  // TearDownTestSuite() method to shadow the one defined in the super
-  // class.
-  static void TearDownTestSuite() {}
-
-  // Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  static void TearDownTestCase() {}
-  static void SetUpTestCase() {}
-#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  // Returns true iff the current test has a fatal failure.
-  static bool HasFatalFailure();
-
-  // Returns true iff the current test has a non-fatal failure.
-  static bool HasNonfatalFailure();
-
-  // Returns true iff the current test was skipped.
-  static bool IsSkipped();
-
-  // Returns true iff the current test has a (either fatal or
-  // non-fatal) failure.
-  static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
-
-  // Logs a property for the current test, test suite, or for the entire
-  // invocation of the test program when used outside of the context of a
-  // test suite.  Only the last value for a given key is remembered.  These
-  // are public static so they can be called from utility functions that are
-  // not members of the test fixture.  Calls to RecordProperty made during
-  // lifespan of the test (from the moment its constructor starts to the
-  // moment its destructor finishes) will be output in XML as attributes of
-  // the <testcase> element.  Properties recorded from fixture's
-  // SetUpTestSuite or TearDownTestSuite are logged as attributes of the
-  // corresponding <testsuite> element.  Calls to RecordProperty made in the
-  // global context (before or after invocation of RUN_ALL_TESTS and from
-  // SetUp/TearDown method of Environment objects registered with Google
-  // Test) will be output as attributes of the <testsuites> element.
-  static void RecordProperty(const std::string& key, const std::string& value);
-  static void RecordProperty(const std::string& key, int value);
-
- protected:
-  // Creates a Test object.
-  Test();
-
-  // Sets up the test fixture.
-  virtual void SetUp();
-
-  // Tears down the test fixture.
-  virtual void TearDown();
-
- private:
-  // Returns true iff the current test has the same fixture class as
-  // the first test in the current test suite.
-  static bool HasSameFixtureClass();
-
-  // Runs the test after the test fixture has been set up.
-  //
-  // A sub-class must implement this to define the test logic.
-  //
-  // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
-  // Instead, use the TEST or TEST_F macro.
-  virtual void TestBody() = 0;
-
-  // Sets up, executes, and tears down the test.
-  void Run();
-
-  // Deletes self.  We deliberately pick an unusual name for this
-  // internal method to avoid clashing with names used in user TESTs.
-  void DeleteSelf_() { delete this; }
-
-  const std::unique_ptr<GTEST_FLAG_SAVER_> gtest_flag_saver_;
-
-  // Often a user misspells SetUp() as Setup() and spends a long time
-  // wondering why it is never called by Google Test.  The declaration of
-  // the following method is solely for catching such an error at
-  // compile time:
-  //
-  //   - The return type is deliberately chosen to be not void, so it
-  //   will be a conflict if void Setup() is declared in the user's
-  //   test fixture.
-  //
-  //   - This method is private, so it will be another compiler error
-  //   if the method is called from the user's test fixture.
-  //
-  // DO NOT OVERRIDE THIS FUNCTION.
-  //
-  // If you see an error about overriding the following function or
-  // about it being private, you have mis-spelled SetUp() as Setup().
-  struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
-
-  // We disallow copying Tests.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
-};
-
-typedef internal::TimeInMillis TimeInMillis;
-
-// A copyable object representing a user specified test property which can be
-// output as a key/value string pair.
-//
-// Don't inherit from TestProperty as its destructor is not virtual.
-class TestProperty {
- public:
-  // C'tor.  TestProperty does NOT have a default constructor.
-  // Always use this constructor (with parameters) to create a
-  // TestProperty object.
-  TestProperty(const std::string& a_key, const std::string& a_value) :
-    key_(a_key), value_(a_value) {
-  }
-
-  // Gets the user supplied key.
-  const char* key() const {
-    return key_.c_str();
-  }
-
-  // Gets the user supplied value.
-  const char* value() const {
-    return value_.c_str();
-  }
-
-  // Sets a new value, overriding the one supplied in the constructor.
-  void SetValue(const std::string& new_value) {
-    value_ = new_value;
-  }
-
- private:
-  // The key supplied by the user.
-  std::string key_;
-  // The value supplied by the user.
-  std::string value_;
-};
-
-// The result of a single Test.  This includes a list of
-// TestPartResults, a list of TestProperties, a count of how many
-// death tests there are in the Test, and how much time it took to run
-// the Test.
-//
-// TestResult is not copyable.
-class GTEST_API_ TestResult {
- public:
-  // Creates an empty TestResult.
-  TestResult();
-
-  // D'tor.  Do not inherit from TestResult.
-  ~TestResult();
-
-  // Gets the number of all test parts.  This is the sum of the number
-  // of successful test parts and the number of failed test parts.
-  int total_part_count() const;
-
-  // Returns the number of the test properties.
-  int test_property_count() const;
-
-  // Returns true iff the test passed (i.e. no test part failed).
-  bool Passed() const { return !Skipped() && !Failed(); }
-
-  // Returns true iff the test was skipped.
-  bool Skipped() const;
-
-  // Returns true iff the test failed.
-  bool Failed() const;
-
-  // Returns true iff the test fatally failed.
-  bool HasFatalFailure() const;
-
-  // Returns true iff the test has a non-fatal failure.
-  bool HasNonfatalFailure() const;
-
-  // Returns the elapsed time, in milliseconds.
-  TimeInMillis elapsed_time() const { return elapsed_time_; }
-
-  // Returns the i-th test part result among all the results. i can range from 0
-  // to total_part_count() - 1. If i is not in that range, aborts the program.
-  const TestPartResult& GetTestPartResult(int i) const;
-
-  // Returns the i-th test property. i can range from 0 to
-  // test_property_count() - 1. If i is not in that range, aborts the
-  // program.
-  const TestProperty& GetTestProperty(int i) const;
-
- private:
-  friend class TestInfo;
-  friend class TestSuite;
-  friend class UnitTest;
-  friend class internal::DefaultGlobalTestPartResultReporter;
-  friend class internal::ExecDeathTest;
-  friend class internal::TestResultAccessor;
-  friend class internal::UnitTestImpl;
-  friend class internal::WindowsDeathTest;
-  friend class internal::FuchsiaDeathTest;
-
-  // Gets the vector of TestPartResults.
-  const std::vector<TestPartResult>& test_part_results() const {
-    return test_part_results_;
-  }
-
-  // Gets the vector of TestProperties.
-  const std::vector<TestProperty>& test_properties() const {
-    return test_properties_;
-  }
-
-  // Sets the elapsed time.
-  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
-
-  // Adds a test property to the list. The property is validated and may add
-  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
-  // key names). If a property is already recorded for the same key, the
-  // value will be updated, rather than storing multiple values for the same
-  // key.  xml_element specifies the element for which the property is being
-  // recorded and is used for validation.
-  void RecordProperty(const std::string& xml_element,
-                      const TestProperty& test_property);
-
-  // Adds a failure if the key is a reserved attribute of Google Test
-  // testsuite tags.  Returns true if the property is valid.
-  // FIXME: Validate attribute names are legal and human readable.
-  static bool ValidateTestProperty(const std::string& xml_element,
-                                   const TestProperty& test_property);
-
-  // Adds a test part result to the list.
-  void AddTestPartResult(const TestPartResult& test_part_result);
-
-  // Returns the death test count.
-  int death_test_count() const { return death_test_count_; }
-
-  // Increments the death test count, returning the new count.
-  int increment_death_test_count() { return ++death_test_count_; }
-
-  // Clears the test part results.
-  void ClearTestPartResults();
-
-  // Clears the object.
-  void Clear();
-
-  // Protects mutable state of the property vector and of owned
-  // properties, whose values may be updated.
-  internal::Mutex test_properites_mutex_;
-
-  // The vector of TestPartResults
-  std::vector<TestPartResult> test_part_results_;
-  // The vector of TestProperties
-  std::vector<TestProperty> test_properties_;
-  // Running count of death tests.
-  int death_test_count_;
-  // The elapsed time, in milliseconds.
-  TimeInMillis elapsed_time_;
-
-  // We disallow copying TestResult.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
-};  // class TestResult
-
-// A TestInfo object stores the following information about a test:
-//
-//   Test suite name
-//   Test name
-//   Whether the test should be run
-//   A function pointer that creates the test object when invoked
-//   Test result
-//
-// The constructor of TestInfo registers itself with the UnitTest
-// singleton such that the RUN_ALL_TESTS() macro knows which tests to
-// run.
-class GTEST_API_ TestInfo {
- public:
-  // Destructs a TestInfo object.  This function is not virtual, so
-  // don't inherit from TestInfo.
-  ~TestInfo();
-
-  // Returns the test suite name.
-  const char* test_suite_name() const { return test_suite_name_.c_str(); }
-
-// Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const char* test_case_name() const { return test_suite_name(); }
-#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  // Returns the test name.
-  const char* name() const { return name_.c_str(); }
-
-  // Returns the name of the parameter type, or NULL if this is not a typed
-  // or a type-parameterized test.
-  const char* type_param() const {
-    if (type_param_.get() != nullptr) return type_param_->c_str();
-    return nullptr;
-  }
-
-  // Returns the text representation of the value parameter, or NULL if this
-  // is not a value-parameterized test.
-  const char* value_param() const {
-    if (value_param_.get() != nullptr) return value_param_->c_str();
-    return nullptr;
-  }
-
-  // Returns the file name where this test is defined.
-  const char* file() const { return location_.file.c_str(); }
-
-  // Returns the line where this test is defined.
-  int line() const { return location_.line; }
-
-  // Return true if this test should not be run because it's in another shard.
-  bool is_in_another_shard() const { return is_in_another_shard_; }
-
-  // Returns true if this test should run, that is if the test is not
-  // disabled (or it is disabled but the also_run_disabled_tests flag has
-  // been specified) and its full name matches the user-specified filter.
-  //
-  // Google Test allows the user to filter the tests by their full names.
-  // The full name of a test Bar in test suite Foo is defined as
-  // "Foo.Bar".  Only the tests that match the filter will run.
-  //
-  // A filter is a colon-separated list of glob (not regex) patterns,
-  // optionally followed by a '-' and a colon-separated list of
-  // negative patterns (tests to exclude).  A test is run if it
-  // matches one of the positive patterns and does not match any of
-  // the negative patterns.
-  //
-  // For example, *A*:Foo.* is a filter that matches any string that
-  // contains the character 'A' or starts with "Foo.".
-  bool should_run() const { return should_run_; }
-
-  // Returns true iff this test will appear in the XML report.
-  bool is_reportable() const {
-    // The XML report includes tests matching the filter, excluding those
-    // run in other shards.
-    return matches_filter_ && !is_in_another_shard_;
-  }
-
-  // Returns the result of the test.
-  const TestResult* result() const { return &result_; }
-
- private:
-#if GTEST_HAS_DEATH_TEST
-  friend class internal::DefaultDeathTestFactory;
-#endif  // GTEST_HAS_DEATH_TEST
-  friend class Test;
-  friend class TestSuite;
-  friend class internal::UnitTestImpl;
-  friend class internal::StreamingListenerTest;
-  friend TestInfo* internal::MakeAndRegisterTestInfo(
-      const char* test_suite_name, const char* name, const char* type_param,
-      const char* value_param, internal::CodeLocation code_location,
-      internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc,
-      internal::TearDownTestSuiteFunc tear_down_tc,
-      internal::TestFactoryBase* factory);
-
-  // Constructs a TestInfo object. The newly constructed instance assumes
-  // ownership of the factory object.
-  TestInfo(const std::string& test_suite_name, const std::string& name,
-           const char* a_type_param,   // NULL if not a type-parameterized test
-           const char* a_value_param,  // NULL if not a value-parameterized test
-           internal::CodeLocation a_code_location,
-           internal::TypeId fixture_class_id,
-           internal::TestFactoryBase* factory);
-
-  // Increments the number of death tests encountered in this test so
-  // far.
-  int increment_death_test_count() {
-    return result_.increment_death_test_count();
-  }
-
-  // Creates the test object, runs it, records its result, and then
-  // deletes it.
-  void Run();
-
-  static void ClearTestResult(TestInfo* test_info) {
-    test_info->result_.Clear();
-  }
-
-  // These fields are immutable properties of the test.
-  const std::string test_suite_name_;    // test suite name
-  const std::string name_;               // Test name
-  // Name of the parameter type, or NULL if this is not a typed or a
-  // type-parameterized test.
-  const std::unique_ptr<const ::std::string> type_param_;
-  // Text representation of the value parameter, or NULL if this is not a
-  // value-parameterized test.
-  const std::unique_ptr<const ::std::string> value_param_;
-  internal::CodeLocation location_;
-  const internal::TypeId fixture_class_id_;   // ID of the test fixture class
-  bool should_run_;                 // True iff this test should run
-  bool is_disabled_;                // True iff this test is disabled
-  bool matches_filter_;             // True if this test matches the
-                                    // user-specified filter.
-  bool is_in_another_shard_;        // Will be run in another shard.
-  internal::TestFactoryBase* const factory_;  // The factory that creates
-                                              // the test object
-
-  // This field is mutable and needs to be reset before running the
-  // test for the second time.
-  TestResult result_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
-};
-
-// A test suite, which consists of a vector of TestInfos.
-//
-// TestSuite is not copyable.
-class GTEST_API_ TestSuite {
- public:
-  // Creates a TestSuite with the given name.
-  //
-  // TestSuite does NOT have a default constructor.  Always use this
-  // constructor to create a TestSuite object.
-  //
-  // Arguments:
-  //
-  //   name:         name of the test suite
-  //   a_type_param: the name of the test's type parameter, or NULL if
-  //                 this is not a type-parameterized test.
-  //   set_up_tc:    pointer to the function that sets up the test suite
-  //   tear_down_tc: pointer to the function that tears down the test suite
-  TestSuite(const char* name, const char* a_type_param,
-            internal::SetUpTestSuiteFunc set_up_tc,
-            internal::TearDownTestSuiteFunc tear_down_tc);
-
-  // Destructor of TestSuite.
-  virtual ~TestSuite();
-
-  // Gets the name of the TestSuite.
-  const char* name() const { return name_.c_str(); }
-
-  // Returns the name of the parameter type, or NULL if this is not a
-  // type-parameterized test suite.
-  const char* type_param() const {
-    if (type_param_.get() != nullptr) return type_param_->c_str();
-    return nullptr;
-  }
-
-  // Returns true if any test in this test suite should run.
-  bool should_run() const { return should_run_; }
-
-  // Gets the number of successful tests in this test suite.
-  int successful_test_count() const;
-
-  // Gets the number of skipped tests in this test suite.
-  int skipped_test_count() const;
-
-  // Gets the number of failed tests in this test suite.
-  int failed_test_count() const;
-
-  // Gets the number of disabled tests that will be reported in the XML report.
-  int reportable_disabled_test_count() const;
-
-  // Gets the number of disabled tests in this test suite.
-  int disabled_test_count() const;
-
-  // Gets the number of tests to be printed in the XML report.
-  int reportable_test_count() const;
-
-  // Get the number of tests in this test suite that should run.
-  int test_to_run_count() const;
-
-  // Gets the number of all tests in this test suite.
-  int total_test_count() const;
-
-  // Returns true iff the test suite passed.
-  bool Passed() const { return !Failed(); }
-
-  // Returns true iff the test suite failed.
-  bool Failed() const { return failed_test_count() > 0; }
-
-  // Returns the elapsed time, in milliseconds.
-  TimeInMillis elapsed_time() const { return elapsed_time_; }
-
-  // Returns the i-th test among all the tests. i can range from 0 to
-  // total_test_count() - 1. If i is not in that range, returns NULL.
-  const TestInfo* GetTestInfo(int i) const;
-
-  // Returns the TestResult that holds test properties recorded during
-  // execution of SetUpTestSuite and TearDownTestSuite.
-  const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
-
- private:
-  friend class Test;
-  friend class internal::UnitTestImpl;
-
-  // Gets the (mutable) vector of TestInfos in this TestSuite.
-  std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
-
-  // Gets the (immutable) vector of TestInfos in this TestSuite.
-  const std::vector<TestInfo*>& test_info_list() const {
-    return test_info_list_;
-  }
-
-  // Returns the i-th test among all the tests. i can range from 0 to
-  // total_test_count() - 1. If i is not in that range, returns NULL.
-  TestInfo* GetMutableTestInfo(int i);
-
-  // Sets the should_run member.
-  void set_should_run(bool should) { should_run_ = should; }
-
-  // Adds a TestInfo to this test suite.  Will delete the TestInfo upon
-  // destruction of the TestSuite object.
-  void AddTestInfo(TestInfo * test_info);
-
-  // Clears the results of all tests in this test suite.
-  void ClearResult();
-
-  // Clears the results of all tests in the given test suite.
-  static void ClearTestSuiteResult(TestSuite* test_suite) {
-    test_suite->ClearResult();
-  }
-
-  // Runs every test in this TestSuite.
-  void Run();
-
-  // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
-  // for catching exceptions thrown from SetUpTestSuite().
-  void RunSetUpTestSuite() {
-    if (set_up_tc_ != nullptr) {
-      (*set_up_tc_)();
-    }
-  }
-
-  // Runs TearDownTestSuite() for this TestSuite.  This wrapper is
-  // needed for catching exceptions thrown from TearDownTestSuite().
-  void RunTearDownTestSuite() {
-    if (tear_down_tc_ != nullptr) {
-      (*tear_down_tc_)();
-    }
-  }
-
-  // Returns true iff test passed.
-  static bool TestPassed(const TestInfo* test_info) {
-    return test_info->should_run() && test_info->result()->Passed();
-  }
-
-  // Returns true iff test skipped.
-  static bool TestSkipped(const TestInfo* test_info) {
-    return test_info->should_run() && test_info->result()->Skipped();
-  }
-
-  // Returns true iff test failed.
-  static bool TestFailed(const TestInfo* test_info) {
-    return test_info->should_run() && test_info->result()->Failed();
-  }
-
-  // Returns true iff the test is disabled and will be reported in the XML
-  // report.
-  static bool TestReportableDisabled(const TestInfo* test_info) {
-    return test_info->is_reportable() && test_info->is_disabled_;
-  }
-
-  // Returns true iff test is disabled.
-  static bool TestDisabled(const TestInfo* test_info) {
-    return test_info->is_disabled_;
-  }
-
-  // Returns true iff this test will appear in the XML report.
-  static bool TestReportable(const TestInfo* test_info) {
-    return test_info->is_reportable();
-  }
-
-  // Returns true if the given test should run.
-  static bool ShouldRunTest(const TestInfo* test_info) {
-    return test_info->should_run();
-  }
-
-  // Shuffles the tests in this test suite.
-  void ShuffleTests(internal::Random* random);
-
-  // Restores the test order to before the first shuffle.
-  void UnshuffleTests();
-
-  // Name of the test suite.
-  std::string name_;
-  // Name of the parameter type, or NULL if this is not a typed or a
-  // type-parameterized test.
-  const std::unique_ptr<const ::std::string> type_param_;
-  // The vector of TestInfos in their original order.  It owns the
-  // elements in the vector.
-  std::vector<TestInfo*> test_info_list_;
-  // Provides a level of indirection for the test list to allow easy
-  // shuffling and restoring the test order.  The i-th element in this
-  // vector is the index of the i-th test in the shuffled test list.
-  std::vector<int> test_indices_;
-  // Pointer to the function that sets up the test suite.
-  internal::SetUpTestSuiteFunc set_up_tc_;
-  // Pointer to the function that tears down the test suite.
-  internal::TearDownTestSuiteFunc tear_down_tc_;
-  // True iff any test in this test suite should run.
-  bool should_run_;
-  // Elapsed time, in milliseconds.
-  TimeInMillis elapsed_time_;
-  // Holds test properties recorded during execution of SetUpTestSuite and
-  // TearDownTestSuite.
-  TestResult ad_hoc_test_result_;
-
-  // We disallow copying TestSuites.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
-};
-
-// An Environment object is capable of setting up and tearing down an
-// environment.  You should subclass this to define your own
-// environment(s).
-//
-// An Environment object does the set-up and tear-down in virtual
-// methods SetUp() and TearDown() instead of the constructor and the
-// destructor, as:
-//
-//   1. You cannot safely throw from a destructor.  This is a problem
-//      as in some cases Google Test is used where exceptions are enabled, and
-//      we may want to implement ASSERT_* using exceptions where they are
-//      available.
-//   2. You cannot use ASSERT_* directly in a constructor or
-//      destructor.
-class Environment {
- public:
-  // The d'tor is virtual as we need to subclass Environment.
-  virtual ~Environment() {}
-
-  // Override this to define how to set up the environment.
-  virtual void SetUp() {}
-
-  // Override this to define how to tear down the environment.
-  virtual void TearDown() {}
- private:
-  // If you see an error about overriding the following function or
-  // about it being private, you have mis-spelled SetUp() as Setup().
-  struct Setup_should_be_spelled_SetUp {};
-  virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
-};
-
-#if GTEST_HAS_EXCEPTIONS
-
-// Exception which can be thrown from TestEventListener::OnTestPartResult.
-class GTEST_API_ AssertionException
-    : public internal::GoogleTestFailureException {
- public:
-  explicit AssertionException(const TestPartResult& result)
-      : GoogleTestFailureException(result) {}
-};
-
-#endif  // GTEST_HAS_EXCEPTIONS
-
-// The interface for tracing execution of tests. The methods are organized in
-// the order the corresponding events are fired.
-class TestEventListener {
- public:
-  virtual ~TestEventListener() {}
-
-  // Fired before any test activity starts.
-  virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
-
-  // Fired before each iteration of tests starts.  There may be more than
-  // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
-  // index, starting from 0.
-  virtual void OnTestIterationStart(const UnitTest& unit_test,
-                                    int iteration) = 0;
-
-  // Fired before environment set-up for each iteration of tests starts.
-  virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
-
-  // Fired after environment set-up for each iteration of tests ends.
-  virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
-
-  // Fired before the test suite starts.
-  virtual void OnTestSuiteStart(const TestSuite& /*test_suite*/) {}
-
-  //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  // Fired before the test starts.
-  virtual void OnTestStart(const TestInfo& test_info) = 0;
-
-  // Fired after a failed assertion or a SUCCEED() invocation.
-  // If you want to throw an exception from this function to skip to the next
-  // TEST, it must be AssertionException defined above, or inherited from it.
-  virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
-
-  // Fired after the test ends.
-  virtual void OnTestEnd(const TestInfo& test_info) = 0;
-
-  // Fired after the test suite ends.
-  virtual void OnTestSuiteEnd(const TestSuite& /*test_suite*/) {}
-
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  // Fired before environment tear-down for each iteration of tests starts.
-  virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
-
-  // Fired after environment tear-down for each iteration of tests ends.
-  virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
-
-  // Fired after each iteration of tests finishes.
-  virtual void OnTestIterationEnd(const UnitTest& unit_test,
-                                  int iteration) = 0;
-
-  // Fired after all test activities have ended.
-  virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
-};
-
-// The convenience class for users who need to override just one or two
-// methods and are not concerned that a possible change to a signature of
-// the methods they override will not be caught during the build.  For
-// comments about each method please see the definition of TestEventListener
-// above.
-class EmptyTestEventListener : public TestEventListener {
- public:
-  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
-  void OnTestIterationStart(const UnitTest& /*unit_test*/,
-                            int /*iteration*/) override {}
-  void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
-  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
-  void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseStart(const TestCase& /*test_case*/) override {}
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  void OnTestStart(const TestInfo& /*test_info*/) override {}
-  void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
-  void OnTestEnd(const TestInfo& /*test_info*/) override {}
-  void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
-  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
-  void OnTestIterationEnd(const UnitTest& /*unit_test*/,
-                          int /*iteration*/) override {}
-  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
-};
-
-// TestEventListeners lets users add listeners to track events in Google Test.
-class GTEST_API_ TestEventListeners {
- public:
-  TestEventListeners();
-  ~TestEventListeners();
-
-  // Appends an event listener to the end of the list. Google Test assumes
-  // the ownership of the listener (i.e. it will delete the listener when
-  // the test program finishes).
-  void Append(TestEventListener* listener);
-
-  // Removes the given event listener from the list and returns it.  It then
-  // becomes the caller's responsibility to delete the listener. Returns
-  // NULL if the listener is not found in the list.
-  TestEventListener* Release(TestEventListener* listener);
-
-  // Returns the standard listener responsible for the default console
-  // output.  Can be removed from the listeners list to shut down default
-  // console output.  Note that removing this object from the listener list
-  // with Release transfers its ownership to the caller and makes this
-  // function return NULL the next time.
-  TestEventListener* default_result_printer() const {
-    return default_result_printer_;
-  }
-
-  // Returns the standard listener responsible for the default XML output
-  // controlled by the --gtest_output=xml flag.  Can be removed from the
-  // listeners list by users who want to shut down the default XML output
-  // controlled by this flag and substitute it with custom one.  Note that
-  // removing this object from the listener list with Release transfers its
-  // ownership to the caller and makes this function return NULL the next
-  // time.
-  TestEventListener* default_xml_generator() const {
-    return default_xml_generator_;
-  }
-
- private:
-  friend class TestSuite;
-  friend class TestInfo;
-  friend class internal::DefaultGlobalTestPartResultReporter;
-  friend class internal::NoExecDeathTest;
-  friend class internal::TestEventListenersAccessor;
-  friend class internal::UnitTestImpl;
-
-  // Returns repeater that broadcasts the TestEventListener events to all
-  // subscribers.
-  TestEventListener* repeater();
-
-  // Sets the default_result_printer attribute to the provided listener.
-  // The listener is also added to the listener list and previous
-  // default_result_printer is removed from it and deleted. The listener can
-  // also be NULL in which case it will not be added to the list. Does
-  // nothing if the previous and the current listener objects are the same.
-  void SetDefaultResultPrinter(TestEventListener* listener);
-
-  // Sets the default_xml_generator attribute to the provided listener.  The
-  // listener is also added to the listener list and previous
-  // default_xml_generator is removed from it and deleted. The listener can
-  // also be NULL in which case it will not be added to the list. Does
-  // nothing if the previous and the current listener objects are the same.
-  void SetDefaultXmlGenerator(TestEventListener* listener);
-
-  // Controls whether events will be forwarded by the repeater to the
-  // listeners in the list.
-  bool EventForwardingEnabled() const;
-  void SuppressEventForwarding();
-
-  // The actual list of listeners.
-  internal::TestEventRepeater* repeater_;
-  // Listener responsible for the standard result output.
-  TestEventListener* default_result_printer_;
-  // Listener responsible for the creation of the XML output file.
-  TestEventListener* default_xml_generator_;
-
-  // We disallow copying TestEventListeners.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
-};
-
-// A UnitTest consists of a vector of TestSuites.
-//
-// This is a singleton class.  The only instance of UnitTest is
-// created when UnitTest::GetInstance() is first called.  This
-// instance is never deleted.
-//
-// UnitTest is not copyable.
-//
-// This class is thread-safe as long as the methods are called
-// according to their specification.
-class GTEST_API_ UnitTest {
- public:
-  // Gets the singleton UnitTest object.  The first time this method
-  // is called, a UnitTest object is constructed and returned.
-  // Consecutive calls will return the same object.
-  static UnitTest* GetInstance();
-
-  // Runs all tests in this UnitTest object and prints the result.
-  // Returns 0 if successful, or 1 otherwise.
-  //
-  // This method can only be called from the main thread.
-  //
-  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-  int Run() GTEST_MUST_USE_RESULT_;
-
-  // Returns the working directory when the first TEST() or TEST_F()
-  // was executed.  The UnitTest object owns the string.
-  const char* original_working_dir() const;
-
-  // Returns the TestSuite object for the test that's currently running,
-  // or NULL if no test is running.
-  const TestSuite* current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
-
-// Legacy API is still available but deprecated
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const TestCase* current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
-#endif
-
-  // Returns the TestInfo object for the test that's currently running,
-  // or NULL if no test is running.
-  const TestInfo* current_test_info() const
-      GTEST_LOCK_EXCLUDED_(mutex_);
-
-  // Returns the random seed used at the start of the current test run.
-  int random_seed() const;
-
-  // Returns the ParameterizedTestSuiteRegistry object used to keep track of
-  // value-parameterized tests and instantiate and register them.
-  //
-  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-  internal::ParameterizedTestSuiteRegistry& parameterized_test_registry()
-      GTEST_LOCK_EXCLUDED_(mutex_);
-
-  // Gets the number of successful test suites.
-  int successful_test_suite_count() const;
-
-  // Gets the number of failed test suites.
-  int failed_test_suite_count() const;
-
-  // Gets the number of all test suites.
-  int total_test_suite_count() const;
-
-  // Gets the number of all test suites that contain at least one test
-  // that should run.
-  int test_suite_to_run_count() const;
-
-  //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  int successful_test_case_count() const;
-  int failed_test_case_count() const;
-  int total_test_case_count() const;
-  int test_case_to_run_count() const;
-#endif  //  EMOVE_LEGACY_TEST_CASEAPI
-
-  // Gets the number of successful tests.
-  int successful_test_count() const;
-
-  // Gets the number of skipped tests.
-  int skipped_test_count() const;
-
-  // Gets the number of failed tests.
-  int failed_test_count() const;
-
-  // Gets the number of disabled tests that will be reported in the XML report.
-  int reportable_disabled_test_count() const;
-
-  // Gets the number of disabled tests.
-  int disabled_test_count() const;
-
-  // Gets the number of tests to be printed in the XML report.
-  int reportable_test_count() const;
-
-  // Gets the number of all tests.
-  int total_test_count() const;
-
-  // Gets the number of tests that should run.
-  int test_to_run_count() const;
-
-  // Gets the time of the test program start, in ms from the start of the
-  // UNIX epoch.
-  TimeInMillis start_timestamp() const;
-
-  // Gets the elapsed time, in milliseconds.
-  TimeInMillis elapsed_time() const;
-
-  // Returns true iff the unit test passed (i.e. all test suites passed).
-  bool Passed() const;
-
-  // Returns true iff the unit test failed (i.e. some test suite failed
-  // or something outside of all tests failed).
-  bool Failed() const;
-
-  // Gets the i-th test suite among all the test suites. i can range from 0 to
-  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  const TestSuite* GetTestSuite(int i) const;
-
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const TestCase* GetTestCase(int i) const;
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  // Returns the TestResult containing information on test failures and
-  // properties logged outside of individual test suites.
-  const TestResult& ad_hoc_test_result() const;
-
-  // Returns the list of event listeners that can be used to track events
-  // inside Google Test.
-  TestEventListeners& listeners();
-
- private:
-  // Registers and returns a global test environment.  When a test
-  // program is run, all global test environments will be set-up in
-  // the order they were registered.  After all tests in the program
-  // have finished, all global test environments will be torn-down in
-  // the *reverse* order they were registered.
-  //
-  // The UnitTest object takes ownership of the given environment.
-  //
-  // This method can only be called from the main thread.
-  Environment* AddEnvironment(Environment* env);
-
-  // Adds a TestPartResult to the current TestResult object.  All
-  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
-  // eventually call this to report their results.  The user code
-  // should use the assertion macros instead of calling this directly.
-  void AddTestPartResult(TestPartResult::Type result_type,
-                         const char* file_name,
-                         int line_number,
-                         const std::string& message,
-                         const std::string& os_stack_trace)
-      GTEST_LOCK_EXCLUDED_(mutex_);
-
-  // Adds a TestProperty to the current TestResult object when invoked from
-  // inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
-  // from SetUpTestSuite or TearDownTestSuite, or to the global property set
-  // when invoked elsewhere.  If the result already contains a property with
-  // the same key, the value will be updated.
-  void RecordProperty(const std::string& key, const std::string& value);
-
-  // Gets the i-th test suite among all the test suites. i can range from 0 to
-  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  TestSuite* GetMutableTestSuite(int i);
-
-  // Accessors for the implementation object.
-  internal::UnitTestImpl* impl() { return impl_; }
-  const internal::UnitTestImpl* impl() const { return impl_; }
-
-  // These classes and functions are friends as they need to access private
-  // members of UnitTest.
-  friend class ScopedTrace;
-  friend class Test;
-  friend class internal::AssertHelper;
-  friend class internal::StreamingListenerTest;
-  friend class internal::UnitTestRecordPropertyTestHelper;
-  friend Environment* AddGlobalTestEnvironment(Environment* env);
-  friend internal::UnitTestImpl* internal::GetUnitTestImpl();
-  friend void internal::ReportFailureInUnknownLocation(
-      TestPartResult::Type result_type,
-      const std::string& message);
-
-  // Creates an empty UnitTest.
-  UnitTest();
-
-  // D'tor
-  virtual ~UnitTest();
-
-  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
-  // Google Test trace stack.
-  void PushGTestTrace(const internal::TraceInfo& trace)
-      GTEST_LOCK_EXCLUDED_(mutex_);
-
-  // Pops a trace from the per-thread Google Test trace stack.
-  void PopGTestTrace()
-      GTEST_LOCK_EXCLUDED_(mutex_);
-
-  // Protects mutable state in *impl_.  This is mutable as some const
-  // methods need to lock it too.
-  mutable internal::Mutex mutex_;
-
-  // Opaque implementation object.  This field is never changed once
-  // the object is constructed.  We don't mark it as const here, as
-  // doing so will cause a warning in the constructor of UnitTest.
-  // Mutable state in *impl_ is protected by mutex_.
-  internal::UnitTestImpl* impl_;
-
-  // We disallow copying UnitTest.
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
-};
-
-// A convenient wrapper for adding an environment for the test
-// program.
-//
-// You should call this before RUN_ALL_TESTS() is called, probably in
-// main().  If you use gtest_main, you need to call this before main()
-// starts for it to take effect.  For example, you can define a global
-// variable like this:
-//
-//   testing::Environment* const foo_env =
-//       testing::AddGlobalTestEnvironment(new FooEnvironment);
-//
-// However, we strongly recommend you to write your own main() and
-// call AddGlobalTestEnvironment() there, as relying on initialization
-// of global variables makes the code harder to read and may cause
-// problems when you register multiple environments from different
-// translation units and the environments have dependencies among them
-// (remember that the compiler doesn't guarantee the order in which
-// global variables from different translation units are initialized).
-inline Environment* AddGlobalTestEnvironment(Environment* env) {
-  return UnitTest::GetInstance()->AddEnvironment(env);
-}
-
-// Initializes Google Test.  This must be called before calling
-// RUN_ALL_TESTS().  In particular, it parses a command line for the
-// flags that Google Test recognizes.  Whenever a Google Test flag is
-// seen, it is removed from argv, and *argc is decremented.
-//
-// No value is returned.  Instead, the Google Test flag variables are
-// updated.
-//
-// Calling the function for the second time has no user-visible effect.
-GTEST_API_ void InitGoogleTest(int* argc, char** argv);
-
-// This overloaded version can be used in Windows programs compiled in
-// UNICODE mode.
-GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
-
-// This overloaded version can be used on Arduino/embedded platforms where
-// there is no argc/argv.
-GTEST_API_ void InitGoogleTest();
-
-namespace internal {
-
-// Separate the error generating code from the code path to reduce the stack
-// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers
-// when calling EXPECT_* in a tight loop.
-template <typename T1, typename T2>
-AssertionResult CmpHelperEQFailure(const char* lhs_expression,
-                                   const char* rhs_expression,
-                                   const T1& lhs, const T2& rhs) {
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
-}
-
-// This block of code defines operator==/!=
-// to block lexical scope lookup.
-// It prevents using invalid operator==/!= defined at namespace scope.
-struct faketype {};
-inline bool operator==(faketype, faketype) { return true; }
-inline bool operator!=(faketype, faketype) { return false; }
-
-// The helper function for {ASSERT|EXPECT}_EQ.
-template <typename T1, typename T2>
-AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            const T1& lhs,
-                            const T2& rhs) {
-  if (lhs == rhs) {
-    return AssertionSuccess();
-  }
-
-  return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
-}
-
-// With this overloaded version, we allow anonymous enums to be used
-// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
-// can be implicitly cast to BiggestInt.
-GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression,
-                                       const char* rhs_expression,
-                                       BiggestInt lhs,
-                                       BiggestInt rhs);
-
-class EqHelper {
- public:
-  // This templatized version is for the general case.
-  template <
-      typename T1, typename T2,
-      // Disable this overload for cases where one argument is a pointer
-      // and the other is the null pointer constant.
-      typename std::enable_if<!std::is_integral<T1>::value ||
-                              !std::is_pointer<T2>::value>::type* = nullptr>
-  static AssertionResult Compare(const char* lhs_expression,
-                                 const char* rhs_expression, const T1& lhs,
-                                 const T2& rhs) {
-    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
-  }
-
-  // With this overloaded version, we allow anonymous enums to be used
-  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
-  // enums can be implicitly cast to BiggestInt.
-  //
-  // Even though its body looks the same as the above version, we
-  // cannot merge the two, as it will make anonymous enums unhappy.
-  static AssertionResult Compare(const char* lhs_expression,
-                                 const char* rhs_expression,
-                                 BiggestInt lhs,
-                                 BiggestInt rhs) {
-    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
-  }
-
-  template <typename T>
-  static AssertionResult Compare(
-      const char* lhs_expression, const char* rhs_expression,
-      // Handle cases where '0' is used as a null pointer literal.
-      std::nullptr_t /* lhs */, T* rhs) {
-    // We already know that 'lhs' is a null pointer.
-    return CmpHelperEQ(lhs_expression, rhs_expression, static_cast<T*>(nullptr),
-                       rhs);
-  }
-};
-
-// Separate the error generating code from the code path to reduce the stack
-// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers
-// when calling EXPECT_OP in a tight loop.
-template <typename T1, typename T2>
-AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
-                                   const T1& val1, const T2& val2,
-                                   const char* op) {
-  return AssertionFailure()
-         << "Expected: (" << expr1 << ") " << op << " (" << expr2
-         << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
-         << " vs " << FormatForComparisonFailureMessage(val2, val1);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
-// of similar code.
-//
-// For each templatized helper function, we also define an overloaded
-// version for BiggestInt in order to reduce code bloat and allow
-// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
-// with gcc 4.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-template <typename T1, typename T2>\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   const T1& val1, const T2& val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
-  }\
-}\
-GTEST_API_ AssertionResult CmpHelper##op_name(\
-    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-
-// Implements the helper function for {ASSERT|EXPECT}_NE
-GTEST_IMPL_CMP_HELPER_(NE, !=);
-// Implements the helper function for {ASSERT|EXPECT}_LE
-GTEST_IMPL_CMP_HELPER_(LE, <=);
-// Implements the helper function for {ASSERT|EXPECT}_LT
-GTEST_IMPL_CMP_HELPER_(LT, <);
-// Implements the helper function for {ASSERT|EXPECT}_GE
-GTEST_IMPL_CMP_HELPER_(GE, >=);
-// Implements the helper function for {ASSERT|EXPECT}_GT
-GTEST_IMPL_CMP_HELPER_(GT, >);
-
-#undef GTEST_IMPL_CMP_HELPER_
-
-// The helper function for {ASSERT|EXPECT}_STREQ.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const char* s1,
-                                          const char* s2);
-
-// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
-                                              const char* s2_expression,
-                                              const char* s1,
-                                              const char* s2);
-
-// The helper function for {ASSERT|EXPECT}_STRNE.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const char* s1,
-                                          const char* s2);
-
-// The helper function for {ASSERT|EXPECT}_STRCASENE.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                              const char* s2_expression,
-                                              const char* s1,
-                                              const char* s2);
-
-
-// Helper function for *_STREQ on wide strings.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const wchar_t* s1,
-                                          const wchar_t* s2);
-
-// Helper function for *_STRNE on wide strings.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                                          const char* s2_expression,
-                                          const wchar_t* s1,
-                                          const wchar_t* s2);
-
-}  // namespace internal
-
-// IsSubstring() and IsNotSubstring() are intended to be used as the
-// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
-// themselves.  They check whether needle is a substring of haystack
-// (NULL is considered a substring of itself only), and return an
-// appropriate error message when they fail.
-//
-// The {needle,haystack}_expr arguments are the stringified
-// expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack);
-
-#if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack);
-#endif  // GTEST_HAS_STD_WSTRING
-
-namespace internal {
-
-// Helper template function for comparing floating-points.
-//
-// Template parameter:
-//
-//   RawType: the raw floating-point type (either float or double)
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-template <typename RawType>
-AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
-                                         const char* rhs_expression,
-                                         RawType lhs_value,
-                                         RawType rhs_value) {
-  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
-
-  if (lhs.AlmostEquals(rhs)) {
-    return AssertionSuccess();
-  }
-
-  ::std::stringstream lhs_ss;
-  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-         << lhs_value;
-
-  ::std::stringstream rhs_ss;
-  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-         << rhs_value;
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   StringStreamToString(&lhs_ss),
-                   StringStreamToString(&rhs_ss),
-                   false);
-}
-
-// Helper function for implementing ASSERT_NEAR.
-//
-// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
-                                                const char* expr2,
-                                                const char* abs_error_expr,
-                                                double val1,
-                                                double val2,
-                                                double abs_error);
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-// A class that enables one to stream messages to assertion macros
-class GTEST_API_ AssertHelper {
- public:
-  // Constructor.
-  AssertHelper(TestPartResult::Type type,
-               const char* file,
-               int line,
-               const char* message);
-  ~AssertHelper();
-
-  // Message assignment is a semantic trick to enable assertion
-  // streaming; see the GTEST_MESSAGE_ macro below.
-  void operator=(const Message& message) const;
-
- private:
-  // We put our data in a struct so that the size of the AssertHelper class can
-  // be as small as possible.  This is important because gcc is incapable of
-  // re-using stack space even for temporary variables, so every EXPECT_EQ
-  // reserves stack space for another AssertHelper.
-  struct AssertHelperData {
-    AssertHelperData(TestPartResult::Type t,
-                     const char* srcfile,
-                     int line_num,
-                     const char* msg)
-        : type(t), file(srcfile), line(line_num), message(msg) { }
-
-    TestPartResult::Type const type;
-    const char* const file;
-    int const line;
-    std::string const message;
-
-   private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
-  };
-
-  AssertHelperData* const data_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
-};
-
-enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW };
-
-GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color,
-                                                            const char* fmt,
-                                                            ...);
-
-}  // namespace internal
-
-// The pure interface class that all value-parameterized tests inherit from.
-// A value-parameterized class must inherit from both ::testing::Test and
-// ::testing::WithParamInterface. In most cases that just means inheriting
-// from ::testing::TestWithParam, but more complicated test hierarchies
-// may need to inherit from Test and WithParamInterface at different levels.
-//
-// This interface has support for accessing the test parameter value via
-// the GetParam() method.
-//
-// Use it with one of the parameter generator defining functions, like Range(),
-// Values(), ValuesIn(), Bool(), and Combine().
-//
-// class FooTest : public ::testing::TestWithParam<int> {
-//  protected:
-//   FooTest() {
-//     // Can use GetParam() here.
-//   }
-//   ~FooTest() override {
-//     // Can use GetParam() here.
-//   }
-//   void SetUp() override {
-//     // Can use GetParam() here.
-//   }
-//   void TearDown override {
-//     // Can use GetParam() here.
-//   }
-// };
-// TEST_P(FooTest, DoesBar) {
-//   // Can use GetParam() method here.
-//   Foo foo;
-//   ASSERT_TRUE(foo.DoesBar(GetParam()));
-// }
-// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
-
-template <typename T>
-class WithParamInterface {
- public:
-  typedef T ParamType;
-  virtual ~WithParamInterface() {}
-
-  // The current parameter value. Is also available in the test fixture's
-  // constructor.
-  static const ParamType& GetParam() {
-    GTEST_CHECK_(parameter_ != nullptr)
-        << "GetParam() can only be called inside a value-parameterized test "
-        << "-- did you intend to write TEST_P instead of TEST_F?";
-    return *parameter_;
-  }
-
- private:
-  // Sets parameter value. The caller is responsible for making sure the value
-  // remains alive and unchanged throughout the current test.
-  static void SetParam(const ParamType* parameter) {
-    parameter_ = parameter;
-  }
-
-  // Static value used for accessing parameter during a test lifetime.
-  static const ParamType* parameter_;
-
-  // TestClass must be a subclass of WithParamInterface<T> and Test.
-  template <class TestClass> friend class internal::ParameterizedTestFactory;
-};
-
-template <typename T>
-const T* WithParamInterface<T>::parameter_ = nullptr;
-
-// Most value-parameterized classes can ignore the existence of
-// WithParamInterface, and can just inherit from ::testing::TestWithParam.
-
-template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
-
-// Macros for indicating success/failure in test code.
-
-// Skips test in runtime.
-// Skipping test aborts current function.
-// Skipped tests are neither successful nor failed.
-#define GTEST_SKIP() GTEST_SKIP_("Skipped")
-
-// ADD_FAILURE unconditionally adds a failure to the current test.
-// SUCCEED generates a success - it doesn't automatically make the
-// current test successful, as a test is only successful when it has
-// no failure.
-//
-// EXPECT_* verifies that a certain condition is satisfied.  If not,
-// it behaves like ADD_FAILURE.  In particular:
-//
-//   EXPECT_TRUE  verifies that a Boolean condition is true.
-//   EXPECT_FALSE verifies that a Boolean condition is false.
-//
-// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
-// that they will also abort the current function on failure.  People
-// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
-// writing data-driven tests often find themselves using ADD_FAILURE
-// and EXPECT_* more.
-
-// Generates a nonfatal failure with a generic message.
-#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
-
-// Generates a nonfatal failure at the given source file location with
-// a generic message.
-#define ADD_FAILURE_AT(file, line) \
-  GTEST_MESSAGE_AT_(file, line, "Failed", \
-                    ::testing::TestPartResult::kNonFatalFailure)
-
-// Generates a fatal failure with a generic message.
-#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
-
-// Define this macro to 1 to omit the definition of FAIL(), which is a
-// generic name and clashes with some other libraries.
-#if !GTEST_DONT_DEFINE_FAIL
-# define FAIL() GTEST_FAIL()
-#endif
-
-// Generates a success with a generic message.
-#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
-
-// Define this macro to 1 to omit the definition of SUCCEED(), which
-// is a generic name and clashes with some other libraries.
-#if !GTEST_DONT_DEFINE_SUCCEED
-# define SUCCEED() GTEST_SUCCEED()
-#endif
-
-// Macros for testing exceptions.
-//
-//    * {ASSERT|EXPECT}_THROW(statement, expected_exception):
-//         Tests that the statement throws the expected exception.
-//    * {ASSERT|EXPECT}_NO_THROW(statement):
-//         Tests that the statement doesn't throw any exception.
-//    * {ASSERT|EXPECT}_ANY_THROW(statement):
-//         Tests that the statement throws an exception.
-
-#define EXPECT_THROW(statement, expected_exception) \
-  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_NO_THROW(statement) \
-  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_ANY_THROW(statement) \
-  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_THROW(statement, expected_exception) \
-  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
-#define ASSERT_NO_THROW(statement) \
-  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
-#define ASSERT_ANY_THROW(statement) \
-  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
-
-// Boolean assertions. Condition can be either a Boolean expression or an
-// AssertionResult. For more information on how to use AssertionResult with
-// these macros see comments on that class.
-#define EXPECT_TRUE(condition) \
-  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
-                      GTEST_NONFATAL_FAILURE_)
-#define EXPECT_FALSE(condition) \
-  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
-                      GTEST_NONFATAL_FAILURE_)
-#define ASSERT_TRUE(condition) \
-  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
-                      GTEST_FATAL_FAILURE_)
-#define ASSERT_FALSE(condition) \
-  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
-                      GTEST_FATAL_FAILURE_)
-
-// Macros for testing equalities and inequalities.
-//
-//    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
-//    * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
-//    * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
-//    * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
-//    * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
-//    * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
-//
-// When they are not, Google Test prints both the tested expressions and
-// their actual values.  The values must be compatible built-in types,
-// or you will get a compiler error.  By "compatible" we mean that the
-// values can be compared by the respective operator.
-//
-// Note:
-//
-//   1. It is possible to make a user-defined type work with
-//   {ASSERT|EXPECT}_??(), but that requires overloading the
-//   comparison operators and is thus discouraged by the Google C++
-//   Usage Guide.  Therefore, you are advised to use the
-//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
-//   equal.
-//
-//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
-//   pointers (in particular, C strings).  Therefore, if you use it
-//   with two C strings, you are testing how their locations in memory
-//   are related, not how their content is related.  To compare two C
-//   strings by content, use {ASSERT|EXPECT}_STR*().
-//
-//   3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
-//   {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
-//   what the actual value is when it fails, and similarly for the
-//   other comparisons.
-//
-//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
-//   evaluate their arguments, which is undefined.
-//
-//   5. These macros evaluate their arguments exactly once.
-//
-// Examples:
-//
-//   EXPECT_NE(Foo(), 5);
-//   EXPECT_EQ(a_pointer, NULL);
-//   ASSERT_LT(i, array_size);
-//   ASSERT_GT(records.size(), 0) << "There is no record left.";
-
-#define EXPECT_EQ(val1, val2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
-#define EXPECT_NE(val1, val2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
-#define EXPECT_LE(val1, val2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
-#define EXPECT_LT(val1, val2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
-#define EXPECT_GE(val1, val2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
-#define EXPECT_GT(val1, val2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
-
-#define GTEST_ASSERT_EQ(val1, val2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
-#define GTEST_ASSERT_NE(val1, val2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
-#define GTEST_ASSERT_LE(val1, val2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
-#define GTEST_ASSERT_LT(val1, val2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
-#define GTEST_ASSERT_GE(val1, val2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
-#define GTEST_ASSERT_GT(val1, val2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
-
-// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
-// ASSERT_XY(), which clashes with some users' own code.
-
-#if !GTEST_DONT_DEFINE_ASSERT_EQ
-# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
-#endif
-
-#if !GTEST_DONT_DEFINE_ASSERT_NE
-# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
-#endif
-
-#if !GTEST_DONT_DEFINE_ASSERT_LE
-# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
-#endif
-
-#if !GTEST_DONT_DEFINE_ASSERT_LT
-# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
-#endif
-
-#if !GTEST_DONT_DEFINE_ASSERT_GE
-# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
-#endif
-
-#if !GTEST_DONT_DEFINE_ASSERT_GT
-# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
-#endif
-
-// C-string Comparisons.  All tests treat NULL and any non-NULL string
-// as different.  Two NULLs are equal.
-//
-//    * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
-//    * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
-//    * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
-//    * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
-//
-// For wide or narrow string objects, you can use the
-// {ASSERT|EXPECT}_??() macros.
-//
-// Don't depend on the order in which the arguments are evaluated,
-// which is undefined.
-//
-// These macros evaluate their arguments exactly once.
-
-#define EXPECT_STREQ(s1, s2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
-#define EXPECT_STRNE(s1, s2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
-#define EXPECT_STRCASEEQ(s1, s2) \
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define EXPECT_STRCASENE(s1, s2)\
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
-
-#define ASSERT_STREQ(s1, s2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
-#define ASSERT_STRNE(s1, s2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
-#define ASSERT_STRCASEEQ(s1, s2) \
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define ASSERT_STRCASENE(s1, s2)\
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
-
-// Macros for comparing floating-point numbers.
-//
-//    * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
-//         Tests that two float values are almost equal.
-//    * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
-//         Tests that two double values are almost equal.
-//    * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
-//         Tests that v1 and v2 are within the given distance to each other.
-//
-// Google Test uses ULP-based comparison to automatically pick a default
-// error bound that is appropriate for the operands.  See the
-// FloatingPoint template class in gtest-internal.h if you are
-// interested in the implementation details.
-
-#define EXPECT_FLOAT_EQ(val1, val2)\
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
-                      val1, val2)
-
-#define EXPECT_DOUBLE_EQ(val1, val2)\
-  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
-                      val1, val2)
-
-#define ASSERT_FLOAT_EQ(val1, val2)\
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
-                      val1, val2)
-
-#define ASSERT_DOUBLE_EQ(val1, val2)\
-  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
-                      val1, val2)
-
-#define EXPECT_NEAR(val1, val2, abs_error)\
-  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
-
-#define ASSERT_NEAR(val1, val2, abs_error)\
-  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
-                      val1, val2, abs_error)
-
-// These predicate format functions work on floating-point values, and
-// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
-//
-//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
-                                   float val1, float val2);
-GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                                    double val1, double val2);
-
-
-#if GTEST_OS_WINDOWS
-
-// Macros that test for HRESULT failure and success, these are only useful
-// on Windows, and rely on Windows SDK macros and APIs to compile.
-//
-//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
-//
-// When expr unexpectedly fails or succeeds, Google Test prints the
-// expected result and the actual result with both a human-readable
-// string representation of the error, if available, as well as the
-// hex result code.
-# define EXPECT_HRESULT_SUCCEEDED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-
-# define ASSERT_HRESULT_SUCCEEDED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-
-# define EXPECT_HRESULT_FAILED(expr) \
-    EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
-
-# define ASSERT_HRESULT_FAILED(expr) \
-    ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
-
-#endif  // GTEST_OS_WINDOWS
-
-// Macros that execute statement and check that it doesn't generate new fatal
-// failures in the current thread.
-//
-//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
-//
-// Examples:
-//
-//   EXPECT_NO_FATAL_FAILURE(Process());
-//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
-//
-#define ASSERT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
-#define EXPECT_NO_FATAL_FAILURE(statement) \
-    GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
-
-// Causes a trace (including the given source file path and line number,
-// and the given message) to be included in every test failure message generated
-// by code in the scope of the lifetime of an instance of this class. The effect
-// is undone with the destruction of the instance.
-//
-// The message argument can be anything streamable to std::ostream.
-//
-// Example:
-//   testing::ScopedTrace trace("file.cc", 123, "message");
-//
-class GTEST_API_ ScopedTrace {
- public:
-  // The c'tor pushes the given source file location and message onto
-  // a trace stack maintained by Google Test.
-
-  // Template version. Uses Message() to convert the values into strings.
-  // Slow, but flexible.
-  template <typename T>
-  ScopedTrace(const char* file, int line, const T& message) {
-    PushTrace(file, line, (Message() << message).GetString());
-  }
-
-  // Optimize for some known types.
-  ScopedTrace(const char* file, int line, const char* message) {
-    PushTrace(file, line, message ? message : "(null)");
-  }
-
-  ScopedTrace(const char* file, int line, const std::string& message) {
-    PushTrace(file, line, message);
-  }
-
-  // The d'tor pops the info pushed by the c'tor.
-  //
-  // Note that the d'tor is not virtual in order to be efficient.
-  // Don't inherit from ScopedTrace!
-  ~ScopedTrace();
-
- private:
-  void PushTrace(const char* file, int line, std::string message);
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
-} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
-                            // c'tor and d'tor.  Therefore it doesn't
-                            // need to be used otherwise.
-
-// Causes a trace (including the source file path, the current line
-// number, and the given message) to be included in every test failure
-// message generated by code in the current scope.  The effect is
-// undone when the control leaves the current scope.
-//
-// The message argument can be anything streamable to std::ostream.
-//
-// In the implementation, we include the current line number as part
-// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
-// to appear in the same block - as long as they are on different
-// lines.
-//
-// Assuming that each thread maintains its own stack of traces.
-// Therefore, a SCOPED_TRACE() would (correctly) only affect the
-// assertions in its own thread.
-#define SCOPED_TRACE(message) \
-  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
-    __FILE__, __LINE__, (message))
-
-
-// Compile-time assertion for type equality.
-// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
-// the same type.  The value it returns is not interesting.
-//
-// Instead of making StaticAssertTypeEq a class template, we make it a
-// function template that invokes a helper class template.  This
-// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
-// defining objects of that type.
-//
-// CAVEAT:
-//
-// When used inside a method of a class template,
-// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
-// instantiated.  For example, given:
-//
-//   template <typename T> class Foo {
-//    public:
-//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
-//   };
-//
-// the code:
-//
-//   void Test1() { Foo<bool> foo; }
-//
-// will NOT generate a compiler error, as Foo<bool>::Bar() is never
-// actually instantiated.  Instead, you need:
-//
-//   void Test2() { Foo<bool> foo; foo.Bar(); }
-//
-// to cause a compiler error.
-template <typename T1, typename T2>
-bool StaticAssertTypeEq() {
-  (void)internal::StaticAssertTypeEqHelper<T1, T2>();
-  return true;
-}
-
-// Defines a test.
-//
-// The first parameter is the name of the test suite, and the second
-// parameter is the name of the test within the test suite.
-//
-// The convention is to end the test suite name with "Test".  For
-// example, a test suite for the Foo class can be named FooTest.
-//
-// Test code should appear between braces after an invocation of
-// this macro.  Example:
-//
-//   TEST(FooTest, InitializesCorrectly) {
-//     Foo foo;
-//     EXPECT_TRUE(foo.StatusIsOK());
-//   }
-
-// Note that we call GetTestTypeId() instead of GetTypeId<
-// ::testing::Test>() here to get the type ID of testing::Test.  This
-// is to work around a suspected linker bug when using Google Test as
-// a framework on Mac OS X.  The bug causes GetTypeId<
-// ::testing::Test>() to return different values depending on whether
-// the call is from the Google Test framework itself or from user test
-// code.  GetTestTypeId() is guaranteed to always return the same
-// value, as it always calls GetTypeId<>() from the Google Test
-// framework.
-#define GTEST_TEST(test_suite_name, test_name)             \
-  GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \
-              ::testing::internal::GetTestTypeId())
-
-// Define this macro to 1 to omit the definition of TEST(), which
-// is a generic name and clashes with some other libraries.
-#if !GTEST_DONT_DEFINE_TEST
-#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name)
-#endif
-
-// Defines a test that uses a test fixture.
-//
-// The first parameter is the name of the test fixture class, which
-// also doubles as the test suite name.  The second parameter is the
-// name of the test within the test suite.
-//
-// A test fixture class must be declared earlier.  The user should put
-// the test code between braces after using this macro.  Example:
-//
-//   class FooTest : public testing::Test {
-//    protected:
-//     void SetUp() override { b_.AddElement(3); }
-//
-//     Foo a_;
-//     Foo b_;
-//   };
-//
-//   TEST_F(FooTest, InitializesCorrectly) {
-//     EXPECT_TRUE(a_.StatusIsOK());
-//   }
-//
-//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
-//     EXPECT_EQ(a_.size(), 0);
-//     EXPECT_EQ(b_.size(), 1);
-//   }
-//
-// GOOGLETEST_CM0011 DO NOT DELETE
-#define TEST_F(test_fixture, test_name)\
-  GTEST_TEST_(test_fixture, test_name, test_fixture, \
-              ::testing::internal::GetTypeId<test_fixture>())
-
-// Returns a path to temporary directory.
-// Tries to determine an appropriate directory for the platform.
-GTEST_API_ std::string TempDir();
-
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-
-// Dynamically registers a test with the framework.
-//
-// This is an advanced API only to be used when the `TEST` macros are
-// insufficient. The macros should be preferred when possible, as they avoid
-// most of the complexity of calling this function.
-//
-// The `factory` argument is a factory callable (move-constructible) object or
-// function pointer that creates a new instance of the Test object. It
-// handles ownership to the caller. The signature of the callable is
-// `Fixture*()`, where `Fixture` is the test fixture class for the test. All
-// tests registered with the same `test_suite_name` must return the same
-// fixture type. This is checked at runtime.
-//
-// The framework will infer the fixture class from the factory and will call
-// the `SetUpTestSuite` and `TearDownTestSuite` for it.
-//
-// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is
-// undefined.
-//
-// Use case example:
-//
-// class MyFixture : public ::testing::Test {
-//  public:
-//   // All of these optional, just like in regular macro usage.
-//   static void SetUpTestSuite() { ... }
-//   static void TearDownTestSuite() { ... }
-//   void SetUp() override { ... }
-//   void TearDown() override { ... }
-// };
-//
-// class MyTest : public MyFixture {
-//  public:
-//   explicit MyTest(int data) : data_(data) {}
-//   void TestBody() override { ... }
-//
-//  private:
-//   int data_;
-// };
-//
-// void RegisterMyTests(const std::vector<int>& values) {
-//   for (int v : values) {
-//     ::testing::RegisterTest(
-//         "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr,
-//         std::to_string(v).c_str(),
-//         __FILE__, __LINE__,
-//         // Important to use the fixture type as the return type here.
-//         [=]() -> MyFixture* { return new MyTest(v); });
-//   }
-// }
-// ...
-// int main(int argc, char** argv) {
-//   std::vector<int> values_to_test = LoadValuesFromConfig();
-//   RegisterMyTests(values_to_test);
-//   ...
-//   return RUN_ALL_TESTS();
-// }
-//
-template <int&... ExplicitParameterBarrier, typename Factory>
-TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
-                       const char* type_param, const char* value_param,
-                       const char* file, int line, Factory factory) {
-  using TestT = typename std::remove_pointer<decltype(factory())>::type;
-
-  class FactoryImpl : public internal::TestFactoryBase {
-   public:
-    explicit FactoryImpl(Factory f) : factory_(std::move(f)) {}
-    Test* CreateTest() override { return factory_(); }
-
-   private:
-    Factory factory_;
-  };
-
-  return internal::MakeAndRegisterTestInfo(
-      test_suite_name, test_name, type_param, value_param,
-      internal::CodeLocation(file, line), internal::GetTypeId<TestT>(),
-      internal::SuiteApiResolver<TestT>::GetSetUpCaseOrSuite(file, line),
-      internal::SuiteApiResolver<TestT>::GetTearDownCaseOrSuite(file, line),
-      new FactoryImpl{std::move(factory)});
-}
-
-}  // namespace testing
-
-// Use this function in main() to run all tests.  It returns 0 if all
-// tests are successful, or 1 otherwise.
-//
-// RUN_ALL_TESTS() should be invoked after the command line has been
-// parsed by InitGoogleTest().
-//
-// This function was formerly a macro; thus, it is in the global
-// namespace and has an all-caps name.
-int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
-
-inline int RUN_ALL_TESTS() {
-  return ::testing::UnitTest::GetInstance()->Run();
-}
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
diff --git a/deps/googletest/include/gtest/gtest_pred_impl.h b/deps/googletest/include/gtest/gtest_pred_impl.h
deleted file mode 100644
index d514255c7..000000000
--- a/deps/googletest/include/gtest/gtest_pred_impl.h
+++ /dev/null
@@ -1,359 +0,0 @@
-// Copyright 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command
-// 'gen_gtest_pred_impl.py 5'.  DO NOT EDIT BY HAND!
-//
-// Implements a family of generic predicate assertion macros.
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-
-#include "gtest/gtest.h"
-
-namespace testing {
-
-// This header implements a family of generic predicate assertion
-// macros:
-//
-//   ASSERT_PRED_FORMAT1(pred_format, v1)
-//   ASSERT_PRED_FORMAT2(pred_format, v1, v2)
-//   ...
-//
-// where pred_format is a function or functor that takes n (in the
-// case of ASSERT_PRED_FORMATn) values and their source expression
-// text, and returns a testing::AssertionResult.  See the definition
-// of ASSERT_EQ in gtest.h for an example.
-//
-// If you don't care about formatting, you can use the more
-// restrictive version:
-//
-//   ASSERT_PRED1(pred, v1)
-//   ASSERT_PRED2(pred, v1, v2)
-//   ...
-//
-// where pred is an n-ary function or functor that returns bool,
-// and the values v1, v2, ..., must support the << operator for
-// streaming to std::ostream.
-//
-// We also define the EXPECT_* variations.
-//
-// For now we only support predicates whose arity is at most 5.
-// Please email googletestframework@googlegroups.com if you need
-// support for higher arities.
-
-// GTEST_ASSERT_ is the basic statement to which all of the assertions
-// in this file reduce.  Don't use this in your code.
-
-#define GTEST_ASSERT_(expression, on_failure) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (const ::testing::AssertionResult gtest_ar = (expression)) \
-    ; \
-  else \
-    on_failure(gtest_ar.failure_message())
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED1.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
-                                  const char* e1,
-                                  Pred pred,
-                                  const T1& v1) {
-  if (pred(v1)) return AssertionSuccess();
-
-  return AssertionFailure()
-         << pred_text << "(" << e1 << ") evaluates to false, where"
-         << "\n"
-         << e1 << " evaluates to " << ::testing::PrintToString(v1);
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, v1), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED1.  Don't use
-// this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
-                                             #v1, \
-                                             pred, \
-                                             v1), on_failure)
-
-// Unary predicate assertion macros.
-#define EXPECT_PRED_FORMAT1(pred_format, v1) \
-  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT1(pred_format, v1) \
-  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
-  GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED2.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2) {
-  if (pred(v1, v2)) return AssertionSuccess();
-
-  return AssertionFailure()
-         << pred_text << "(" << e1 << ", " << e2
-         << ") evaluates to false, where"
-         << "\n"
-         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
-         << e2 << " evaluates to " << ::testing::PrintToString(v2);
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED2.  Don't use
-// this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             pred, \
-                                             v1, \
-                                             v2), on_failure)
-
-// Binary predicate assertion macros.
-#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
-  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED2(pred, v1, v2) \
-  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
-  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED2(pred, v1, v2) \
-  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED3.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3) {
-  if (pred(v1, v2, v3)) return AssertionSuccess();
-
-  return AssertionFailure()
-         << pred_text << "(" << e1 << ", " << e2 << ", " << e3
-         << ") evaluates to false, where"
-         << "\n"
-         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
-         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
-         << e3 << " evaluates to " << ::testing::PrintToString(v3);
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED3.  Don't use
-// this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3), on_failure)
-
-// Ternary predicate assertion macros.
-#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
-  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED3(pred, v1, v2, v3) \
-  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
-  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED3(pred, v1, v2, v3) \
-  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED4.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4) {
-  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
-
-  return AssertionFailure()
-         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
-         << ") evaluates to false, where"
-         << "\n"
-         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
-         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
-         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
-         << e4 << " evaluates to " << ::testing::PrintToString(v4);
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED4.  Don't use
-// this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4), on_failure)
-
-// 4-ary predicate assertion macros.
-#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
-  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
-  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
-  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
-  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
-
-
-
-// Helper function for implementing {EXPECT|ASSERT}_PRED5.  Don't use
-// this in your code.
-template <typename Pred,
-          typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
-                                  const char* e1,
-                                  const char* e2,
-                                  const char* e3,
-                                  const char* e4,
-                                  const char* e5,
-                                  Pred pred,
-                                  const T1& v1,
-                                  const T2& v2,
-                                  const T3& v3,
-                                  const T4& v4,
-                                  const T5& v5) {
-  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
-
-  return AssertionFailure()
-         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
-         << ", " << e5 << ") evaluates to false, where"
-         << "\n"
-         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
-         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
-         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
-         << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n"
-         << e5 << " evaluates to " << ::testing::PrintToString(v5);
-}
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
-// Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
-  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
-                on_failure)
-
-// Internal macro for implementing {EXPECT|ASSERT}_PRED5.  Don't use
-// this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
-  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
-                                             #v1, \
-                                             #v2, \
-                                             #v3, \
-                                             #v4, \
-                                             #v5, \
-                                             pred, \
-                                             v1, \
-                                             v2, \
-                                             v3, \
-                                             v4, \
-                                             v5), on_failure)
-
-// 5-ary predicate assertion macros.
-#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
-  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
-  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
-#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
-  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
-  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
-
-
-
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/deps/googletest/include/gtest/gtest_prod.h b/deps/googletest/include/gtest/gtest_prod.h
deleted file mode 100644
index e651671eb..000000000
--- a/deps/googletest/include/gtest/gtest_prod.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// Google C++ Testing and Mocking Framework definitions useful in production code.
-// GOOGLETEST_CM0003 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-
-// When you need to test the private or protected members of a class,
-// use the FRIEND_TEST macro to declare your tests as friends of the
-// class.  For example:
-//
-// class MyClass {
-//  private:
-//   void PrivateMethod();
-//   FRIEND_TEST(MyClassTest, PrivateMethodWorks);
-// };
-//
-// class MyClassTest : public testing::Test {
-//   // ...
-// };
-//
-// TEST_F(MyClassTest, PrivateMethodWorks) {
-//   // Can call MyClass::PrivateMethod() here.
-// }
-//
-// Note: The test class must be in the same namespace as the class being tested.
-// For example, putting MyClassTest in an anonymous namespace will not work.
-
-#define FRIEND_TEST(test_case_name, test_name)\
-friend class test_case_name##_##test_name##_Test
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/deps/googletest/include/gtest/internal/custom/README.md b/deps/googletest/include/gtest/internal/custom/README.md
deleted file mode 100644
index ff391fb4e..000000000
--- a/deps/googletest/include/gtest/internal/custom/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Customization Points
-
-The custom directory is an injection point for custom user configurations.
-
-## Header `gtest.h`
-
-### The following macros can be defined:
-
-*   `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of
-    `OsStackTraceGetterInterface`.
-*   `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See
-    `testing::TempDir` for semantics and signature.
-
-## Header `gtest-port.h`
-
-The following macros can be defined:
-
-### Flag related macros:
-
-*   `GTEST_FLAG(flag_name)`
-*   `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its
-    own flagfile flag parsing.
-*   `GTEST_DECLARE_bool_(name)`
-*   `GTEST_DECLARE_int32_(name)`
-*   `GTEST_DECLARE_string_(name)`
-*   `GTEST_DEFINE_bool_(name, default_val, doc)`
-*   `GTEST_DEFINE_int32_(name, default_val, doc)`
-*   `GTEST_DEFINE_string_(name, default_val, doc)`
-
-### Logging:
-
-*   `GTEST_LOG_(severity)`
-*   `GTEST_CHECK_(condition)`
-*   Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too.
-
-### Threading:
-
-*   `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided.
-*   `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal`
-    are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)`
-    and `GTEST_DEFINE_STATIC_MUTEX_(mutex)`
-*   `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)`
-*   `GTEST_LOCK_EXCLUDED_(locks)`
-
-### Underlying library support features
-
-*   `GTEST_HAS_CXXABI_H_`
-
-### Exporting API symbols:
-
-*   `GTEST_API_` - Specifier for exported symbols.
-
-## Header `gtest-printers.h`
-
-*   See documentation at `gtest/gtest-printers.h` for details on how to define a
-    custom printer.
diff --git a/deps/googletest/include/gtest/internal/custom/gtest-port.h b/deps/googletest/include/gtest/internal/custom/gtest-port.h
deleted file mode 100644
index cd85d956d..000000000
--- a/deps/googletest/include/gtest/internal/custom/gtest-port.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2015, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Injection point for custom user configurations. See README for details
-//
-// ** Custom implementation starts here **
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/deps/googletest/include/gtest/internal/custom/gtest-printers.h b/deps/googletest/include/gtest/internal/custom/gtest-printers.h
deleted file mode 100644
index eb4467abc..000000000
--- a/deps/googletest/include/gtest/internal/custom/gtest-printers.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2015, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// This file provides an injection point for custom printers in a local
-// installation of gTest.
-// It will be included from gtest-printers.h and the overrides in this file
-// will be visible to everyone.
-//
-// Injection point for custom user configurations. See README for details
-//
-// ** Custom implementation starts here **
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
diff --git a/deps/googletest/include/gtest/internal/custom/gtest.h b/deps/googletest/include/gtest/internal/custom/gtest.h
deleted file mode 100644
index 4c8e07be2..000000000
--- a/deps/googletest/include/gtest/internal/custom/gtest.h
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2015, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Injection point for custom user configurations. See README for details
-//
-// ** Custom implementation starts here **
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-death-test-internal.h b/deps/googletest/include/gtest/internal/gtest-death-test-internal.h
deleted file mode 100644
index 68bd35306..000000000
--- a/deps/googletest/include/gtest/internal/gtest-death-test-internal.h
+++ /dev/null
@@ -1,304 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This header file defines internal utilities needed for implementing
-// death tests.  They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-
-#include "gtest/gtest-matchers.h"
-#include "gtest/internal/gtest-internal.h"
-
-#include <stdio.h>
-#include <memory>
-
-namespace testing {
-namespace internal {
-
-GTEST_DECLARE_string_(internal_run_death_test);
-
-// Names of the flags (needed for parsing Google Test flags).
-const char kDeathTestStyleFlag[] = "death_test_style";
-const char kDeathTestUseFork[] = "death_test_use_fork";
-const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
-
-#if GTEST_HAS_DEATH_TEST
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-// DeathTest is a class that hides much of the complexity of the
-// GTEST_DEATH_TEST_ macro.  It is abstract; its static Create method
-// returns a concrete class that depends on the prevailing death test
-// style, as defined by the --gtest_death_test_style and/or
-// --gtest_internal_run_death_test flags.
-
-// In describing the results of death tests, these terms are used with
-// the corresponding definitions:
-//
-// exit status:  The integer exit information in the format specified
-//               by wait(2)
-// exit code:    The integer code passed to exit(3), _exit(2), or
-//               returned from main()
-class GTEST_API_ DeathTest {
- public:
-  // Create returns false if there was an error determining the
-  // appropriate action to take for the current death test; for example,
-  // if the gtest_death_test_style flag is set to an invalid value.
-  // The LastMessage method will return a more detailed message in that
-  // case.  Otherwise, the DeathTest pointer pointed to by the "test"
-  // argument is set.  If the death test should be skipped, the pointer
-  // is set to NULL; otherwise, it is set to the address of a new concrete
-  // DeathTest object that controls the execution of the current test.
-  static bool Create(const char* statement, Matcher<const std::string&> matcher,
-                     const char* file, int line, DeathTest** test);
-  DeathTest();
-  virtual ~DeathTest() { }
-
-  // A helper class that aborts a death test when it's deleted.
-  class ReturnSentinel {
-   public:
-    explicit ReturnSentinel(DeathTest* test) : test_(test) { }
-    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
-   private:
-    DeathTest* const test_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
-  } GTEST_ATTRIBUTE_UNUSED_;
-
-  // An enumeration of possible roles that may be taken when a death
-  // test is encountered.  EXECUTE means that the death test logic should
-  // be executed immediately.  OVERSEE means that the program should prepare
-  // the appropriate environment for a child process to execute the death
-  // test, then wait for it to complete.
-  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
-
-  // An enumeration of the three reasons that a test might be aborted.
-  enum AbortReason {
-    TEST_ENCOUNTERED_RETURN_STATEMENT,
-    TEST_THREW_EXCEPTION,
-    TEST_DID_NOT_DIE
-  };
-
-  // Assumes one of the above roles.
-  virtual TestRole AssumeRole() = 0;
-
-  // Waits for the death test to finish and returns its status.
-  virtual int Wait() = 0;
-
-  // Returns true if the death test passed; that is, the test process
-  // exited during the test, its exit status matches a user-supplied
-  // predicate, and its stderr output matches a user-supplied regular
-  // expression.
-  // The user-supplied predicate may be a macro expression rather
-  // than a function pointer or functor, or else Wait and Passed could
-  // be combined.
-  virtual bool Passed(bool exit_status_ok) = 0;
-
-  // Signals that the death test did not die as expected.
-  virtual void Abort(AbortReason reason) = 0;
-
-  // Returns a human-readable outcome message regarding the outcome of
-  // the last death test.
-  static const char* LastMessage();
-
-  static void set_last_death_test_message(const std::string& message);
-
- private:
-  // A string containing a description of the outcome of the last death test.
-  static std::string last_death_test_message_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
-};
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-// Factory interface for death tests.  May be mocked out for testing.
-class DeathTestFactory {
- public:
-  virtual ~DeathTestFactory() { }
-  virtual bool Create(const char* statement,
-                      Matcher<const std::string&> matcher, const char* file,
-                      int line, DeathTest** test) = 0;
-};
-
-// A concrete DeathTestFactory implementation for normal use.
-class DefaultDeathTestFactory : public DeathTestFactory {
- public:
-  bool Create(const char* statement, Matcher<const std::string&> matcher,
-              const char* file, int line, DeathTest** test) override;
-};
-
-// Returns true if exit_status describes a process that was terminated
-// by a signal, or exited normally with a nonzero exit code.
-GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
-
-// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads
-// and interpreted as a regex (rather than an Eq matcher) for legacy
-// compatibility.
-inline Matcher<const ::std::string&> MakeDeathTestMatcher(
-    ::testing::internal::RE regex) {
-  return ContainsRegex(regex.pattern());
-}
-inline Matcher<const ::std::string&> MakeDeathTestMatcher(const char* regex) {
-  return ContainsRegex(regex);
-}
-inline Matcher<const ::std::string&> MakeDeathTestMatcher(
-    const ::std::string& regex) {
-  return ContainsRegex(regex);
-}
-
-// If a Matcher<const ::std::string&> is passed to EXPECT_DEATH (etc.), it's
-// used directly.
-inline Matcher<const ::std::string&> MakeDeathTestMatcher(
-    Matcher<const ::std::string&> matcher) {
-  return matcher;
-}
-
-// Traps C++ exceptions escaping statement and reports them as test
-// failures. Note that trapping SEH exceptions is not implemented here.
-# if GTEST_HAS_EXCEPTIONS
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
-  try { \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-  } catch (const ::std::exception& gtest_exception) { \
-    fprintf(\
-        stderr, \
-        "\n%s: Caught std::exception-derived exception escaping the " \
-        "death test statement. Exception message: %s\n", \
-        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
-        gtest_exception.what()); \
-    fflush(stderr); \
-    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
-  } catch (...) { \
-    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
-  }
-
-# else
-#  define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
-  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
-
-# endif
-
-// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
-// ASSERT_EXIT*, and EXPECT_EXIT*.
-#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail)        \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
-  if (::testing::internal::AlwaysTrue()) {                                     \
-    ::testing::internal::DeathTest* gtest_dt;                                  \
-    if (!::testing::internal::DeathTest::Create(                               \
-            #statement,                                                        \
-            ::testing::internal::MakeDeathTestMatcher(regex_or_matcher),       \
-            __FILE__, __LINE__, &gtest_dt)) {                                  \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                        \
-    }                                                                          \
-    if (gtest_dt != nullptr) {                                                 \
-      std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \
-      switch (gtest_dt->AssumeRole()) {                                        \
-        case ::testing::internal::DeathTest::OVERSEE_TEST:                     \
-          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) {                \
-            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                  \
-          }                                                                    \
-          break;                                                               \
-        case ::testing::internal::DeathTest::EXECUTE_TEST: {                   \
-          ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel(       \
-              gtest_dt);                                                       \
-          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt);            \
-          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE);   \
-          break;                                                               \
-        }                                                                      \
-        default:                                                               \
-          break;                                                               \
-      }                                                                        \
-    }                                                                          \
-  } else                                                                       \
-    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__)                                \
-        : fail(::testing::internal::DeathTest::LastMessage())
-// The symbol "fail" here expands to something into which a message
-// can be streamed.
-
-// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
-// NDEBUG mode. In this case we need the statements to be executed and the macro
-// must accept a streamed message even though the message is never printed.
-// The regex object is not evaluated, but it is used to prevent "unused"
-// warnings and to avoid an expression that doesn't compile in debug mode.
-#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher)    \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                  \
-  if (::testing::internal::AlwaysTrue()) {                       \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);   \
-  } else if (!::testing::internal::AlwaysTrue()) {               \
-    ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \
-  } else                                                         \
-    ::testing::Message()
-
-// A class representing the parsed contents of the
-// --gtest_internal_run_death_test flag, as it existed when
-// RUN_ALL_TESTS was called.
-class InternalRunDeathTestFlag {
- public:
-  InternalRunDeathTestFlag(const std::string& a_file,
-                           int a_line,
-                           int an_index,
-                           int a_write_fd)
-      : file_(a_file), line_(a_line), index_(an_index),
-        write_fd_(a_write_fd) {}
-
-  ~InternalRunDeathTestFlag() {
-    if (write_fd_ >= 0)
-      posix::Close(write_fd_);
-  }
-
-  const std::string& file() const { return file_; }
-  int line() const { return line_; }
-  int index() const { return index_; }
-  int write_fd() const { return write_fd_; }
-
- private:
-  std::string file_;
-  int line_;
-  int index_;
-  int write_fd_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
-};
-
-// Returns a newly created InternalRunDeathTestFlag object with fields
-// initialized from the GTEST_FLAG(internal_run_death_test) flag if
-// the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-filepath.h b/deps/googletest/include/gtest/internal/gtest-filepath.h
deleted file mode 100644
index 3adc2a53f..000000000
--- a/deps/googletest/include/gtest/internal/gtest-filepath.h
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Google Test filepath utilities
-//
-// This header file declares classes and functions used internally by
-// Google Test.  They are subject to change without notice.
-//
-// This file is #included in gtest/internal/gtest-internal.h.
-// Do not include this header file separately!
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-
-#include "gtest/internal/gtest-string.h"
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-namespace testing {
-namespace internal {
-
-// FilePath - a class for file and directory pathname manipulation which
-// handles platform-specific conventions (like the pathname separator).
-// Used for helper functions for naming files in a directory for xml output.
-// Except for Set methods, all methods are const or static, which provides an
-// "immutable value object" -- useful for peace of mind.
-// A FilePath with a value ending in a path separator ("like/this/") represents
-// a directory, otherwise it is assumed to represent a file. In either case,
-// it may or may not represent an actual file or directory in the file system.
-// Names are NOT checked for syntax correctness -- no checking for illegal
-// characters, malformed paths, etc.
-
-class GTEST_API_ FilePath {
- public:
-  FilePath() : pathname_("") { }
-  FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
-
-  explicit FilePath(const std::string& pathname) : pathname_(pathname) {
-    Normalize();
-  }
-
-  FilePath& operator=(const FilePath& rhs) {
-    Set(rhs);
-    return *this;
-  }
-
-  void Set(const FilePath& rhs) {
-    pathname_ = rhs.pathname_;
-  }
-
-  const std::string& string() const { return pathname_; }
-  const char* c_str() const { return pathname_.c_str(); }
-
-  // Returns the current working directory, or "" if unsuccessful.
-  static FilePath GetCurrentDir();
-
-  // Given directory = "dir", base_name = "test", number = 0,
-  // extension = "xml", returns "dir/test.xml". If number is greater
-  // than zero (e.g., 12), returns "dir/test_12.xml".
-  // On Windows platform, uses \ as the separator rather than /.
-  static FilePath MakeFileName(const FilePath& directory,
-                               const FilePath& base_name,
-                               int number,
-                               const char* extension);
-
-  // Given directory = "dir", relative_path = "test.xml",
-  // returns "dir/test.xml".
-  // On Windows, uses \ as the separator rather than /.
-  static FilePath ConcatPaths(const FilePath& directory,
-                              const FilePath& relative_path);
-
-  // Returns a pathname for a file that does not currently exist. The pathname
-  // will be directory/base_name.extension or
-  // directory/base_name_<number>.extension if directory/base_name.extension
-  // already exists. The number will be incremented until a pathname is found
-  // that does not already exist.
-  // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
-  // There could be a race condition if two or more processes are calling this
-  // function at the same time -- they could both pick the same filename.
-  static FilePath GenerateUniqueFileName(const FilePath& directory,
-                                         const FilePath& base_name,
-                                         const char* extension);
-
-  // Returns true iff the path is "".
-  bool IsEmpty() const { return pathname_.empty(); }
-
-  // If input name has a trailing separator character, removes it and returns
-  // the name, otherwise return the name string unmodified.
-  // On Windows platform, uses \ as the separator, other platforms use /.
-  FilePath RemoveTrailingPathSeparator() const;
-
-  // Returns a copy of the FilePath with the directory part removed.
-  // Example: FilePath("path/to/file").RemoveDirectoryName() returns
-  // FilePath("file"). If there is no directory part ("just_a_file"), it returns
-  // the FilePath unmodified. If there is no file part ("just_a_dir/") it
-  // returns an empty FilePath ("").
-  // On Windows platform, '\' is the path separator, otherwise it is '/'.
-  FilePath RemoveDirectoryName() const;
-
-  // RemoveFileName returns the directory path with the filename removed.
-  // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
-  // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
-  // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
-  // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
-  // On Windows platform, '\' is the path separator, otherwise it is '/'.
-  FilePath RemoveFileName() const;
-
-  // Returns a copy of the FilePath with the case-insensitive extension removed.
-  // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
-  // FilePath("dir/file"). If a case-insensitive extension is not
-  // found, returns a copy of the original FilePath.
-  FilePath RemoveExtension(const char* extension) const;
-
-  // Creates directories so that path exists. Returns true if successful or if
-  // the directories already exist; returns false if unable to create
-  // directories for any reason. Will also return false if the FilePath does
-  // not represent a directory (that is, it doesn't end with a path separator).
-  bool CreateDirectoriesRecursively() const;
-
-  // Create the directory so that path exists. Returns true if successful or
-  // if the directory already exists; returns false if unable to create the
-  // directory for any reason, including if the parent directory does not
-  // exist. Not named "CreateDirectory" because that's a macro on Windows.
-  bool CreateFolder() const;
-
-  // Returns true if FilePath describes something in the file-system,
-  // either a file, directory, or whatever, and that something exists.
-  bool FileOrDirectoryExists() const;
-
-  // Returns true if pathname describes a directory in the file-system
-  // that exists.
-  bool DirectoryExists() const;
-
-  // Returns true if FilePath ends with a path separator, which indicates that
-  // it is intended to represent a directory. Returns false otherwise.
-  // This does NOT check that a directory (or file) actually exists.
-  bool IsDirectory() const;
-
-  // Returns true if pathname describes a root directory. (Windows has one
-  // root directory per disk drive.)
-  bool IsRootDirectory() const;
-
-  // Returns true if pathname describes an absolute path.
-  bool IsAbsolutePath() const;
-
- private:
-  // Replaces multiple consecutive separators with a single separator.
-  // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
-  // redundancies that might be in a pathname involving "." or "..".
-  //
-  // A pathname with multiple consecutive separators may occur either through
-  // user error or as a result of some scripts or APIs that generate a pathname
-  // with a trailing separator. On other platforms the same API or script
-  // may NOT generate a pathname with a trailing "/". Then elsewhere that
-  // pathname may have another "/" and pathname components added to it,
-  // without checking for the separator already being there.
-  // The script language and operating system may allow paths like "foo//bar"
-  // but some of the functions in FilePath will not handle that correctly. In
-  // particular, RemoveTrailingPathSeparator() only removes one separator, and
-  // it is called in CreateDirectoriesRecursively() assuming that it will change
-  // a pathname from directory syntax (trailing separator) to filename syntax.
-  //
-  // On Windows this method also replaces the alternate path separator '/' with
-  // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
-  // "bar\\foo".
-
-  void Normalize();
-
-  // Returns a pointer to the last occurrence of a valid path separator in
-  // the FilePath. On Windows, for example, both '/' and '\' are valid path
-  // separators. Returns NULL if no path separator was found.
-  const char* FindLastPathSeparator() const;
-
-  std::string pathname_;
-};  // class FilePath
-
-}  // namespace internal
-}  // namespace testing
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-internal.h b/deps/googletest/include/gtest/internal/gtest-internal.h
deleted file mode 100644
index 273266081..000000000
--- a/deps/googletest/include/gtest/internal/gtest-internal.h
+++ /dev/null
@@ -1,1428 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This header file declares functions and macros used internally by
-// Google Test.  They are subject to change without notice.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-
-#include "gtest/internal/gtest-port.h"
-
-#if GTEST_OS_LINUX
-# include <stdlib.h>
-# include <sys/types.h>
-# include <sys/wait.h>
-# include <unistd.h>
-#endif  // GTEST_OS_LINUX
-
-#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
-#endif
-
-#include <ctype.h>
-#include <float.h>
-#include <string.h>
-#include <iomanip>
-#include <limits>
-#include <map>
-#include <set>
-#include <string>
-#include <type_traits>
-#include <vector>
-
-#include "gtest/gtest-message.h"
-#include "gtest/internal/gtest-filepath.h"
-#include "gtest/internal/gtest-string.h"
-#include "gtest/internal/gtest-type-util.h"
-
-// Due to C++ preprocessor weirdness, we need double indirection to
-// concatenate two tokens when one of them is __LINE__.  Writing
-//
-//   foo ## __LINE__
-//
-// will result in the token foo__LINE__, instead of foo followed by
-// the current line number.  For more details, see
-// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
-#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
-
-// Stringifies its argument.
-#define GTEST_STRINGIFY_(name) #name
-
-namespace proto2 { class Message; }
-
-namespace testing {
-
-// Forward declarations.
-
-class AssertionResult;                 // Result of an assertion.
-class Message;                         // Represents a failure message.
-class Test;                            // Represents a test.
-class TestInfo;                        // Information about a test.
-class TestPartResult;                  // Result of a test part.
-class UnitTest;                        // A collection of test suites.
-
-template <typename T>
-::std::string PrintToString(const T& value);
-
-namespace internal {
-
-struct TraceInfo;                      // Information about a trace point.
-class TestInfoImpl;                    // Opaque implementation of TestInfo
-class UnitTestImpl;                    // Opaque implementation of UnitTest
-
-// The text used in failure messages to indicate the start of the
-// stack trace.
-GTEST_API_ extern const char kStackTraceMarker[];
-
-// An IgnoredValue object can be implicitly constructed from ANY value.
-class IgnoredValue {
-  struct Sink {};
- public:
-  // This constructor template allows any value to be implicitly
-  // converted to IgnoredValue.  The object has no data member and
-  // doesn't try to remember anything about the argument.  We
-  // deliberately omit the 'explicit' keyword in order to allow the
-  // conversion to be implicit.
-  // Disable the conversion if T already has a magical conversion operator.
-  // Otherwise we get ambiguity.
-  template <typename T,
-            typename std::enable_if<!std::is_convertible<T, Sink>::value,
-                                    int>::type = 0>
-  IgnoredValue(const T& /* ignored */) {}  // NOLINT(runtime/explicit)
-};
-
-// Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(
-    const std::string& gtest_msg, const Message& user_msg);
-
-#if GTEST_HAS_EXCEPTIONS
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
-/* an exported class was derived from a class that was not exported */)
-
-// This exception is thrown by (and only by) a failed Google Test
-// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
-// are enabled).  We derive it from std::runtime_error, which is for
-// errors presumably detectable only at run time.  Since
-// std::runtime_error inherits from std::exception, many testing
-// frameworks know how to extract and print the message inside it.
-class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
- public:
-  explicit GoogleTestFailureException(const TestPartResult& failure);
-};
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4275
-
-#endif  // GTEST_HAS_EXCEPTIONS
-
-namespace edit_distance {
-// Returns the optimal edits to go from 'left' to 'right'.
-// All edits cost the same, with replace having lower priority than
-// add/remove.
-// Simple implementation of the Wagner-Fischer algorithm.
-// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
-enum EditType { kMatch, kAdd, kRemove, kReplace };
-GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<size_t>& left, const std::vector<size_t>& right);
-
-// Same as above, but the input is represented as strings.
-GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<std::string>& left,
-    const std::vector<std::string>& right);
-
-// Create a diff of the input strings in Unified diff format.
-GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
-                                         const std::vector<std::string>& right,
-                                         size_t context = 2);
-
-}  // namespace edit_distance
-
-// Calculate the diff between 'left' and 'right' and return it in unified diff
-// format.
-// If not null, stores in 'total_line_count' the total number of lines found
-// in left + right.
-GTEST_API_ std::string DiffStrings(const std::string& left,
-                                   const std::string& right,
-                                   size_t* total_line_count);
-
-// Constructs and returns the message for an equality assertion
-// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
-//
-// The first four parameters are the expressions used in the assertion
-// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
-// where foo is 5 and bar is 6, we have:
-//
-//   expected_expression: "foo"
-//   actual_expression:   "bar"
-//   expected_value:      "5"
-//   actual_value:        "6"
-//
-// The ignoring_case parameter is true iff the assertion is a
-// *_STRCASEEQ*.  When it's true, the string " (ignoring case)" will
-// be inserted into the message.
-GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
-                                     const char* actual_expression,
-                                     const std::string& expected_value,
-                                     const std::string& actual_value,
-                                     bool ignoring_case);
-
-// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
-GTEST_API_ std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value);
-
-// This template class represents an IEEE floating-point number
-// (either single-precision or double-precision, depending on the
-// template parameters).
-//
-// The purpose of this class is to do more sophisticated number
-// comparison.  (Due to round-off error, etc, it's very unlikely that
-// two floating-points will be equal exactly.  Hence a naive
-// comparison by the == operation often doesn't work.)
-//
-// Format of IEEE floating-point:
-//
-//   The most-significant bit being the leftmost, an IEEE
-//   floating-point looks like
-//
-//     sign_bit exponent_bits fraction_bits
-//
-//   Here, sign_bit is a single bit that designates the sign of the
-//   number.
-//
-//   For float, there are 8 exponent bits and 23 fraction bits.
-//
-//   For double, there are 11 exponent bits and 52 fraction bits.
-//
-//   More details can be found at
-//   http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
-//
-// Template parameter:
-//
-//   RawType: the raw floating-point type (either float or double)
-template <typename RawType>
-class FloatingPoint {
- public:
-  // Defines the unsigned integer type that has the same size as the
-  // floating point number.
-  typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
-
-  // Constants.
-
-  // # of bits in a number.
-  static const size_t kBitCount = 8*sizeof(RawType);
-
-  // # of fraction bits in a number.
-  static const size_t kFractionBitCount =
-    std::numeric_limits<RawType>::digits - 1;
-
-  // # of exponent bits in a number.
-  static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
-
-  // The mask for the sign bit.
-  static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
-
-  // The mask for the fraction bits.
-  static const Bits kFractionBitMask =
-    ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
-
-  // The mask for the exponent bits.
-  static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
-
-  // How many ULP's (Units in the Last Place) we want to tolerate when
-  // comparing two numbers.  The larger the value, the more error we
-  // allow.  A 0 value means that two numbers must be exactly the same
-  // to be considered equal.
-  //
-  // The maximum error of a single floating-point operation is 0.5
-  // units in the last place.  On Intel CPU's, all floating-point
-  // calculations are done with 80-bit precision, while double has 64
-  // bits.  Therefore, 4 should be enough for ordinary use.
-  //
-  // See the following article for more details on ULP:
-  // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
-  static const size_t kMaxUlps = 4;
-
-  // Constructs a FloatingPoint from a raw floating-point number.
-  //
-  // On an Intel CPU, passing a non-normalized NAN (Not a Number)
-  // around may change its bits, although the new value is guaranteed
-  // to be also a NAN.  Therefore, don't expect this constructor to
-  // preserve the bits in x when x is a NAN.
-  explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
-
-  // Static methods
-
-  // Reinterprets a bit pattern as a floating-point number.
-  //
-  // This function is needed to test the AlmostEquals() method.
-  static RawType ReinterpretBits(const Bits bits) {
-    FloatingPoint fp(0);
-    fp.u_.bits_ = bits;
-    return fp.u_.value_;
-  }
-
-  // Returns the floating-point number that represent positive infinity.
-  static RawType Infinity() {
-    return ReinterpretBits(kExponentBitMask);
-  }
-
-  // Returns the maximum representable finite floating-point number.
-  static RawType Max();
-
-  // Non-static methods
-
-  // Returns the bits that represents this number.
-  const Bits &bits() const { return u_.bits_; }
-
-  // Returns the exponent bits of this number.
-  Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
-
-  // Returns the fraction bits of this number.
-  Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
-
-  // Returns the sign bit of this number.
-  Bits sign_bit() const { return kSignBitMask & u_.bits_; }
-
-  // Returns true iff this is NAN (not a number).
-  bool is_nan() const {
-    // It's a NAN if the exponent bits are all ones and the fraction
-    // bits are not entirely zeros.
-    return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
-  }
-
-  // Returns true iff this number is at most kMaxUlps ULP's away from
-  // rhs.  In particular, this function:
-  //
-  //   - returns false if either number is (or both are) NAN.
-  //   - treats really large numbers as almost equal to infinity.
-  //   - thinks +0.0 and -0.0 are 0 DLP's apart.
-  bool AlmostEquals(const FloatingPoint& rhs) const {
-    // The IEEE standard says that any comparison operation involving
-    // a NAN must return false.
-    if (is_nan() || rhs.is_nan()) return false;
-
-    return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
-        <= kMaxUlps;
-  }
-
- private:
-  // The data type used to store the actual floating-point number.
-  union FloatingPointUnion {
-    RawType value_;  // The raw floating-point number.
-    Bits bits_;      // The bits that represent the number.
-  };
-
-  // Converts an integer from the sign-and-magnitude representation to
-  // the biased representation.  More precisely, let N be 2 to the
-  // power of (kBitCount - 1), an integer x is represented by the
-  // unsigned number x + N.
-  //
-  // For instance,
-  //
-  //   -N + 1 (the most negative number representable using
-  //          sign-and-magnitude) is represented by 1;
-  //   0      is represented by N; and
-  //   N - 1  (the biggest number representable using
-  //          sign-and-magnitude) is represented by 2N - 1.
-  //
-  // Read http://en.wikipedia.org/wiki/Signed_number_representations
-  // for more details on signed number representations.
-  static Bits SignAndMagnitudeToBiased(const Bits &sam) {
-    if (kSignBitMask & sam) {
-      // sam represents a negative number.
-      return ~sam + 1;
-    } else {
-      // sam represents a positive number.
-      return kSignBitMask | sam;
-    }
-  }
-
-  // Given two numbers in the sign-and-magnitude representation,
-  // returns the distance between them as an unsigned number.
-  static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
-                                                     const Bits &sam2) {
-    const Bits biased1 = SignAndMagnitudeToBiased(sam1);
-    const Bits biased2 = SignAndMagnitudeToBiased(sam2);
-    return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
-  }
-
-  FloatingPointUnion u_;
-};
-
-// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
-// macro defined by <windows.h>.
-template <>
-inline float FloatingPoint<float>::Max() { return FLT_MAX; }
-template <>
-inline double FloatingPoint<double>::Max() { return DBL_MAX; }
-
-// Typedefs the instances of the FloatingPoint template class that we
-// care to use.
-typedef FloatingPoint<float> Float;
-typedef FloatingPoint<double> Double;
-
-// In order to catch the mistake of putting tests that use different
-// test fixture classes in the same test suite, we need to assign
-// unique IDs to fixture classes and compare them.  The TypeId type is
-// used to hold such IDs.  The user should treat TypeId as an opaque
-// type: the only operation allowed on TypeId values is to compare
-// them for equality using the == operator.
-typedef const void* TypeId;
-
-template <typename T>
-class TypeIdHelper {
- public:
-  // dummy_ must not have a const type.  Otherwise an overly eager
-  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
-  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
-  static bool dummy_;
-};
-
-template <typename T>
-bool TypeIdHelper<T>::dummy_ = false;
-
-// GetTypeId<T>() returns the ID of type T.  Different values will be
-// returned for different types.  Calling the function twice with the
-// same type argument is guaranteed to return the same ID.
-template <typename T>
-TypeId GetTypeId() {
-  // The compiler is required to allocate a different
-  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
-  // the template.  Therefore, the address of dummy_ is guaranteed to
-  // be unique.
-  return &(TypeIdHelper<T>::dummy_);
-}
-
-// Returns the type ID of ::testing::Test.  Always call this instead
-// of GetTypeId< ::testing::Test>() to get the type ID of
-// ::testing::Test, as the latter may give the wrong result due to a
-// suspected linker bug when compiling Google Test as a Mac OS X
-// framework.
-GTEST_API_ TypeId GetTestTypeId();
-
-// Defines the abstract factory interface that creates instances
-// of a Test object.
-class TestFactoryBase {
- public:
-  virtual ~TestFactoryBase() {}
-
-  // Creates a test instance to run. The instance is both created and destroyed
-  // within TestInfoImpl::Run()
-  virtual Test* CreateTest() = 0;
-
- protected:
-  TestFactoryBase() {}
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
-};
-
-// This class provides implementation of TeastFactoryBase interface.
-// It is used in TEST and TEST_F macros.
-template <class TestClass>
-class TestFactoryImpl : public TestFactoryBase {
- public:
-  Test* CreateTest() override { return new TestClass; }
-};
-
-#if GTEST_OS_WINDOWS
-
-// Predicate-formatters for implementing the HRESULT checking macros
-// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
-// We pass a long instead of HRESULT to avoid causing an
-// include dependency for the HRESULT type.
-GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
-                                            long hr);  // NOLINT
-GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
-                                            long hr);  // NOLINT
-
-#endif  // GTEST_OS_WINDOWS
-
-// Types of SetUpTestSuite() and TearDownTestSuite() functions.
-using SetUpTestSuiteFunc = void (*)();
-using TearDownTestSuiteFunc = void (*)();
-
-struct CodeLocation {
-  CodeLocation(const std::string& a_file, int a_line)
-      : file(a_file), line(a_line) {}
-
-  std::string file;
-  int line;
-};
-
-//  Helper to identify which setup function for TestCase / TestSuite to call.
-//  Only one function is allowed, either TestCase or TestSute but not both.
-
-// Utility functions to help SuiteApiResolver
-using SetUpTearDownSuiteFuncType = void (*)();
-
-inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
-    SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) {
-  return a == def ? nullptr : a;
-}
-
-template <typename T>
-//  Note that SuiteApiResolver inherits from T because
-//  SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way
-//  SuiteApiResolver can access them.
-struct SuiteApiResolver : T {
-  // testing::Test is only forward declared at this point. So we make it a
-  // dependend class for the compiler to be OK with it.
-  using Test =
-      typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
-
-  static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename,
-                                                        int line_num) {
-    SetUpTearDownSuiteFuncType test_case_fp =
-        GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase);
-    SetUpTearDownSuiteFuncType test_suite_fp =
-        GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite);
-
-    GTEST_CHECK_(!test_case_fp || !test_suite_fp)
-        << "Test can not provide both SetUpTestSuite and SetUpTestCase, please "
-           "make sure there is only one present at "
-        << filename << ":" << line_num;
-
-    return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
-  }
-
-  static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename,
-                                                           int line_num) {
-    SetUpTearDownSuiteFuncType test_case_fp =
-        GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase);
-    SetUpTearDownSuiteFuncType test_suite_fp =
-        GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite);
-
-    GTEST_CHECK_(!test_case_fp || !test_suite_fp)
-        << "Test can not provide both TearDownTestSuite and TearDownTestCase,"
-           " please make sure there is only one present at"
-        << filename << ":" << line_num;
-
-    return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
-  }
-};
-
-// Creates a new TestInfo object and registers it with Google Test;
-// returns the created object.
-//
-// Arguments:
-//
-//   test_suite_name:   name of the test suite
-//   name:             name of the test
-//   type_param        the name of the test's type parameter, or NULL if
-//                     this is not a typed or a type-parameterized test.
-//   value_param       text representation of the test's value parameter,
-//                     or NULL if this is not a type-parameterized test.
-//   code_location:    code location where the test is defined
-//   fixture_class_id: ID of the test fixture class
-//   set_up_tc:        pointer to the function that sets up the test suite
-//   tear_down_tc:     pointer to the function that tears down the test suite
-//   factory:          pointer to the factory that creates a test object.
-//                     The newly created TestInfo instance will assume
-//                     ownership of the factory object.
-GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
-    const char* test_suite_name, const char* name, const char* type_param,
-    const char* value_param, CodeLocation code_location,
-    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
-    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory);
-
-// If *pstr starts with the given prefix, modifies *pstr to be right
-// past the prefix and returns true; otherwise leaves *pstr unchanged
-// and returns false.  None of pstr, *pstr, and prefix can be NULL.
-GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-// State of the definition of a type-parameterized test suite.
-class GTEST_API_ TypedTestSuitePState {
- public:
-  TypedTestSuitePState() : registered_(false) {}
-
-  // Adds the given test name to defined_test_names_ and return true
-  // if the test suite hasn't been registered; otherwise aborts the
-  // program.
-  bool AddTestName(const char* file, int line, const char* case_name,
-                   const char* test_name) {
-    if (registered_) {
-      fprintf(stderr,
-              "%s Test %s must be defined before "
-              "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n",
-              FormatFileLocation(file, line).c_str(), test_name, case_name);
-      fflush(stderr);
-      posix::Abort();
-    }
-    registered_tests_.insert(
-        ::std::make_pair(test_name, CodeLocation(file, line)));
-    return true;
-  }
-
-  bool TestExists(const std::string& test_name) const {
-    return registered_tests_.count(test_name) > 0;
-  }
-
-  const CodeLocation& GetCodeLocation(const std::string& test_name) const {
-    RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
-    GTEST_CHECK_(it != registered_tests_.end());
-    return it->second;
-  }
-
-  // Verifies that registered_tests match the test names in
-  // defined_test_names_; returns registered_tests if successful, or
-  // aborts the program otherwise.
-  const char* VerifyRegisteredTestNames(
-      const char* file, int line, const char* registered_tests);
-
- private:
-  typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
-
-  bool registered_;
-  RegisteredTestsMap registered_tests_;
-};
-
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-using TypedTestCasePState = TypedTestSuitePState;
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-// Skips to the first non-space char after the first comma in 'str';
-// returns NULL if no comma is found in 'str'.
-inline const char* SkipComma(const char* str) {
-  const char* comma = strchr(str, ',');
-  if (comma == nullptr) {
-    return nullptr;
-  }
-  while (IsSpace(*(++comma))) {}
-  return comma;
-}
-
-// Returns the prefix of 'str' before the first comma in it; returns
-// the entire string if it contains no comma.
-inline std::string GetPrefixUntilComma(const char* str) {
-  const char* comma = strchr(str, ',');
-  return comma == nullptr ? str : std::string(str, comma);
-}
-
-// Splits a given string on a given delimiter, populating a given
-// vector with the fields.
-void SplitString(const ::std::string& str, char delimiter,
-                 ::std::vector< ::std::string>* dest);
-
-// The default argument to the template below for the case when the user does
-// not provide a name generator.
-struct DefaultNameGenerator {
-  template <typename T>
-  static std::string GetName(int i) {
-    return StreamableToString(i);
-  }
-};
-
-template <typename Provided = DefaultNameGenerator>
-struct NameGeneratorSelector {
-  typedef Provided type;
-};
-
-template <typename NameGenerator>
-void GenerateNamesRecursively(Types0, std::vector<std::string>*, int) {}
-
-template <typename NameGenerator, typename Types>
-void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
-  result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
-  GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
-                                          i + 1);
-}
-
-template <typename NameGenerator, typename Types>
-std::vector<std::string> GenerateNames() {
-  std::vector<std::string> result;
-  GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
-  return result;
-}
-
-// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
-// registers a list of type-parameterized tests with Google Test.  The
-// return value is insignificant - we just need to return something
-// such that we can call this function in a namespace scope.
-//
-// Implementation note: The GTEST_TEMPLATE_ macro declares a template
-// template parameter.  It's defined in gtest-type-util.h.
-template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
-class TypeParameterizedTest {
- public:
-  // 'index' is the index of the test in the type list 'Types'
-  // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite,
-  // Types).  Valid values for 'index' are [0, N - 1] where N is the
-  // length of Types.
-  static bool Register(const char* prefix, const CodeLocation& code_location,
-                       const char* case_name, const char* test_names, int index,
-                       const std::vector<std::string>& type_names =
-                           GenerateNames<DefaultNameGenerator, Types>()) {
-    typedef typename Types::Head Type;
-    typedef Fixture<Type> FixtureClass;
-    typedef typename GTEST_BIND_(TestSel, Type) TestClass;
-
-    // First, registers the first type-parameterized test in the type
-    // list.
-    MakeAndRegisterTestInfo(
-        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
-         "/" + type_names[static_cast<size_t>(index)])
-            .c_str(),
-        StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
-        GetTypeName<Type>().c_str(),
-        nullptr,  // No value parameter.
-        code_location, GetTypeId<FixtureClass>(),
-        SuiteApiResolver<TestClass>::GetSetUpCaseOrSuite(
-            code_location.file.c_str(), code_location.line),
-        SuiteApiResolver<TestClass>::GetTearDownCaseOrSuite(
-            code_location.file.c_str(), code_location.line),
-        new TestFactoryImpl<TestClass>);
-
-    // Next, recurses (at compile time) with the tail of the type list.
-    return TypeParameterizedTest<Fixture, TestSel,
-                                 typename Types::Tail>::Register(prefix,
-                                                                 code_location,
-                                                                 case_name,
-                                                                 test_names,
-                                                                 index + 1,
-                                                                 type_names);
-  }
-};
-
-// The base case for the compile time recursion.
-template <GTEST_TEMPLATE_ Fixture, class TestSel>
-class TypeParameterizedTest<Fixture, TestSel, Types0> {
- public:
-  static bool Register(const char* /*prefix*/, const CodeLocation&,
-                       const char* /*case_name*/, const char* /*test_names*/,
-                       int /*index*/,
-                       const std::vector<std::string>& =
-                           std::vector<std::string>() /*type_names*/) {
-    return true;
-  }
-};
-
-// TypeParameterizedTestSuite<Fixture, Tests, Types>::Register()
-// registers *all combinations* of 'Tests' and 'Types' with Google
-// Test.  The return value is insignificant - we just need to return
-// something such that we can call this function in a namespace scope.
-template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
-class TypeParameterizedTestSuite {
- public:
-  static bool Register(const char* prefix, CodeLocation code_location,
-                       const TypedTestSuitePState* state, const char* case_name,
-                       const char* test_names,
-                       const std::vector<std::string>& type_names =
-                           GenerateNames<DefaultNameGenerator, Types>()) {
-    std::string test_name = StripTrailingSpaces(
-        GetPrefixUntilComma(test_names));
-    if (!state->TestExists(test_name)) {
-      fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
-              case_name, test_name.c_str(),
-              FormatFileLocation(code_location.file.c_str(),
-                                 code_location.line).c_str());
-      fflush(stderr);
-      posix::Abort();
-    }
-    const CodeLocation& test_location = state->GetCodeLocation(test_name);
-
-    typedef typename Tests::Head Head;
-
-    // First, register the first test in 'Test' for each type in 'Types'.
-    TypeParameterizedTest<Fixture, Head, Types>::Register(
-        prefix, test_location, case_name, test_names, 0, type_names);
-
-    // Next, recurses (at compile time) with the tail of the test list.
-    return TypeParameterizedTestSuite<Fixture, typename Tests::Tail,
-                                      Types>::Register(prefix, code_location,
-                                                       state, case_name,
-                                                       SkipComma(test_names),
-                                                       type_names);
-  }
-};
-
-// The base case for the compile time recursion.
-template <GTEST_TEMPLATE_ Fixture, typename Types>
-class TypeParameterizedTestSuite<Fixture, Templates0, Types> {
- public:
-  static bool Register(const char* /*prefix*/, const CodeLocation&,
-                       const TypedTestSuitePState* /*state*/,
-                       const char* /*case_name*/, const char* /*test_names*/,
-                       const std::vector<std::string>& =
-                           std::vector<std::string>() /*type_names*/) {
-    return true;
-  }
-};
-
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// Returns the current OS stack trace as an std::string.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
-// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
-    UnitTest* unit_test, int skip_count);
-
-// Helpers for suppressing warnings on unreachable code or constant
-// condition.
-
-// Always returns true.
-GTEST_API_ bool AlwaysTrue();
-
-// Always returns false.
-inline bool AlwaysFalse() { return !AlwaysTrue(); }
-
-// Helper for suppressing false warning from Clang on a const char*
-// variable declared in a conditional expression always being NULL in
-// the else branch.
-struct GTEST_API_ ConstCharPtr {
-  ConstCharPtr(const char* str) : value(str) {}
-  operator bool() const { return true; }
-  const char* value;
-};
-
-// A simple Linear Congruential Generator for generating random
-// numbers with a uniform distribution.  Unlike rand() and srand(), it
-// doesn't use global state (and therefore can't interfere with user
-// code).  Unlike rand_r(), it's portable.  An LCG isn't very random,
-// but it's good enough for our purposes.
-class GTEST_API_ Random {
- public:
-  static const UInt32 kMaxRange = 1u << 31;
-
-  explicit Random(UInt32 seed) : state_(seed) {}
-
-  void Reseed(UInt32 seed) { state_ = seed; }
-
-  // Generates a random number from [0, range).  Crashes if 'range' is
-  // 0 or greater than kMaxRange.
-  UInt32 Generate(UInt32 range);
-
- private:
-  UInt32 state_;
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
-};
-
-// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
-// compiler error iff T1 and T2 are different types.
-template <typename T1, typename T2>
-struct CompileAssertTypesEqual;
-
-template <typename T>
-struct CompileAssertTypesEqual<T, T> {
-};
-
-// Removes the reference from a type if it is a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::remove_reference, which is not widely available yet.
-template <typename T>
-struct RemoveReference { typedef T type; };  // NOLINT
-template <typename T>
-struct RemoveReference<T&> { typedef T type; };  // NOLINT
-
-// A handy wrapper around RemoveReference that works when the argument
-// T depends on template parameters.
-#define GTEST_REMOVE_REFERENCE_(T) \
-    typename ::testing::internal::RemoveReference<T>::type
-
-// Removes const from a type if it is a const type, otherwise leaves
-// it unchanged.  This is the same as tr1::remove_const, which is not
-// widely available yet.
-template <typename T>
-struct RemoveConst { typedef T type; };  // NOLINT
-template <typename T>
-struct RemoveConst<const T> { typedef T type; };  // NOLINT
-
-// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
-// definition to fail to remove the const in 'const int[3]' and 'const
-// char[3][4]'.  The following specialization works around the bug.
-template <typename T, size_t N>
-struct RemoveConst<const T[N]> {
-  typedef typename RemoveConst<T>::type type[N];
-};
-
-// A handy wrapper around RemoveConst that works when the argument
-// T depends on template parameters.
-#define GTEST_REMOVE_CONST_(T) \
-    typename ::testing::internal::RemoveConst<T>::type
-
-// Turns const U&, U&, const U, and U all into U.
-#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
-    GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
-
-// IsAProtocolMessage<T>::value is a compile-time bool constant that's
-// true iff T is type proto2::Message or a subclass of it.
-template <typename T>
-struct IsAProtocolMessage
-    : public bool_constant<
-  std::is_convertible<const T*, const ::proto2::Message*>::value> {
-};
-
-// When the compiler sees expression IsContainerTest<C>(0), if C is an
-// STL-style container class, the first overload of IsContainerTest
-// will be viable (since both C::iterator* and C::const_iterator* are
-// valid types and NULL can be implicitly converted to them).  It will
-// be picked over the second overload as 'int' is a perfect match for
-// the type of argument 0.  If C::iterator or C::const_iterator is not
-// a valid type, the first overload is not viable, and the second
-// overload will be picked.  Therefore, we can determine whether C is
-// a container class by checking the type of IsContainerTest<C>(0).
-// The value of the expression is insignificant.
-//
-// In C++11 mode we check the existence of a const_iterator and that an
-// iterator is properly implemented for the container.
-//
-// For pre-C++11 that we look for both C::iterator and C::const_iterator.
-// The reason is that C++ injects the name of a class as a member of the
-// class itself (e.g. you can refer to class iterator as either
-// 'iterator' or 'iterator::iterator').  If we look for C::iterator
-// only, for example, we would mistakenly think that a class named
-// iterator is an STL container.
-//
-// Also note that the simpler approach of overloading
-// IsContainerTest(typename C::const_iterator*) and
-// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
-typedef int IsContainer;
-template <class C,
-          class Iterator = decltype(::std::declval<const C&>().begin()),
-          class = decltype(::std::declval<const C&>().end()),
-          class = decltype(++::std::declval<Iterator&>()),
-          class = decltype(*::std::declval<Iterator>()),
-          class = typename C::const_iterator>
-IsContainer IsContainerTest(int /* dummy */) {
-  return 0;
-}
-
-typedef char IsNotContainer;
-template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
-
-// Trait to detect whether a type T is a hash table.
-// The heuristic used is that the type contains an inner type `hasher` and does
-// not contain an inner type `reverse_iterator`.
-// If the container is iterable in reverse, then order might actually matter.
-template <typename T>
-struct IsHashTable {
- private:
-  template <typename U>
-  static char test(typename U::hasher*, typename U::reverse_iterator*);
-  template <typename U>
-  static int test(typename U::hasher*, ...);
-  template <typename U>
-  static char test(...);
-
- public:
-  static const bool value = sizeof(test<T>(nullptr, nullptr)) == sizeof(int);
-};
-
-template <typename T>
-const bool IsHashTable<T>::value;
-
-template <typename C,
-          bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer)>
-struct IsRecursiveContainerImpl;
-
-template <typename C>
-struct IsRecursiveContainerImpl<C, false> : public false_type {};
-
-// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to
-// obey the same inconsistencies as the IsContainerTest, namely check if
-// something is a container is relying on only const_iterator in C++11 and
-// is relying on both const_iterator and iterator otherwise
-template <typename C>
-struct IsRecursiveContainerImpl<C, true> {
-  using value_type = decltype(*std::declval<typename C::const_iterator>());
-  using type =
-      is_same<typename std::remove_const<
-                  typename std::remove_reference<value_type>::type>::type,
-              C>;
-};
-
-// IsRecursiveContainer<Type> is a unary compile-time predicate that
-// evaluates whether C is a recursive container type. A recursive container
-// type is a container type whose value_type is equal to the container type
-// itself. An example for a recursive container type is
-// boost::filesystem::path, whose iterator has a value_type that is equal to
-// boost::filesystem::path.
-template <typename C>
-struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
-
-// EnableIf<condition>::type is void when 'Cond' is true, and
-// undefined when 'Cond' is false.  To use SFINAE to make a function
-// overload only apply when a particular expression is true, add
-// "typename EnableIf<expression>::type* = 0" as the last parameter.
-template<bool> struct EnableIf;
-template<> struct EnableIf<true> { typedef void type; };  // NOLINT
-
-// Utilities for native arrays.
-
-// ArrayEq() compares two k-dimensional native arrays using the
-// elements' operator==, where k can be any integer >= 0.  When k is
-// 0, ArrayEq() degenerates into comparing a single pair of values.
-
-template <typename T, typename U>
-bool ArrayEq(const T* lhs, size_t size, const U* rhs);
-
-// This generic version is used when k is 0.
-template <typename T, typename U>
-inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
-
-// This overload is used when k >= 1.
-template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
-  return internal::ArrayEq(lhs, N, rhs);
-}
-
-// This helper reduces code bloat.  If we instead put its logic inside
-// the previous ArrayEq() function, arrays with different sizes would
-// lead to different copies of the template code.
-template <typename T, typename U>
-bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
-  for (size_t i = 0; i != size; i++) {
-    if (!internal::ArrayEq(lhs[i], rhs[i]))
-      return false;
-  }
-  return true;
-}
-
-// Finds the first element in the iterator range [begin, end) that
-// equals elem.  Element may be a native array type itself.
-template <typename Iter, typename Element>
-Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
-  for (Iter it = begin; it != end; ++it) {
-    if (internal::ArrayEq(*it, elem))
-      return it;
-  }
-  return end;
-}
-
-// CopyArray() copies a k-dimensional native array using the elements'
-// operator=, where k can be any integer >= 0.  When k is 0,
-// CopyArray() degenerates into copying a single value.
-
-template <typename T, typename U>
-void CopyArray(const T* from, size_t size, U* to);
-
-// This generic version is used when k is 0.
-template <typename T, typename U>
-inline void CopyArray(const T& from, U* to) { *to = from; }
-
-// This overload is used when k >= 1.
-template <typename T, typename U, size_t N>
-inline void CopyArray(const T(&from)[N], U(*to)[N]) {
-  internal::CopyArray(from, N, *to);
-}
-
-// This helper reduces code bloat.  If we instead put its logic inside
-// the previous CopyArray() function, arrays with different sizes
-// would lead to different copies of the template code.
-template <typename T, typename U>
-void CopyArray(const T* from, size_t size, U* to) {
-  for (size_t i = 0; i != size; i++) {
-    internal::CopyArray(from[i], to + i);
-  }
-}
-
-// The relation between an NativeArray object (see below) and the
-// native array it represents.
-// We use 2 different structs to allow non-copyable types to be used, as long
-// as RelationToSourceReference() is passed.
-struct RelationToSourceReference {};
-struct RelationToSourceCopy {};
-
-// Adapts a native array to a read-only STL-style container.  Instead
-// of the complete STL container concept, this adaptor only implements
-// members useful for Google Mock's container matchers.  New members
-// should be added as needed.  To simplify the implementation, we only
-// support Element being a raw type (i.e. having no top-level const or
-// reference modifier).  It's the client's responsibility to satisfy
-// this requirement.  Element can be an array type itself (hence
-// multi-dimensional arrays are supported).
-template <typename Element>
-class NativeArray {
- public:
-  // STL-style container typedefs.
-  typedef Element value_type;
-  typedef Element* iterator;
-  typedef const Element* const_iterator;
-
-  // Constructs from a native array. References the source.
-  NativeArray(const Element* array, size_t count, RelationToSourceReference) {
-    InitRef(array, count);
-  }
-
-  // Constructs from a native array. Copies the source.
-  NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
-    InitCopy(array, count);
-  }
-
-  // Copy constructor.
-  NativeArray(const NativeArray& rhs) {
-    (this->*rhs.clone_)(rhs.array_, rhs.size_);
-  }
-
-  ~NativeArray() {
-    if (clone_ != &NativeArray::InitRef)
-      delete[] array_;
-  }
-
-  // STL-style container methods.
-  size_t size() const { return size_; }
-  const_iterator begin() const { return array_; }
-  const_iterator end() const { return array_ + size_; }
-  bool operator==(const NativeArray& rhs) const {
-    return size() == rhs.size() &&
-        ArrayEq(begin(), size(), rhs.begin());
-  }
-
- private:
-  enum {
-    kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
-        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value
-  };
-
-  // Initializes this object with a copy of the input.
-  void InitCopy(const Element* array, size_t a_size) {
-    Element* const copy = new Element[a_size];
-    CopyArray(array, a_size, copy);
-    array_ = copy;
-    size_ = a_size;
-    clone_ = &NativeArray::InitCopy;
-  }
-
-  // Initializes this object with a reference of the input.
-  void InitRef(const Element* array, size_t a_size) {
-    array_ = array;
-    size_ = a_size;
-    clone_ = &NativeArray::InitRef;
-  }
-
-  const Element* array_;
-  size_t size_;
-  void (NativeArray::*clone_)(const Element*, size_t);
-
-  GTEST_DISALLOW_ASSIGN_(NativeArray);
-};
-
-// Backport of std::index_sequence.
-template <size_t... Is>
-struct IndexSequence {
-  using type = IndexSequence;
-};
-
-// Double the IndexSequence, and one if plus_one is true.
-template <bool plus_one, typename T, size_t sizeofT>
-struct DoubleSequence;
-template <size_t... I, size_t sizeofT>
-struct DoubleSequence<true, IndexSequence<I...>, sizeofT> {
-  using type = IndexSequence<I..., (sizeofT + I)..., 2 * sizeofT>;
-};
-template <size_t... I, size_t sizeofT>
-struct DoubleSequence<false, IndexSequence<I...>, sizeofT> {
-  using type = IndexSequence<I..., (sizeofT + I)...>;
-};
-
-// Backport of std::make_index_sequence.
-// It uses O(ln(N)) instantiation depth.
-template <size_t N>
-struct MakeIndexSequence
-    : DoubleSequence<N % 2 == 1, typename MakeIndexSequence<N / 2>::type,
-                     N / 2>::type {};
-
-template <>
-struct MakeIndexSequence<0> : IndexSequence<> {};
-
-// FIXME: This implementation of ElemFromList is O(1) in instantiation depth,
-// but it is O(N^2) in total instantiations. Not sure if this is the best
-// tradeoff, as it will make it somewhat slow to compile.
-template <typename T, size_t, size_t>
-struct ElemFromListImpl {};
-
-template <typename T, size_t I>
-struct ElemFromListImpl<T, I, I> {
-  using type = T;
-};
-
-// Get the Nth element from T...
-// It uses O(1) instantiation depth.
-template <size_t N, typename I, typename... T>
-struct ElemFromList;
-
-template <size_t N, size_t... I, typename... T>
-struct ElemFromList<N, IndexSequence<I...>, T...>
-    : ElemFromListImpl<T, N, I>... {};
-
-template <typename... T>
-class FlatTuple;
-
-template <typename Derived, size_t I>
-struct FlatTupleElemBase;
-
-template <typename... T, size_t I>
-struct FlatTupleElemBase<FlatTuple<T...>, I> {
-  using value_type =
-      typename ElemFromList<I, typename MakeIndexSequence<sizeof...(T)>::type,
-                            T...>::type;
-  FlatTupleElemBase() = default;
-  explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {}
-  value_type value;
-};
-
-template <typename Derived, typename Idx>
-struct FlatTupleBase;
-
-template <size_t... Idx, typename... T>
-struct FlatTupleBase<FlatTuple<T...>, IndexSequence<Idx...>>
-    : FlatTupleElemBase<FlatTuple<T...>, Idx>... {
-  using Indices = IndexSequence<Idx...>;
-  FlatTupleBase() = default;
-  explicit FlatTupleBase(T... t)
-      : FlatTupleElemBase<FlatTuple<T...>, Idx>(std::move(t))... {}
-};
-
-// Analog to std::tuple but with different tradeoffs.
-// This class minimizes the template instantiation depth, thus allowing more
-// elements that std::tuple would. std::tuple has been seen to require an
-// instantiation depth of more than 10x the number of elements in some
-// implementations.
-// FlatTuple and ElemFromList are not recursive and have a fixed depth
-// regardless of T...
-// MakeIndexSequence, on the other hand, it is recursive but with an
-// instantiation depth of O(ln(N)).
-template <typename... T>
-class FlatTuple
-    : private FlatTupleBase<FlatTuple<T...>,
-                            typename MakeIndexSequence<sizeof...(T)>::type> {
-  using Indices = typename FlatTuple::FlatTupleBase::Indices;
-
- public:
-  FlatTuple() = default;
-  explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {}
-
-  template <size_t I>
-  const typename ElemFromList<I, Indices, T...>::type& Get() const {
-    return static_cast<const FlatTupleElemBase<FlatTuple, I>*>(this)->value;
-  }
-
-  template <size_t I>
-  typename ElemFromList<I, Indices, T...>::type& Get() {
-    return static_cast<FlatTupleElemBase<FlatTuple, I>*>(this)->value;
-  }
-};
-
-// Utility functions to be called with static_assert to induce deprecation
-// warnings.
-GTEST_INTERNAL_DEPRECATED(
-    "INSTANTIATE_TEST_CASE_P is deprecated, please use "
-    "INSTANTIATE_TEST_SUITE_P")
-constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; }
-
-GTEST_INTERNAL_DEPRECATED(
-    "TYPED_TEST_CASE_P is deprecated, please use "
-    "TYPED_TEST_SUITE_P")
-constexpr bool TypedTestCase_P_IsDeprecated() { return true; }
-
-GTEST_INTERNAL_DEPRECATED(
-    "TYPED_TEST_CASE is deprecated, please use "
-    "TYPED_TEST_SUITE")
-constexpr bool TypedTestCaseIsDeprecated() { return true; }
-
-GTEST_INTERNAL_DEPRECATED(
-    "REGISTER_TYPED_TEST_CASE_P is deprecated, please use "
-    "REGISTER_TYPED_TEST_SUITE_P")
-constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; }
-
-GTEST_INTERNAL_DEPRECATED(
-    "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use "
-    "INSTANTIATE_TYPED_TEST_SUITE_P")
-constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; }
-
-}  // namespace internal
-}  // namespace testing
-
-#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
-  ::testing::internal::AssertHelper(result_type, file, line, message) \
-    = ::testing::Message()
-
-#define GTEST_MESSAGE_(message, result_type) \
-  GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
-
-#define GTEST_FATAL_FAILURE_(message) \
-  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
-
-#define GTEST_NONFATAL_FAILURE_(message) \
-  GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
-
-#define GTEST_SUCCESS_(message) \
-  GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
-
-#define GTEST_SKIP_(message) \
-  return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip)
-
-// Suppress MSVC warning 4072 (unreachable code) for the code following
-// statement if it returns or throws (or doesn't return or throw in some
-// situations).
-#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
-  if (::testing::internal::AlwaysTrue()) { statement; }
-
-#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
-    bool gtest_caught_expected = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (expected_exception const&) { \
-      gtest_caught_expected = true; \
-    } \
-    catch (...) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws a different type."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-    if (!gtest_caught_expected) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws nothing."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
-      fail(gtest_msg.value)
-
-#define GTEST_TEST_NO_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
-      fail("Expected: " #statement " doesn't throw an exception.\n" \
-           "  Actual: it throws.")
-
-#define GTEST_TEST_ANY_THROW_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    bool gtest_caught_any = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (...) { \
-      gtest_caught_any = true; \
-    } \
-    if (!gtest_caught_any) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
-      fail("Expected: " #statement " throws an exception.\n" \
-           "  Actual: it doesn't.")
-
-
-// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
-// either a boolean expression or an AssertionResult. text is a textual
-// represenation of expression as it was passed into the EXPECT_TRUE.
-#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (const ::testing::AssertionResult gtest_ar_ = \
-      ::testing::AssertionResult(expression)) \
-    ; \
-  else \
-    fail(::testing::internal::GetBoolAssertionFailureMessage(\
-        gtest_ar_, text, #actual, #expected).c_str())
-
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
-      fail("Expected: " #statement " doesn't generate new fatal " \
-           "failures in the current thread.\n" \
-           "  Actual: it does.")
-
-// Expands to the name of the class that implements the given test.
-#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
-  test_suite_name##_##test_name##_Test
-
-// Helper macro for defining tests.
-#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id)      \
-  class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                    \
-      : public parent_class {                                                 \
-   public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                   \
-                                                                              \
-   private:                                                                   \
-    virtual void TestBody();                                                  \
-    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,   \
-                                                           test_name));       \
-  };                                                                          \
-                                                                              \
-  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name,          \
-                                                    test_name)::test_info_ =  \
-      ::testing::internal::MakeAndRegisterTestInfo(                           \
-          #test_suite_name, #test_name, nullptr, nullptr,                     \
-          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
-          ::testing::internal::SuiteApiResolver<                              \
-              parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__),         \
-          ::testing::internal::SuiteApiResolver<                              \
-              parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__),      \
-          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(    \
-              test_suite_name, test_name)>);                                  \
-  void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-param-util.h b/deps/googletest/include/gtest/internal/gtest-param-util.h
deleted file mode 100644
index e900b3ffb..000000000
--- a/deps/googletest/include/gtest/internal/gtest-param-util.h
+++ /dev/null
@@ -1,880 +0,0 @@
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-// Type and function utilities for implementing parameterized tests.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-
-#include <ctype.h>
-
-#include <cassert>
-#include <iterator>
-#include <memory>
-#include <set>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
-#include "gtest/gtest-printers.h"
-
-namespace testing {
-// Input to a parameterized test name generator, describing a test parameter.
-// Consists of the parameter value and the integer parameter index.
-template <class ParamType>
-struct TestParamInfo {
-  TestParamInfo(const ParamType& a_param, size_t an_index) :
-    param(a_param),
-    index(an_index) {}
-  ParamType param;
-  size_t index;
-};
-
-// A builtin parameterized test name generator which returns the result of
-// testing::PrintToString.
-struct PrintToStringParamName {
-  template <class ParamType>
-  std::string operator()(const TestParamInfo<ParamType>& info) const {
-    return PrintToString(info.param);
-  }
-};
-
-namespace internal {
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-// Utility Functions
-
-// Outputs a message explaining invalid registration of different
-// fixture class for the same test suite. This may happen when
-// TEST_P macro is used to define two tests with the same name
-// but in different namespaces.
-GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
-                                           CodeLocation code_location);
-
-template <typename> class ParamGeneratorInterface;
-template <typename> class ParamGenerator;
-
-// Interface for iterating over elements provided by an implementation
-// of ParamGeneratorInterface<T>.
-template <typename T>
-class ParamIteratorInterface {
- public:
-  virtual ~ParamIteratorInterface() {}
-  // A pointer to the base generator instance.
-  // Used only for the purposes of iterator comparison
-  // to make sure that two iterators belong to the same generator.
-  virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
-  // Advances iterator to point to the next element
-  // provided by the generator. The caller is responsible
-  // for not calling Advance() on an iterator equal to
-  // BaseGenerator()->End().
-  virtual void Advance() = 0;
-  // Clones the iterator object. Used for implementing copy semantics
-  // of ParamIterator<T>.
-  virtual ParamIteratorInterface* Clone() const = 0;
-  // Dereferences the current iterator and provides (read-only) access
-  // to the pointed value. It is the caller's responsibility not to call
-  // Current() on an iterator equal to BaseGenerator()->End().
-  // Used for implementing ParamGenerator<T>::operator*().
-  virtual const T* Current() const = 0;
-  // Determines whether the given iterator and other point to the same
-  // element in the sequence generated by the generator.
-  // Used for implementing ParamGenerator<T>::operator==().
-  virtual bool Equals(const ParamIteratorInterface& other) const = 0;
-};
-
-// Class iterating over elements provided by an implementation of
-// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
-// and implements the const forward iterator concept.
-template <typename T>
-class ParamIterator {
- public:
-  typedef T value_type;
-  typedef const T& reference;
-  typedef ptrdiff_t difference_type;
-
-  // ParamIterator assumes ownership of the impl_ pointer.
-  ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
-  ParamIterator& operator=(const ParamIterator& other) {
-    if (this != &other)
-      impl_.reset(other.impl_->Clone());
-    return *this;
-  }
-
-  const T& operator*() const { return *impl_->Current(); }
-  const T* operator->() const { return impl_->Current(); }
-  // Prefix version of operator++.
-  ParamIterator& operator++() {
-    impl_->Advance();
-    return *this;
-  }
-  // Postfix version of operator++.
-  ParamIterator operator++(int /*unused*/) {
-    ParamIteratorInterface<T>* clone = impl_->Clone();
-    impl_->Advance();
-    return ParamIterator(clone);
-  }
-  bool operator==(const ParamIterator& other) const {
-    return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
-  }
-  bool operator!=(const ParamIterator& other) const {
-    return !(*this == other);
-  }
-
- private:
-  friend class ParamGenerator<T>;
-  explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
-  std::unique_ptr<ParamIteratorInterface<T> > impl_;
-};
-
-// ParamGeneratorInterface<T> is the binary interface to access generators
-// defined in other translation units.
-template <typename T>
-class ParamGeneratorInterface {
- public:
-  typedef T ParamType;
-
-  virtual ~ParamGeneratorInterface() {}
-
-  // Generator interface definition
-  virtual ParamIteratorInterface<T>* Begin() const = 0;
-  virtual ParamIteratorInterface<T>* End() const = 0;
-};
-
-// Wraps ParamGeneratorInterface<T> and provides general generator syntax
-// compatible with the STL Container concept.
-// This class implements copy initialization semantics and the contained
-// ParamGeneratorInterface<T> instance is shared among all copies
-// of the original object. This is possible because that instance is immutable.
-template<typename T>
-class ParamGenerator {
- public:
-  typedef ParamIterator<T> iterator;
-
-  explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
-  ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
-
-  ParamGenerator& operator=(const ParamGenerator& other) {
-    impl_ = other.impl_;
-    return *this;
-  }
-
-  iterator begin() const { return iterator(impl_->Begin()); }
-  iterator end() const { return iterator(impl_->End()); }
-
- private:
-  std::shared_ptr<const ParamGeneratorInterface<T> > impl_;
-};
-
-// Generates values from a range of two comparable values. Can be used to
-// generate sequences of user-defined types that implement operator+() and
-// operator<().
-// This class is used in the Range() function.
-template <typename T, typename IncrementT>
-class RangeGenerator : public ParamGeneratorInterface<T> {
- public:
-  RangeGenerator(T begin, T end, IncrementT step)
-      : begin_(begin), end_(end),
-        step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
-  ~RangeGenerator() override {}
-
-  ParamIteratorInterface<T>* Begin() const override {
-    return new Iterator(this, begin_, 0, step_);
-  }
-  ParamIteratorInterface<T>* End() const override {
-    return new Iterator(this, end_, end_index_, step_);
-  }
-
- private:
-  class Iterator : public ParamIteratorInterface<T> {
-   public:
-    Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
-             IncrementT step)
-        : base_(base), value_(value), index_(index), step_(step) {}
-    ~Iterator() override {}
-
-    const ParamGeneratorInterface<T>* BaseGenerator() const override {
-      return base_;
-    }
-    void Advance() override {
-      value_ = static_cast<T>(value_ + step_);
-      index_++;
-    }
-    ParamIteratorInterface<T>* Clone() const override {
-      return new Iterator(*this);
-    }
-    const T* Current() const override { return &value_; }
-    bool Equals(const ParamIteratorInterface<T>& other) const override {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const int other_index =
-          CheckedDowncastToActualType<const Iterator>(&other)->index_;
-      return index_ == other_index;
-    }
-
-   private:
-    Iterator(const Iterator& other)
-        : ParamIteratorInterface<T>(),
-          base_(other.base_), value_(other.value_), index_(other.index_),
-          step_(other.step_) {}
-
-    // No implementation - assignment is unsupported.
-    void operator=(const Iterator& other);
-
-    const ParamGeneratorInterface<T>* const base_;
-    T value_;
-    int index_;
-    const IncrementT step_;
-  };  // class RangeGenerator::Iterator
-
-  static int CalculateEndIndex(const T& begin,
-                               const T& end,
-                               const IncrementT& step) {
-    int end_index = 0;
-    for (T i = begin; i < end; i = static_cast<T>(i + step))
-      end_index++;
-    return end_index;
-  }
-
-  // No implementation - assignment is unsupported.
-  void operator=(const RangeGenerator& other);
-
-  const T begin_;
-  const T end_;
-  const IncrementT step_;
-  // The index for the end() iterator. All the elements in the generated
-  // sequence are indexed (0-based) to aid iterator comparison.
-  const int end_index_;
-};  // class RangeGenerator
-
-
-// Generates values from a pair of STL-style iterators. Used in the
-// ValuesIn() function. The elements are copied from the source range
-// since the source can be located on the stack, and the generator
-// is likely to persist beyond that stack frame.
-template <typename T>
-class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
- public:
-  template <typename ForwardIterator>
-  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
-      : container_(begin, end) {}
-  ~ValuesInIteratorRangeGenerator() override {}
-
-  ParamIteratorInterface<T>* Begin() const override {
-    return new Iterator(this, container_.begin());
-  }
-  ParamIteratorInterface<T>* End() const override {
-    return new Iterator(this, container_.end());
-  }
-
- private:
-  typedef typename ::std::vector<T> ContainerType;
-
-  class Iterator : public ParamIteratorInterface<T> {
-   public:
-    Iterator(const ParamGeneratorInterface<T>* base,
-             typename ContainerType::const_iterator iterator)
-        : base_(base), iterator_(iterator) {}
-    ~Iterator() override {}
-
-    const ParamGeneratorInterface<T>* BaseGenerator() const override {
-      return base_;
-    }
-    void Advance() override {
-      ++iterator_;
-      value_.reset();
-    }
-    ParamIteratorInterface<T>* Clone() const override {
-      return new Iterator(*this);
-    }
-    // We need to use cached value referenced by iterator_ because *iterator_
-    // can return a temporary object (and of type other then T), so just
-    // having "return &*iterator_;" doesn't work.
-    // value_ is updated here and not in Advance() because Advance()
-    // can advance iterator_ beyond the end of the range, and we cannot
-    // detect that fact. The client code, on the other hand, is
-    // responsible for not calling Current() on an out-of-range iterator.
-    const T* Current() const override {
-      if (value_.get() == nullptr) value_.reset(new T(*iterator_));
-      return value_.get();
-    }
-    bool Equals(const ParamIteratorInterface<T>& other) const override {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      return iterator_ ==
-          CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
-    }
-
-   private:
-    Iterator(const Iterator& other)
-          // The explicit constructor call suppresses a false warning
-          // emitted by gcc when supplied with the -Wextra option.
-        : ParamIteratorInterface<T>(),
-          base_(other.base_),
-          iterator_(other.iterator_) {}
-
-    const ParamGeneratorInterface<T>* const base_;
-    typename ContainerType::const_iterator iterator_;
-    // A cached value of *iterator_. We keep it here to allow access by
-    // pointer in the wrapping iterator's operator->().
-    // value_ needs to be mutable to be accessed in Current().
-    // Use of std::unique_ptr helps manage cached value's lifetime,
-    // which is bound by the lifespan of the iterator itself.
-    mutable std::unique_ptr<const T> value_;
-  };  // class ValuesInIteratorRangeGenerator::Iterator
-
-  // No implementation - assignment is unsupported.
-  void operator=(const ValuesInIteratorRangeGenerator& other);
-
-  const ContainerType container_;
-};  // class ValuesInIteratorRangeGenerator
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Default parameterized test name generator, returns a string containing the
-// integer test parameter index.
-template <class ParamType>
-std::string DefaultParamName(const TestParamInfo<ParamType>& info) {
-  Message name_stream;
-  name_stream << info.index;
-  return name_stream.GetString();
-}
-
-template <typename T = int>
-void TestNotEmpty() {
-  static_assert(sizeof(T) == 0, "Empty arguments are not allowed.");
-}
-template <typename T = int>
-void TestNotEmpty(const T&) {}
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Stores a parameter value and later creates tests parameterized with that
-// value.
-template <class TestClass>
-class ParameterizedTestFactory : public TestFactoryBase {
- public:
-  typedef typename TestClass::ParamType ParamType;
-  explicit ParameterizedTestFactory(ParamType parameter) :
-      parameter_(parameter) {}
-  Test* CreateTest() override {
-    TestClass::SetParam(&parameter_);
-    return new TestClass();
-  }
-
- private:
-  const ParamType parameter_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// TestMetaFactoryBase is a base class for meta-factories that create
-// test factories for passing into MakeAndRegisterTestInfo function.
-template <class ParamType>
-class TestMetaFactoryBase {
- public:
-  virtual ~TestMetaFactoryBase() {}
-
-  virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// TestMetaFactory creates test factories for passing into
-// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
-// ownership of test factory pointer, same factory object cannot be passed
-// into that method twice. But ParameterizedTestSuiteInfo is going to call
-// it for each Test/Parameter value combination. Thus it needs meta factory
-// creator class.
-template <class TestSuite>
-class TestMetaFactory
-    : public TestMetaFactoryBase<typename TestSuite::ParamType> {
- public:
-  using ParamType = typename TestSuite::ParamType;
-
-  TestMetaFactory() {}
-
-  TestFactoryBase* CreateTestFactory(ParamType parameter) override {
-    return new ParameterizedTestFactory<TestSuite>(parameter);
-  }
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// ParameterizedTestSuiteInfoBase is a generic interface
-// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase
-// accumulates test information provided by TEST_P macro invocations
-// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations
-// and uses that information to register all resulting test instances
-// in RegisterTests method. The ParameterizeTestSuiteRegistry class holds
-// a collection of pointers to the ParameterizedTestSuiteInfo objects
-// and calls RegisterTests() on each of them when asked.
-class ParameterizedTestSuiteInfoBase {
- public:
-  virtual ~ParameterizedTestSuiteInfoBase() {}
-
-  // Base part of test suite name for display purposes.
-  virtual const std::string& GetTestSuiteName() const = 0;
-  // Test case id to verify identity.
-  virtual TypeId GetTestSuiteTypeId() const = 0;
-  // UnitTest class invokes this method to register tests in this
-  // test suite right before running them in RUN_ALL_TESTS macro.
-  // This method should not be called more than once on any single
-  // instance of a ParameterizedTestSuiteInfoBase derived class.
-  virtual void RegisterTests() = 0;
-
- protected:
-  ParameterizedTestSuiteInfoBase() {}
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
-};
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P
-// macro invocations for a particular test suite and generators
-// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that
-// test suite. It registers tests with all values generated by all
-// generators when asked.
-template <class TestSuite>
-class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
- public:
-  // ParamType and GeneratorCreationFunc are private types but are required
-  // for declarations of public methods AddTestPattern() and
-  // AddTestSuiteInstantiation().
-  using ParamType = typename TestSuite::ParamType;
-  // A function that returns an instance of appropriate generator type.
-  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
-  using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType>&);
-
-  explicit ParameterizedTestSuiteInfo(const char* name,
-                                      CodeLocation code_location)
-      : test_suite_name_(name), code_location_(code_location) {}
-
-  // Test case base name for display purposes.
-  const std::string& GetTestSuiteName() const override {
-    return test_suite_name_;
-  }
-  // Test case id to verify identity.
-  TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
-  // TEST_P macro uses AddTestPattern() to record information
-  // about a single test in a LocalTestInfo structure.
-  // test_suite_name is the base name of the test suite (without invocation
-  // prefix). test_base_name is the name of an individual test without
-  // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
-  // test suite base name and DoBar is test base name.
-  void AddTestPattern(const char* test_suite_name, const char* test_base_name,
-                      TestMetaFactoryBase<ParamType>* meta_factory) {
-    tests_.push_back(std::shared_ptr<TestInfo>(
-        new TestInfo(test_suite_name, test_base_name, meta_factory)));
-  }
-  // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information
-  // about a generator.
-  int AddTestSuiteInstantiation(const std::string& instantiation_name,
-                                GeneratorCreationFunc* func,
-                                ParamNameGeneratorFunc* name_func,
-                                const char* file, int line) {
-    instantiations_.push_back(
-        InstantiationInfo(instantiation_name, func, name_func, file, line));
-    return 0;  // Return value used only to run this method in namespace scope.
-  }
-  // UnitTest class invokes this method to register tests in this test suite
-  // test suites right before running tests in RUN_ALL_TESTS macro.
-  // This method should not be called more than once on any single
-  // instance of a ParameterizedTestSuiteInfoBase derived class.
-  // UnitTest has a guard to prevent from calling this method more than once.
-  void RegisterTests() override {
-    for (typename TestInfoContainer::iterator test_it = tests_.begin();
-         test_it != tests_.end(); ++test_it) {
-      std::shared_ptr<TestInfo> test_info = *test_it;
-      for (typename InstantiationContainer::iterator gen_it =
-               instantiations_.begin(); gen_it != instantiations_.end();
-               ++gen_it) {
-        const std::string& instantiation_name = gen_it->name;
-        ParamGenerator<ParamType> generator((*gen_it->generator)());
-        ParamNameGeneratorFunc* name_func = gen_it->name_func;
-        const char* file = gen_it->file;
-        int line = gen_it->line;
-
-        std::string test_suite_name;
-        if ( !instantiation_name.empty() )
-          test_suite_name = instantiation_name + "/";
-        test_suite_name += test_info->test_suite_base_name;
-
-        size_t i = 0;
-        std::set<std::string> test_param_names;
-        for (typename ParamGenerator<ParamType>::iterator param_it =
-                 generator.begin();
-             param_it != generator.end(); ++param_it, ++i) {
-          Message test_name_stream;
-
-          std::string param_name = name_func(
-              TestParamInfo<ParamType>(*param_it, i));
-
-          GTEST_CHECK_(IsValidParamName(param_name))
-              << "Parameterized test name '" << param_name
-              << "' is invalid, in " << file
-              << " line " << line << std::endl;
-
-          GTEST_CHECK_(test_param_names.count(param_name) == 0)
-              << "Duplicate parameterized test name '" << param_name
-              << "', in " << file << " line " << line << std::endl;
-
-          test_param_names.insert(param_name);
-
-          test_name_stream << test_info->test_base_name << "/" << param_name;
-          MakeAndRegisterTestInfo(
-              test_suite_name.c_str(), test_name_stream.GetString().c_str(),
-              nullptr,  // No type parameter.
-              PrintToString(*param_it).c_str(), code_location_,
-              GetTestSuiteTypeId(),
-              SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
-              SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
-              test_info->test_meta_factory->CreateTestFactory(*param_it));
-        }  // for param_it
-      }  // for gen_it
-    }  // for test_it
-  }    // RegisterTests
-
- private:
-  // LocalTestInfo structure keeps information about a single test registered
-  // with TEST_P macro.
-  struct TestInfo {
-    TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name,
-             TestMetaFactoryBase<ParamType>* a_test_meta_factory)
-        : test_suite_base_name(a_test_suite_base_name),
-          test_base_name(a_test_base_name),
-          test_meta_factory(a_test_meta_factory) {}
-
-    const std::string test_suite_base_name;
-    const std::string test_base_name;
-    const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
-  };
-  using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >;
-  // Records data received from INSTANTIATE_TEST_SUITE_P macros:
-  //  <Instantiation name, Sequence generator creation function,
-  //     Name generator function, Source file, Source line>
-  struct InstantiationInfo {
-      InstantiationInfo(const std::string &name_in,
-                        GeneratorCreationFunc* generator_in,
-                        ParamNameGeneratorFunc* name_func_in,
-                        const char* file_in,
-                        int line_in)
-          : name(name_in),
-            generator(generator_in),
-            name_func(name_func_in),
-            file(file_in),
-            line(line_in) {}
-
-      std::string name;
-      GeneratorCreationFunc* generator;
-      ParamNameGeneratorFunc* name_func;
-      const char* file;
-      int line;
-  };
-  typedef ::std::vector<InstantiationInfo> InstantiationContainer;
-
-  static bool IsValidParamName(const std::string& name) {
-    // Check for empty string
-    if (name.empty())
-      return false;
-
-    // Check for invalid characters
-    for (std::string::size_type index = 0; index < name.size(); ++index) {
-      if (!isalnum(name[index]) && name[index] != '_')
-        return false;
-    }
-
-    return true;
-  }
-
-  const std::string test_suite_name_;
-  CodeLocation code_location_;
-  TestInfoContainer tests_;
-  InstantiationContainer instantiations_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo);
-};  // class ParameterizedTestSuiteInfo
-
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-template <class TestCase>
-using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo<TestCase>;
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// ParameterizedTestSuiteRegistry contains a map of
-// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P
-// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding
-// ParameterizedTestSuiteInfo descriptors.
-class ParameterizedTestSuiteRegistry {
- public:
-  ParameterizedTestSuiteRegistry() {}
-  ~ParameterizedTestSuiteRegistry() {
-    for (auto& test_suite_info : test_suite_infos_) {
-      delete test_suite_info;
-    }
-  }
-
-  // Looks up or creates and returns a structure containing information about
-  // tests and instantiations of a particular test suite.
-  template <class TestSuite>
-  ParameterizedTestSuiteInfo<TestSuite>* GetTestSuitePatternHolder(
-      const char* test_suite_name, CodeLocation code_location) {
-    ParameterizedTestSuiteInfo<TestSuite>* typed_test_info = nullptr;
-    for (auto& test_suite_info : test_suite_infos_) {
-      if (test_suite_info->GetTestSuiteName() == test_suite_name) {
-        if (test_suite_info->GetTestSuiteTypeId() != GetTypeId<TestSuite>()) {
-          // Complain about incorrect usage of Google Test facilities
-          // and terminate the program since we cannot guaranty correct
-          // test suite setup and tear-down in this case.
-          ReportInvalidTestSuiteType(test_suite_name, code_location);
-          posix::Abort();
-        } else {
-          // At this point we are sure that the object we found is of the same
-          // type we are looking for, so we downcast it to that type
-          // without further checks.
-          typed_test_info = CheckedDowncastToActualType<
-              ParameterizedTestSuiteInfo<TestSuite> >(test_suite_info);
-        }
-        break;
-      }
-    }
-    if (typed_test_info == nullptr) {
-      typed_test_info = new ParameterizedTestSuiteInfo<TestSuite>(
-          test_suite_name, code_location);
-      test_suite_infos_.push_back(typed_test_info);
-    }
-    return typed_test_info;
-  }
-  void RegisterTests() {
-    for (auto& test_suite_info : test_suite_infos_) {
-      test_suite_info->RegisterTests();
-    }
-  }
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  template <class TestCase>
-  ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
-      const char* test_case_name, CodeLocation code_location) {
-    return GetTestSuitePatternHolder<TestCase>(test_case_name, code_location);
-  }
-
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
- private:
-  using TestSuiteInfoContainer = ::std::vector<ParameterizedTestSuiteInfoBase*>;
-
-  TestSuiteInfoContainer test_suite_infos_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
-};
-
-}  // namespace internal
-
-// Forward declarations of ValuesIn(), which is implemented in
-// include/gtest/gtest-param-test.h.
-template <class Container>
-internal::ParamGenerator<typename Container::value_type> ValuesIn(
-    const Container& container);
-
-namespace internal {
-// Used in the Values() function to provide polymorphic capabilities.
-
-template <typename... Ts>
-class ValueArray {
- public:
-  ValueArray(Ts... v) : v_{std::move(v)...} {}
-
-  template <typename T>
-  operator ParamGenerator<T>() const {  // NOLINT
-    return ValuesIn(MakeVector<T>(MakeIndexSequence<sizeof...(Ts)>()));
-  }
-
- private:
-  template <typename T, size_t... I>
-  std::vector<T> MakeVector(IndexSequence<I...>) const {
-    return std::vector<T>{static_cast<T>(v_.template Get<I>())...};
-  }
-
-  FlatTuple<Ts...> v_;
-};
-
-template <typename... T>
-class CartesianProductGenerator
-    : public ParamGeneratorInterface<::std::tuple<T...>> {
- public:
-  typedef ::std::tuple<T...> ParamType;
-
-  CartesianProductGenerator(const std::tuple<ParamGenerator<T>...>& g)
-      : generators_(g) {}
-  ~CartesianProductGenerator() override {}
-
-  ParamIteratorInterface<ParamType>* Begin() const override {
-    return new Iterator(this, generators_, false);
-  }
-  ParamIteratorInterface<ParamType>* End() const override {
-    return new Iterator(this, generators_, true);
-  }
-
- private:
-  template <class I>
-  class IteratorImpl;
-  template <size_t... I>
-  class IteratorImpl<IndexSequence<I...>>
-      : public ParamIteratorInterface<ParamType> {
-   public:
-    IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
-             const std::tuple<ParamGenerator<T>...>& generators, bool is_end)
-        : base_(base),
-          begin_(std::get<I>(generators).begin()...),
-          end_(std::get<I>(generators).end()...),
-          current_(is_end ? end_ : begin_) {
-      ComputeCurrentValue();
-    }
-    ~IteratorImpl() override {}
-
-    const ParamGeneratorInterface<ParamType>* BaseGenerator() const override {
-      return base_;
-    }
-    // Advance should not be called on beyond-of-range iterators
-    // so no component iterators must be beyond end of range, either.
-    void Advance() override {
-      assert(!AtEnd());
-      // Advance the last iterator.
-      ++std::get<sizeof...(T) - 1>(current_);
-      // if that reaches end, propagate that up.
-      AdvanceIfEnd<sizeof...(T) - 1>();
-      ComputeCurrentValue();
-    }
-    ParamIteratorInterface<ParamType>* Clone() const override {
-      return new IteratorImpl(*this);
-    }
-
-    const ParamType* Current() const override { return current_value_.get(); }
-
-    bool Equals(const ParamIteratorInterface<ParamType>& other) const override {
-      // Having the same base generator guarantees that the other
-      // iterator is of the same type and we can downcast.
-      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
-          << "The program attempted to compare iterators "
-          << "from different generators." << std::endl;
-      const IteratorImpl* typed_other =
-          CheckedDowncastToActualType<const IteratorImpl>(&other);
-
-      // We must report iterators equal if they both point beyond their
-      // respective ranges. That can happen in a variety of fashions,
-      // so we have to consult AtEnd().
-      if (AtEnd() && typed_other->AtEnd()) return true;
-
-      bool same = true;
-      bool dummy[] = {
-          (same = same && std::get<I>(current_) ==
-                              std::get<I>(typed_other->current_))...};
-      (void)dummy;
-      return same;
-    }
-
-   private:
-    template <size_t ThisI>
-    void AdvanceIfEnd() {
-      if (std::get<ThisI>(current_) != std::get<ThisI>(end_)) return;
-
-      bool last = ThisI == 0;
-      if (last) {
-        // We are done. Nothing else to propagate.
-        return;
-      }
-
-      constexpr size_t NextI = ThisI - (ThisI != 0);
-      std::get<ThisI>(current_) = std::get<ThisI>(begin_);
-      ++std::get<NextI>(current_);
-      AdvanceIfEnd<NextI>();
-    }
-
-    void ComputeCurrentValue() {
-      if (!AtEnd())
-        current_value_ = std::make_shared<ParamType>(*std::get<I>(current_)...);
-    }
-    bool AtEnd() const {
-      bool at_end = false;
-      bool dummy[] = {
-          (at_end = at_end || std::get<I>(current_) == std::get<I>(end_))...};
-      (void)dummy;
-      return at_end;
-    }
-
-    const ParamGeneratorInterface<ParamType>* const base_;
-    std::tuple<typename ParamGenerator<T>::iterator...> begin_;
-    std::tuple<typename ParamGenerator<T>::iterator...> end_;
-    std::tuple<typename ParamGenerator<T>::iterator...> current_;
-    std::shared_ptr<ParamType> current_value_;
-  };
-
-  using Iterator = IteratorImpl<typename MakeIndexSequence<sizeof...(T)>::type>;
-
-  std::tuple<ParamGenerator<T>...> generators_;
-};
-
-template <class... Gen>
-class CartesianProductHolder {
- public:
-  CartesianProductHolder(const Gen&... g) : generators_(g...) {}
-  template <typename... T>
-  operator ParamGenerator<::std::tuple<T...>>() const {
-    return ParamGenerator<::std::tuple<T...>>(
-        new CartesianProductGenerator<T...>(generators_));
-  }
-
- private:
-  std::tuple<Gen...> generators_;
-};
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-port-arch.h b/deps/googletest/include/gtest/internal/gtest-port-arch.h
deleted file mode 100644
index cece93dba..000000000
--- a/deps/googletest/include/gtest/internal/gtest-port-arch.h
+++ /dev/null
@@ -1,107 +0,0 @@
-// Copyright 2015, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This header file defines the GTEST_OS_* macro.
-// It is separate from gtest-port.h so that custom/gtest-port.h can include it.
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
-
-// Determines the platform on which Google Test is compiled.
-#ifdef __CYGWIN__
-# define GTEST_OS_CYGWIN 1
-# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
-#  define GTEST_OS_WINDOWS_MINGW 1
-#  define GTEST_OS_WINDOWS 1
-#elif defined _WIN32
-# define GTEST_OS_WINDOWS 1
-# ifdef _WIN32_WCE
-#  define GTEST_OS_WINDOWS_MOBILE 1
-# elif defined(WINAPI_FAMILY)
-#  include <winapifamily.h>
-#  if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#   define GTEST_OS_WINDOWS_DESKTOP 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
-#   define GTEST_OS_WINDOWS_PHONE 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
-#   define GTEST_OS_WINDOWS_RT 1
-#  elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
-#   define GTEST_OS_WINDOWS_PHONE 1
-#   define GTEST_OS_WINDOWS_TV_TITLE 1
-#  else
-    // WINAPI_FAMILY defined but no known partition matched.
-    // Default to desktop.
-#   define GTEST_OS_WINDOWS_DESKTOP 1
-#  endif
-# else
-#  define GTEST_OS_WINDOWS_DESKTOP 1
-# endif  // _WIN32_WCE
-#elif defined __OS2__
-# define GTEST_OS_OS2 1
-#elif defined __APPLE__
-# define GTEST_OS_MAC 1
-# if TARGET_OS_IPHONE
-#  define GTEST_OS_IOS 1
-# endif
-#elif defined __DragonFly__
-# define GTEST_OS_DRAGONFLY 1
-#elif defined __FreeBSD__
-# define GTEST_OS_FREEBSD 1
-#elif defined __Fuchsia__
-# define GTEST_OS_FUCHSIA 1
-#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
-# define GTEST_OS_GNU_KFREEBSD 1
-#elif defined __linux__
-# define GTEST_OS_LINUX 1
-# if defined __ANDROID__
-#  define GTEST_OS_LINUX_ANDROID 1
-# endif
-#elif defined __MVS__
-# define GTEST_OS_ZOS 1
-#elif defined(__sun) && defined(__SVR4)
-# define GTEST_OS_SOLARIS 1
-#elif defined(_AIX)
-# define GTEST_OS_AIX 1
-#elif defined(__hpux)
-# define GTEST_OS_HPUX 1
-#elif defined __native_client__
-# define GTEST_OS_NACL 1
-#elif defined __NetBSD__
-# define GTEST_OS_NETBSD 1
-#elif defined __OpenBSD__
-# define GTEST_OS_OPENBSD 1
-#elif defined __QNX__
-# define GTEST_OS_QNX 1
-#elif defined(__HAIKU__)
-#define GTEST_OS_HAIKU 1
-#endif  // __CYGWIN__
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-port.h b/deps/googletest/include/gtest/internal/gtest-port.h
deleted file mode 100644
index 2990de6d6..000000000
--- a/deps/googletest/include/gtest/internal/gtest-port.h
+++ /dev/null
@@ -1,2320 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Low-level types and utilities for porting Google Test to various
-// platforms.  All macros ending with _ and symbols defined in an
-// internal namespace are subject to change without notice.  Code
-// outside Google Test MUST NOT USE THEM DIRECTLY.  Macros that don't
-// end with _ are part of Google Test's public API and can be used by
-// code outside Google Test.
-//
-// This file is fundamental to Google Test.  All other Google Test source
-// files are expected to #include this.  Therefore, it cannot #include
-// any other Google Test header.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-
-// Environment-describing macros
-// -----------------------------
-//
-// Google Test can be used in many different environments.  Macros in
-// this section tell Google Test what kind of environment it is being
-// used in, such that Google Test can provide environment-specific
-// features and implementations.
-//
-// Google Test tries to automatically detect the properties of its
-// environment, so users usually don't need to worry about these
-// macros.  However, the automatic detection is not perfect.
-// Sometimes it's necessary for a user to define some of the following
-// macros in the build script to override Google Test's decisions.
-//
-// If the user doesn't define a macro in the list, Google Test will
-// provide a default definition.  After this header is #included, all
-// macros in this list will be defined to either 1 or 0.
-//
-// Notes to maintainers:
-//   - Each macro here is a user-tweakable knob; do not grow the list
-//     lightly.
-//   - Use #if to key off these macros.  Don't use #ifdef or "#if
-//     defined(...)", which will not work as these macros are ALWAYS
-//     defined.
-//
-//   GTEST_HAS_CLONE          - Define it to 1/0 to indicate that clone(2)
-//                              is/isn't available.
-//   GTEST_HAS_EXCEPTIONS     - Define it to 1/0 to indicate that exceptions
-//                              are enabled.
-//   GTEST_HAS_POSIX_RE       - Define it to 1/0 to indicate that POSIX regular
-//                              expressions are/aren't available.
-//   GTEST_HAS_PTHREAD        - Define it to 1/0 to indicate that <pthread.h>
-//                              is/isn't available.
-//   GTEST_HAS_RTTI           - Define it to 1/0 to indicate that RTTI is/isn't
-//                              enabled.
-//   GTEST_HAS_STD_WSTRING    - Define it to 1/0 to indicate that
-//                              std::wstring does/doesn't work (Google Test can
-//                              be used where std::wstring is unavailable).
-//   GTEST_HAS_SEH            - Define it to 1/0 to indicate whether the
-//                              compiler supports Microsoft's "Structured
-//                              Exception Handling".
-//   GTEST_HAS_STREAM_REDIRECTION
-//                            - Define it to 1/0 to indicate whether the
-//                              platform supports I/O stream redirection using
-//                              dup() and dup2().
-//   GTEST_LINKED_AS_SHARED_LIBRARY
-//                            - Define to 1 when compiling tests that use
-//                              Google Test as a shared library (known as
-//                              DLL on Windows).
-//   GTEST_CREATE_SHARED_LIBRARY
-//                            - Define to 1 when compiling Google Test itself
-//                              as a shared library.
-//   GTEST_DEFAULT_DEATH_TEST_STYLE
-//                            - The default value of --gtest_death_test_style.
-//                              The legacy default has been "fast" in the open
-//                              source version since 2008. The recommended value
-//                              is "threadsafe", and can be set in
-//                              custom/gtest-port.h.
-
-// Platform-indicating macros
-// --------------------------
-//
-// Macros indicating the platform on which Google Test is being used
-// (a macro is defined to 1 if compiled on the given platform;
-// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
-// defines these macros automatically.  Code outside Google Test MUST
-// NOT define them.
-//
-//   GTEST_OS_AIX      - IBM AIX
-//   GTEST_OS_CYGWIN   - Cygwin
-//   GTEST_OS_DRAGONFLY - DragonFlyBSD
-//   GTEST_OS_FREEBSD  - FreeBSD
-//   GTEST_OS_FUCHSIA  - Fuchsia
-//   GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
-//   GTEST_OS_HAIKU    - Haiku
-//   GTEST_OS_HPUX     - HP-UX
-//   GTEST_OS_LINUX    - Linux
-//     GTEST_OS_LINUX_ANDROID - Google Android
-//   GTEST_OS_MAC      - Mac OS X
-//     GTEST_OS_IOS    - iOS
-//   GTEST_OS_NACL     - Google Native Client (NaCl)
-//   GTEST_OS_NETBSD   - NetBSD
-//   GTEST_OS_OPENBSD  - OpenBSD
-//   GTEST_OS_OS2      - OS/2
-//   GTEST_OS_QNX      - QNX
-//   GTEST_OS_SOLARIS  - Sun Solaris
-//   GTEST_OS_WINDOWS  - Windows (Desktop, MinGW, or Mobile)
-//     GTEST_OS_WINDOWS_DESKTOP  - Windows Desktop
-//     GTEST_OS_WINDOWS_MINGW    - MinGW
-//     GTEST_OS_WINDOWS_MOBILE   - Windows Mobile
-//     GTEST_OS_WINDOWS_PHONE    - Windows Phone
-//     GTEST_OS_WINDOWS_RT       - Windows Store App/WinRT
-//   GTEST_OS_ZOS      - z/OS
-//
-// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
-// most stable support.  Since core members of the Google Test project
-// don't have access to other platforms, support for them may be less
-// stable.  If you notice any problems on your platform, please notify
-// googletestframework@googlegroups.com (patches for fixing them are
-// even more welcome!).
-//
-// It is possible that none of the GTEST_OS_* macros are defined.
-
-// Feature-indicating macros
-// -------------------------
-//
-// Macros indicating which Google Test features are available (a macro
-// is defined to 1 if the corresponding feature is supported;
-// otherwise UNDEFINED -- it's never defined to 0.).  Google Test
-// defines these macros automatically.  Code outside Google Test MUST
-// NOT define them.
-//
-// These macros are public so that portable tests can be written.
-// Such tests typically surround code using a feature with an #if
-// which controls that code.  For example:
-//
-// #if GTEST_HAS_DEATH_TEST
-//   EXPECT_DEATH(DoSomethingDeadly());
-// #endif
-//
-//   GTEST_HAS_DEATH_TEST   - death tests
-//   GTEST_HAS_TYPED_TEST   - typed tests
-//   GTEST_HAS_TYPED_TEST_P - type-parameterized tests
-//   GTEST_IS_THREADSAFE    - Google Test is thread-safe.
-//   GOOGLETEST_CM0007 DO NOT DELETE
-//   GTEST_USES_POSIX_RE    - enhanced POSIX regex is used. Do not confuse with
-//                            GTEST_HAS_POSIX_RE (see above) which users can
-//                            define themselves.
-//   GTEST_USES_SIMPLE_RE   - our own simple regex is used;
-//                            the above RE\b(s) are mutually exclusive.
-
-// Misc public macros
-// ------------------
-//
-//   GTEST_FLAG(flag_name)  - references the variable corresponding to
-//                            the given Google Test flag.
-
-// Internal utilities
-// ------------------
-//
-// The following macros and utilities are for Google Test's INTERNAL
-// use only.  Code outside Google Test MUST NOT USE THEM DIRECTLY.
-//
-// Macros for basic C++ coding:
-//   GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
-//   GTEST_ATTRIBUTE_UNUSED_  - declares that a class' instances or a
-//                              variable don't have to be used.
-//   GTEST_DISALLOW_ASSIGN_   - disables operator=.
-//   GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
-//   GTEST_MUST_USE_RESULT_   - declares that a function's result must be used.
-//   GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
-//                                        suppressed (constant conditional).
-//   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
-//                                        is suppressed.
-//
-// Synchronization:
-//   Mutex, MutexLock, ThreadLocal, GetThreadCount()
-//                            - synchronization primitives.
-//
-// Template meta programming:
-//   IteratorTraits - partial implementation of std::iterator_traits, which
-//                    is not available in libCstd when compiled with Sun C++.
-//
-//
-// Regular expressions:
-//   RE             - a simple regular expression class using the POSIX
-//                    Extended Regular Expression syntax on UNIX-like platforms
-//                    GOOGLETEST_CM0008 DO NOT DELETE
-//                    or a reduced regular exception syntax on other
-//                    platforms, including Windows.
-// Logging:
-//   GTEST_LOG_()   - logs messages at the specified severity level.
-//   LogToStderr()  - directs all log messages to stderr.
-//   FlushInfoLog() - flushes informational log messages.
-//
-// Stdout and stderr capturing:
-//   CaptureStdout()     - starts capturing stdout.
-//   GetCapturedStdout() - stops capturing stdout and returns the captured
-//                         string.
-//   CaptureStderr()     - starts capturing stderr.
-//   GetCapturedStderr() - stops capturing stderr and returns the captured
-//                         string.
-//
-// Integer types:
-//   TypeWithSize   - maps an integer to a int type.
-//   Int32, UInt32, Int64, UInt64, TimeInMillis
-//                  - integers of known sizes.
-//   BiggestInt     - the biggest signed integer type.
-//
-// Command-line utilities:
-//   GTEST_DECLARE_*()  - declares a flag.
-//   GTEST_DEFINE_*()   - defines a flag.
-//   GetInjectableArgvs() - returns the command line as a vector of strings.
-//
-// Environment variable utilities:
-//   GetEnv()             - gets the value of an environment variable.
-//   BoolFromGTestEnv()   - parses a bool environment variable.
-//   Int32FromGTestEnv()  - parses an Int32 environment variable.
-//   StringFromGTestEnv() - parses a string environment variable.
-//
-// Deprecation warnings:
-//   GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as
-//                                        deprecated; calling a marked function
-//                                        should generate a compiler warning
-
-#include <ctype.h>   // for isspace, etc
-#include <stddef.h>  // for ptrdiff_t
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <memory>
-#include <type_traits>
-
-#ifndef _WIN32_WCE
-# include <sys/types.h>
-# include <sys/stat.h>
-#endif  // !_WIN32_WCE
-
-#if defined __APPLE__
-# include <AvailabilityMacros.h>
-# include <TargetConditionals.h>
-#endif
-
-#include <algorithm>  // NOLINT
-#include <iostream>   // NOLINT
-#include <sstream>    // NOLINT
-#include <string>     // NOLINT
-#include <tuple>
-#include <utility>
-#include <vector>  // NOLINT
-
-#include "gtest/internal/gtest-port-arch.h"
-#include "gtest/internal/custom/gtest-port.h"
-
-#if !defined(GTEST_DEV_EMAIL_)
-# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-# define GTEST_FLAG_PREFIX_ "gtest_"
-# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-# define GTEST_NAME_ "Google Test"
-# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
-#endif  // !defined(GTEST_DEV_EMAIL_)
-
-#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
-#endif  // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-
-// Determines the version of gcc that is used to compile this.
-#ifdef __GNUC__
-// 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \
-    (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif  // __GNUC__
-
-// Macros for disabling Microsoft Visual C++ warnings.
-//
-//   GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
-//   /* code that triggers warnings C4800 and C4385 */
-//   GTEST_DISABLE_MSC_WARNINGS_POP_()
-#if defined(_MSC_VER)
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
-    __pragma(warning(push))                        \
-    __pragma(warning(disable: warnings))
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()          \
-    __pragma(warning(pop))
-#else
-// Not all compilers are MSVC
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif
-
-// Clang on Windows does not understand MSVC's pragma warning.
-// We need clang-specific way to disable function deprecation warning.
-#ifdef __clang__
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_()                         \
-    _Pragma("clang diagnostic push")                                  \
-    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
-    _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
-#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
-    _Pragma("clang diagnostic pop")
-#else
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
-    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
-# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
-    GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif
-
-// Brings in definitions for functions used in the testing::internal::posix
-// namespace (read, write, close, chdir, isatty, stat). We do not currently
-// use them on Windows Mobile.
-#if GTEST_OS_WINDOWS
-# if !GTEST_OS_WINDOWS_MOBILE
-#  include <direct.h>
-#  include <io.h>
-# endif
-// In order to avoid having to include <windows.h>, use forward declaration
-#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
-// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
-// separate (equivalent) structs, instead of using typedef
-typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION;
-#else
-// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
-// This assumption is verified by
-// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
-typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
-#endif
-#else
-// This assumes that non-Windows OSes provide unistd.h. For OSes where this
-// is not the case, we need to include headers that provide the functions
-// mentioned above.
-# include <unistd.h>
-# include <strings.h>
-#endif  // GTEST_OS_WINDOWS
-
-#if GTEST_OS_LINUX_ANDROID
-// Used to define __ANDROID_API__ matching the target NDK API level.
-#  include <android/api-level.h>  // NOLINT
-#endif
-
-// Defines this to true iff Google Test can use POSIX regular expressions.
-#ifndef GTEST_HAS_POSIX_RE
-# if GTEST_OS_LINUX_ANDROID
-// On Android, <regex.h> is only available starting with Gingerbread.
-#  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-# else
-#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
-# endif
-#endif
-
-#if GTEST_USES_PCRE
-// The appropriate headers have already been included.
-
-#elif GTEST_HAS_POSIX_RE
-
-// On some platforms, <regex.h> needs someone to define size_t, and
-// won't compile otherwise.  We can #include it here as we already
-// included <stdlib.h>, which is guaranteed to define size_t through
-// <stddef.h>.
-# include <regex.h>  // NOLINT
-
-# define GTEST_USES_POSIX_RE 1
-
-#elif GTEST_OS_WINDOWS
-
-// <regex.h> is not available on Windows.  Use our own simple regex
-// implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#else
-
-// <regex.h> may not be available on this platform.  Use our own
-// simple regex implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#endif  // GTEST_USES_PCRE
-
-#ifndef GTEST_HAS_EXCEPTIONS
-// The user didn't tell us whether exceptions are enabled, so we need
-// to figure it out.
-# if defined(_MSC_VER) && defined(_CPPUNWIND)
-// MSVC defines _CPPUNWIND to 1 iff exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__BORLANDC__)
-// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
-// macro to enable exceptions, so we'll do the same.
-// Assumes that exceptions are enabled by default.
-#  ifndef _HAS_EXCEPTIONS
-#   define _HAS_EXCEPTIONS 1
-#  endif  // _HAS_EXCEPTIONS
-#  define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-# elif defined(__clang__)
-// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714,
-// but iff cleanups are enabled after that. In Obj-C++ files, there can be
-// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions
-// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++
-// exceptions starting at clang r206352, but which checked for cleanups prior to
-// that. To reliably check for C++ exception availability with clang, check for
-// __EXCEPTIONS && __has_feature(cxx_exceptions).
-#  define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
-# elif defined(__GNUC__) && __EXCEPTIONS
-// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__SUNPRO_CC)
-// Sun Pro CC supports exceptions.  However, there is no compile-time way of
-// detecting whether they are enabled or not.  Therefore, we assume that
-// they are enabled unless the user tells us otherwise.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__IBMCPP__) && __EXCEPTIONS
-// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
-#  define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__HP_aCC)
-// Exception handling is in effect by default in HP aCC compiler. It has to
-// be turned of by +noeh compiler option if desired.
-#  define GTEST_HAS_EXCEPTIONS 1
-# else
-// For other compilers, we assume exceptions are disabled to be
-// conservative.
-#  define GTEST_HAS_EXCEPTIONS 0
-# endif  // defined(_MSC_VER) || defined(__BORLANDC__)
-#endif  // GTEST_HAS_EXCEPTIONS
-
-#if !defined(GTEST_HAS_STD_STRING)
-// Even though we don't use this macro any longer, we keep it in case
-// some clients still depend on it.
-# define GTEST_HAS_STD_STRING 1
-#elif !GTEST_HAS_STD_STRING
-// The user told us that ::std::string isn't available.
-# error "::std::string isn't available."
-#endif  // !defined(GTEST_HAS_STD_STRING)
-
-#ifndef GTEST_HAS_STD_WSTRING
-// The user didn't tell us whether ::std::wstring is available, so we need
-// to figure it out.
-// Cygwin 1.7 and below doesn't support ::std::wstring.
-// Solaris' libc++ doesn't support it either.  Android has
-// no support for it at least as recent as Froyo (2.2).
-#define GTEST_HAS_STD_WSTRING                                         \
-  (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
-     GTEST_OS_HAIKU))
-
-#endif  // GTEST_HAS_STD_WSTRING
-
-// Determines whether RTTI is available.
-#ifndef GTEST_HAS_RTTI
-// The user didn't tell us whether RTTI is enabled, so we need to
-// figure it out.
-
-# ifdef _MSC_VER
-
-#  ifdef _CPPRTTI  // MSVC defines this macro iff RTTI is enabled.
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
-
-// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
-# elif defined(__GNUC__)
-
-#  ifdef __GXX_RTTI
-// When building against STLport with the Android NDK and with
-// -frtti -fno-exceptions, the build fails at link time with undefined
-// references to __cxa_bad_typeid. Note sure if STL or toolchain bug,
-// so disable RTTI when detected.
-#   if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
-       !defined(__EXCEPTIONS)
-#    define GTEST_HAS_RTTI 0
-#   else
-#    define GTEST_HAS_RTTI 1
-#   endif  // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif  // __GXX_RTTI
-
-// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
-// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
-// first version with C++ support.
-# elif defined(__clang__)
-
-#  define GTEST_HAS_RTTI __has_feature(cxx_rtti)
-
-// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
-// both the typeid and dynamic_cast features are present.
-# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
-
-#  ifdef __RTTI_ALL__
-#   define GTEST_HAS_RTTI 1
-#  else
-#   define GTEST_HAS_RTTI 0
-#  endif
-
-# else
-
-// For all other compilers, we assume RTTI is enabled.
-#  define GTEST_HAS_RTTI 1
-
-# endif  // _MSC_VER
-
-#endif  // GTEST_HAS_RTTI
-
-// It's this header's responsibility to #include <typeinfo> when RTTI
-// is enabled.
-#if GTEST_HAS_RTTI
-# include <typeinfo>
-#endif
-
-// Determines whether Google Test can use the pthreads library.
-#ifndef GTEST_HAS_PTHREAD
-// The user didn't tell us explicitly, so we make reasonable assumptions about
-// which platforms have pthreads support.
-//
-// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
-// to your compiler flags.
-#define GTEST_HAS_PTHREAD                                                      \
-  (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX ||          \
-   GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
-   GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD ||          \
-   GTEST_OS_HAIKU)
-#endif  // GTEST_HAS_PTHREAD
-
-#if GTEST_HAS_PTHREAD
-// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
-// true.
-# include <pthread.h>  // NOLINT
-
-// For timespec and nanosleep, used below.
-# include <time.h>  // NOLINT
-#endif
-
-// Determines whether clone(2) is supported.
-// Usually it will only be available on Linux, excluding
-// Linux on the Itanium architecture.
-// Also see http://linux.die.net/man/2/clone.
-#ifndef GTEST_HAS_CLONE
-// The user didn't tell us, so we need to figure it out.
-
-# if GTEST_OS_LINUX && !defined(__ia64__)
-#  if GTEST_OS_LINUX_ANDROID
-// On Android, clone() became available at different API levels for each 32-bit
-// architecture.
-#    if defined(__LP64__) || \
-        (defined(__arm__) && __ANDROID_API__ >= 9) || \
-        (defined(__mips__) && __ANDROID_API__ >= 12) || \
-        (defined(__i386__) && __ANDROID_API__ >= 17)
-#     define GTEST_HAS_CLONE 1
-#    else
-#     define GTEST_HAS_CLONE 0
-#    endif
-#  else
-#   define GTEST_HAS_CLONE 1
-#  endif
-# else
-#  define GTEST_HAS_CLONE 0
-# endif  // GTEST_OS_LINUX && !defined(__ia64__)
-
-#endif  // GTEST_HAS_CLONE
-
-// Determines whether to support stream redirection. This is used to test
-// output correctness and to implement death tests.
-#ifndef GTEST_HAS_STREAM_REDIRECTION
-// By default, we assume that stream redirection is supported on all
-// platforms except known mobile ones.
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
-#  define GTEST_HAS_STREAM_REDIRECTION 0
-# else
-#  define GTEST_HAS_STREAM_REDIRECTION 1
-# endif  // !GTEST_OS_WINDOWS_MOBILE
-#endif  // GTEST_HAS_STREAM_REDIRECTION
-
-// Determines whether to support death tests.
-// pops up a dialog window that cannot be suppressed programmatically.
-#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS ||             \
-     (GTEST_OS_MAC && !GTEST_OS_IOS) ||                                   \
-     (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW ||  \
-     GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
-     GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA ||           \
-     GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU)
-# define GTEST_HAS_DEATH_TEST 1
-#endif
-
-// Determines whether to support type-driven tests.
-
-// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
-// Sun Pro CC, IBM Visual Age, and HP aCC support.
-#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
-    defined(__IBMCPP__) || defined(__HP_aCC)
-# define GTEST_HAS_TYPED_TEST 1
-# define GTEST_HAS_TYPED_TEST_P 1
-#endif
-
-// Determines whether the system compiler uses UTF-16 for encoding wide strings.
-#define GTEST_WIDE_STRING_USES_UTF16_ \
-  (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2)
-
-// Determines whether test results can be streamed to a socket.
-#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
-    GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# define GTEST_CAN_STREAM_RESULTS_ 1
-#endif
-
-// Defines some utility macros.
-
-// The GNU compiler emits a warning if nested "if" statements are followed by
-// an "else" statement and braces are not used to explicitly disambiguate the
-// "else" binding.  This leads to problems with code like:
-//
-//   if (gate)
-//     ASSERT_*(condition) << "Some message";
-//
-// The "switch (0) case 0:" idiom is used to suppress this.
-#ifdef __INTEL_COMPILER
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
-#else
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default:  // NOLINT
-#endif
-
-// Use this annotation at the end of a struct/class definition to
-// prevent the compiler from optimizing away instances that are never
-// used.  This is useful when all interesting logic happens inside the
-// c'tor and / or d'tor.  Example:
-//
-//   struct Foo {
-//     Foo() { ... }
-//   } GTEST_ATTRIBUTE_UNUSED_;
-//
-// Also use it after a variable or parameter declaration to tell the
-// compiler the variable/parameter does not have to be used.
-#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-#elif defined(__clang__)
-# if __has_attribute(unused)
-#  define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-# endif
-#endif
-#ifndef GTEST_ATTRIBUTE_UNUSED_
-# define GTEST_ATTRIBUTE_UNUSED_
-#endif
-
-// Use this annotation before a function that takes a printf format string.
-#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
-# if defined(__MINGW_PRINTF_FORMAT)
-// MinGW has two different printf implementations. Ensure the format macro
-// matches the selected implementation. See
-// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
-#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
-       __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \
-                                 first_to_check)))
-# else
-#  define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
-       __attribute__((__format__(__printf__, string_index, first_to_check)))
-# endif
-#else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
-#endif
-
-
-// A macro to disallow operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type) \
-  void operator=(type const &) = delete
-
-// A macro to disallow copy constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
-  type(type const &) = delete; \
-  GTEST_DISALLOW_ASSIGN_(type)
-
-// Tell the compiler to warn about unused return values for functions declared
-// with this macro.  The macro should be used on function declarations
-// following the argument list:
-//
-//   Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
-#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
-#else
-# define GTEST_MUST_USE_RESULT_
-#endif  // __GNUC__ && !COMPILER_ICC
-
-// MS C++ compiler emits warning when a conditional expression is compile time
-// constant. In some contexts this warning is false positive and needs to be
-// suppressed. Use the following two macros in such cases:
-//
-// GTEST_INTENTIONAL_CONST_COND_PUSH_()
-// while (true) {
-// GTEST_INTENTIONAL_CONST_COND_POP_()
-// }
-# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
-    GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
-# define GTEST_INTENTIONAL_CONST_COND_POP_() \
-    GTEST_DISABLE_MSC_WARNINGS_POP_()
-
-// Determine whether the compiler supports Microsoft's Structured Exception
-// Handling.  This is supported by several Windows compilers but generally
-// does not exist on any other system.
-#ifndef GTEST_HAS_SEH
-// The user didn't tell us, so we need to figure it out.
-
-# if defined(_MSC_VER) || defined(__BORLANDC__)
-// These two compilers are known to support SEH.
-#  define GTEST_HAS_SEH 1
-# else
-// Assume no SEH.
-#  define GTEST_HAS_SEH 0
-# endif
-
-#endif  // GTEST_HAS_SEH
-
-#ifndef GTEST_IS_THREADSAFE
-
-#define GTEST_IS_THREADSAFE                                                 \
-  (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ ||                                     \
-   (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \
-   GTEST_HAS_PTHREAD)
-
-#endif  // GTEST_IS_THREADSAFE
-
-// GTEST_API_ qualifies all symbols that must be exported. The definitions below
-// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
-// gtest/internal/custom/gtest-port.h
-#ifndef GTEST_API_
-
-#ifdef _MSC_VER
-# if GTEST_LINKED_AS_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllimport)
-# elif GTEST_CREATE_SHARED_LIBRARY
-#  define GTEST_API_ __declspec(dllexport)
-# endif
-#elif __GNUC__ >= 4 || defined(__clang__)
-# define GTEST_API_ __attribute__((visibility ("default")))
-#endif  // _MSC_VER
-
-#endif  // GTEST_API_
-
-#ifndef GTEST_API_
-# define GTEST_API_
-#endif  // GTEST_API_
-
-#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
-# define GTEST_DEFAULT_DEATH_TEST_STYLE  "fast"
-#endif  // GTEST_DEFAULT_DEATH_TEST_STYLE
-
-#ifdef __GNUC__
-// Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline))
-#else
-# define GTEST_NO_INLINE_
-#endif
-
-// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
-#if !defined(GTEST_HAS_CXXABI_H_)
-# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-#  define GTEST_HAS_CXXABI_H_ 1
-# else
-#  define GTEST_HAS_CXXABI_H_ 0
-# endif
-#endif
-
-// A function level attribute to disable checking for use of uninitialized
-// memory when built with MemorySanitizer.
-#if defined(__clang__)
-# if __has_feature(memory_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
-       __attribute__((no_sanitize_memory))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-# endif  // __has_feature(memory_sanitizer)
-#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-#endif  // __clang__
-
-// A function level attribute to disable AddressSanitizer instrumentation.
-#if defined(__clang__)
-# if __has_feature(address_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
-       __attribute__((no_sanitize_address))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-# endif  // __has_feature(address_sanitizer)
-#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-#endif  // __clang__
-
-// A function level attribute to disable HWAddressSanitizer instrumentation.
-#if defined(__clang__)
-# if __has_feature(hwaddress_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
-       __attribute__((no_sanitize("hwaddress")))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-# endif  // __has_feature(hwaddress_sanitizer)
-#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-#endif  // __clang__
-
-// A function level attribute to disable ThreadSanitizer instrumentation.
-#if defined(__clang__)
-# if __has_feature(thread_sanitizer)
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
-       __attribute__((no_sanitize_thread))
-# else
-#  define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-# endif  // __has_feature(thread_sanitizer)
-#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-#endif  // __clang__
-
-namespace testing {
-
-class Message;
-
-// Legacy imports for backwards compatibility.
-// New code should use std:: names directly.
-using std::get;
-using std::make_tuple;
-using std::tuple;
-using std::tuple_element;
-using std::tuple_size;
-
-namespace internal {
-
-// A secret type that Google Test users don't know about.  It has no
-// definition on purpose.  Therefore it's impossible to create a
-// Secret object, which is what we want.
-class Secret;
-
-// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile
-// time expression is true (in new code, use static_assert instead). For
-// example, you could use it to verify the size of a static array:
-//
-//   GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
-//                         names_incorrect_size);
-//
-// The second argument to the macro must be a valid C++ identifier. If the
-// expression is false, compiler will issue an error containing this identifier.
-#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
-
-// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
-//
-// This template is declared, but intentionally undefined.
-template <typename T1, typename T2>
-struct StaticAssertTypeEqHelper;
-
-template <typename T>
-struct StaticAssertTypeEqHelper<T, T> {
-  enum { value = true };
-};
-
-// Same as std::is_same<>.
-template <typename T, typename U>
-struct IsSame {
-  enum { value = false };
-};
-template <typename T>
-struct IsSame<T, T> {
-  enum { value = true };
-};
-
-// Evaluates to the number of elements in 'array'.
-#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0]))
-
-// A helper for suppressing warnings on constant condition.  It just
-// returns 'condition'.
-GTEST_API_ bool IsTrue(bool condition);
-
-// Defines RE.
-
-#if GTEST_USES_PCRE
-// if used, PCRE is injected by custom/gtest-port.h
-#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
-
-// A simple C++ wrapper for <regex.h>.  It uses the POSIX Extended
-// Regular Expression syntax.
-class GTEST_API_ RE {
- public:
-  // A copy constructor is required by the Standard to initialize object
-  // references from r-values.
-  RE(const RE& other) { Init(other.pattern()); }
-
-  // Constructs an RE from a string.
-  RE(const ::std::string& regex) { Init(regex.c_str()); }  // NOLINT
-
-  RE(const char* regex) { Init(regex); }  // NOLINT
-  ~RE();
-
-  // Returns the string representation of the regex.
-  const char* pattern() const { return pattern_; }
-
-  // FullMatch(str, re) returns true iff regular expression re matches
-  // the entire str.
-  // PartialMatch(str, re) returns true iff regular expression re
-  // matches a substring of str (including str itself).
-  static bool FullMatch(const ::std::string& str, const RE& re) {
-    return FullMatch(str.c_str(), re);
-  }
-  static bool PartialMatch(const ::std::string& str, const RE& re) {
-    return PartialMatch(str.c_str(), re);
-  }
-
-  static bool FullMatch(const char* str, const RE& re);
-  static bool PartialMatch(const char* str, const RE& re);
-
- private:
-  void Init(const char* regex);
-  const char* pattern_;
-  bool is_valid_;
-
-# if GTEST_USES_POSIX_RE
-
-  regex_t full_regex_;     // For FullMatch().
-  regex_t partial_regex_;  // For PartialMatch().
-
-# else  // GTEST_USES_SIMPLE_RE
-
-  const char* full_pattern_;  // For FullMatch();
-
-# endif
-
-  GTEST_DISALLOW_ASSIGN_(RE);
-};
-
-#endif  // GTEST_USES_PCRE
-
-// Formats a source file path and a line number as they would appear
-// in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
-
-// Formats a file location for compiler-independent XML output.
-// Although this function is not platform dependent, we put it next to
-// FormatFileLocation in order to contrast the two functions.
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
-                                                               int line);
-
-// Defines logging utilities:
-//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
-//                          message itself is streamed into the macro.
-//   LogToStderr()  - directs all log messages to stderr.
-//   FlushInfoLog() - flushes informational log messages.
-
-enum GTestLogSeverity {
-  GTEST_INFO,
-  GTEST_WARNING,
-  GTEST_ERROR,
-  GTEST_FATAL
-};
-
-// Formats log entry severity, provides a stream object for streaming the
-// log message, and terminates the message with a newline when going out of
-// scope.
-class GTEST_API_ GTestLog {
- public:
-  GTestLog(GTestLogSeverity severity, const char* file, int line);
-
-  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
-  ~GTestLog();
-
-  ::std::ostream& GetStream() { return ::std::cerr; }
-
- private:
-  const GTestLogSeverity severity_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
-};
-
-#if !defined(GTEST_LOG_)
-
-# define GTEST_LOG_(severity) \
-    ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
-                                  __FILE__, __LINE__).GetStream()
-
-inline void LogToStderr() {}
-inline void FlushInfoLog() { fflush(nullptr); }
-
-#endif  // !defined(GTEST_LOG_)
-
-#if !defined(GTEST_CHECK_)
-// INTERNAL IMPLEMENTATION - DO NOT USE.
-//
-// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
-// is not satisfied.
-//  Synopsys:
-//    GTEST_CHECK_(boolean_condition);
-//     or
-//    GTEST_CHECK_(boolean_condition) << "Additional message";
-//
-//    This checks the condition and if the condition is not satisfied
-//    it prints message about the condition violation, including the
-//    condition itself, plus additional message streamed into it, if any,
-//    and then it aborts the program. It aborts the program irrespective of
-//    whether it is built in the debug mode or not.
-# define GTEST_CHECK_(condition) \
-    GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-    if (::testing::internal::IsTrue(condition)) \
-      ; \
-    else \
-      GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
-#endif  // !defined(GTEST_CHECK_)
-
-// An all-mode assert to verify that the given POSIX-style function
-// call returns 0 (indicating success).  Known limitation: this
-// doesn't expand to a balanced 'if' statement, so enclose the macro
-// in {} if you need to use it as the only statement in an 'if'
-// branch.
-#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
-  if (const int gtest_error = (posix_call)) \
-    GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
-                      << gtest_error
-
-// Adds reference to a type if it is not a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::add_reference, which is not widely available yet.
-template <typename T>
-struct AddReference { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddReference<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper around AddReference that works when the argument T
-// depends on template parameters.
-#define GTEST_ADD_REFERENCE_(T) \
-    typename ::testing::internal::AddReference<T>::type
-
-// Transforms "T" into "const T&" according to standard reference collapsing
-// rules (this is only needed as a backport for C++98 compilers that do not
-// support reference collapsing). Specifically, it transforms:
-//
-//   char         ==> const char&
-//   const char   ==> const char&
-//   char&        ==> char&
-//   const char&  ==> const char&
-//
-// Note that the non-const reference will not have "const" added. This is
-// standard, and necessary so that "T" can always bind to "const T&".
-template <typename T>
-struct ConstRef { typedef const T& type; };
-template <typename T>
-struct ConstRef<T&> { typedef T& type; };
-
-// The argument T must depend on some template parameters.
-#define GTEST_REFERENCE_TO_CONST_(T) \
-  typename ::testing::internal::ConstRef<T>::type
-
-// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
-//
-// Use ImplicitCast_ as a safe version of static_cast for upcasting in
-// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
-// const Foo*).  When you use ImplicitCast_, the compiler checks that
-// the cast is safe.  Such explicit ImplicitCast_s are necessary in
-// surprisingly many situations where C++ demands an exact type match
-// instead of an argument type convertible to a target type.
-//
-// The syntax for using ImplicitCast_ is the same as for static_cast:
-//
-//   ImplicitCast_<ToType>(expr)
-//
-// ImplicitCast_ would have been part of the C++ standard library,
-// but the proposal was submitted too late.  It will probably make
-// its way into the language in the future.
-//
-// This relatively ugly name is intentional. It prevents clashes with
-// similar functions users may have (e.g., implicit_cast). The internal
-// namespace alone is not enough because the function can be found by ADL.
-template<typename To>
-inline To ImplicitCast_(To x) { return x; }
-
-// When you upcast (that is, cast a pointer from type Foo to type
-// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
-// always succeed.  When you downcast (that is, cast a pointer from
-// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
-// how do you know the pointer is really of type SubclassOfFoo?  It
-// could be a bare Foo, or of type DifferentSubclassOfFoo.  Thus,
-// when you downcast, you should use this macro.  In debug mode, we
-// use dynamic_cast<> to double-check the downcast is legal (we die
-// if it's not).  In normal mode, we do the efficient static_cast<>
-// instead.  Thus, it's important to test in debug mode to make sure
-// the cast is legal!
-//    This is the only place in the code we should use dynamic_cast<>.
-// In particular, you SHOULDN'T be using dynamic_cast<> in order to
-// do RTTI (eg code like this:
-//    if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
-//    if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
-// You should design the code some other way not to need this.
-//
-// This relatively ugly name is intentional. It prevents clashes with
-// similar functions users may have (e.g., down_cast). The internal
-// namespace alone is not enough because the function can be found by ADL.
-template<typename To, typename From>  // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From* f) {  // so we only accept pointers
-  // Ensures that To is a sub-type of From *.  This test is here only
-  // for compile-time type checking, and has no overhead in an
-  // optimized build at run-time, as it will be optimized away
-  // completely.
-  GTEST_INTENTIONAL_CONST_COND_PUSH_()
-  if (false) {
-  GTEST_INTENTIONAL_CONST_COND_POP_()
-  const To to = nullptr;
-  ::testing::internal::ImplicitCast_<From*>(to);
-  }
-
-#if GTEST_HAS_RTTI
-  // RTTI: debug mode only!
-  GTEST_CHECK_(f == nullptr || dynamic_cast<To>(f) != nullptr);
-#endif
-  return static_cast<To>(f);
-}
-
-// Downcasts the pointer of type Base to Derived.
-// Derived must be a subclass of Base. The parameter MUST
-// point to a class of type Derived, not any subclass of it.
-// When RTTI is available, the function performs a runtime
-// check to enforce this.
-template <class Derived, class Base>
-Derived* CheckedDowncastToActualType(Base* base) {
-#if GTEST_HAS_RTTI
-  GTEST_CHECK_(typeid(*base) == typeid(Derived));
-#endif
-
-#if GTEST_HAS_DOWNCAST_
-  return ::down_cast<Derived*>(base);
-#elif GTEST_HAS_RTTI
-  return dynamic_cast<Derived*>(base);  // NOLINT
-#else
-  return static_cast<Derived*>(base);  // Poor man's downcast.
-#endif
-}
-
-#if GTEST_HAS_STREAM_REDIRECTION
-
-// Defines the stderr capturer:
-//   CaptureStdout     - starts capturing stdout.
-//   GetCapturedStdout - stops capturing stdout and returns the captured string.
-//   CaptureStderr     - starts capturing stderr.
-//   GetCapturedStderr - stops capturing stderr and returns the captured string.
-//
-GTEST_API_ void CaptureStdout();
-GTEST_API_ std::string GetCapturedStdout();
-GTEST_API_ void CaptureStderr();
-GTEST_API_ std::string GetCapturedStderr();
-
-#endif  // GTEST_HAS_STREAM_REDIRECTION
-// Returns the size (in bytes) of a file.
-GTEST_API_ size_t GetFileSize(FILE* file);
-
-// Reads the entire content of a file as a string.
-GTEST_API_ std::string ReadEntireFile(FILE* file);
-
-// All command line arguments.
-GTEST_API_ std::vector<std::string> GetArgvs();
-
-#if GTEST_HAS_DEATH_TEST
-
-std::vector<std::string> GetInjectableArgvs();
-// Deprecated: pass the args vector by value instead.
-void SetInjectableArgvs(const std::vector<std::string>* new_argvs);
-void SetInjectableArgvs(const std::vector<std::string>& new_argvs);
-void ClearInjectableArgvs();
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-// Defines synchronization primitives.
-#if GTEST_IS_THREADSAFE
-# if GTEST_HAS_PTHREAD
-// Sleeps for (roughly) n milliseconds.  This function is only for testing
-// Google Test's own constructs.  Don't use it in user tests, either
-// directly or indirectly.
-inline void SleepMilliseconds(int n) {
-  const timespec time = {
-    0,                  // 0 seconds.
-    n * 1000L * 1000L,  // And n ms.
-  };
-  nanosleep(&time, nullptr);
-}
-# endif  // GTEST_HAS_PTHREAD
-
-# if GTEST_HAS_NOTIFICATION_
-// Notification has already been imported into the namespace.
-// Nothing to do here.
-
-# elif GTEST_HAS_PTHREAD
-// Allows a controller thread to pause execution of newly created
-// threads until notified.  Instances of this class must be created
-// and destroyed in the controller thread.
-//
-// This class is only for testing Google Test's own constructs. Do not
-// use it in user tests, either directly or indirectly.
-class Notification {
- public:
-  Notification() : notified_(false) {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
-  }
-  ~Notification() {
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  // Notifies all threads created with this notification to start. Must
-  // be called from the controller thread.
-  void Notify() {
-    pthread_mutex_lock(&mutex_);
-    notified_ = true;
-    pthread_mutex_unlock(&mutex_);
-  }
-
-  // Blocks until the controller thread notifies. Must be called from a test
-  // thread.
-  void WaitForNotification() {
-    for (;;) {
-      pthread_mutex_lock(&mutex_);
-      const bool notified = notified_;
-      pthread_mutex_unlock(&mutex_);
-      if (notified)
-        break;
-      SleepMilliseconds(10);
-    }
-  }
-
- private:
-  pthread_mutex_t mutex_;
-  bool notified_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
-};
-
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-
-GTEST_API_ void SleepMilliseconds(int n);
-
-// Provides leak-safe Windows kernel handle ownership.
-// Used in death tests and in threading support.
-class GTEST_API_ AutoHandle {
- public:
-  // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to
-  // avoid including <windows.h> in this header file. Including <windows.h> is
-  // undesirable because it defines a lot of symbols and macros that tend to
-  // conflict with client code. This assumption is verified by
-  // WindowsTypesTest.HANDLEIsVoidStar.
-  typedef void* Handle;
-  AutoHandle();
-  explicit AutoHandle(Handle handle);
-
-  ~AutoHandle();
-
-  Handle Get() const;
-  void Reset();
-  void Reset(Handle handle);
-
- private:
-  // Returns true iff the handle is a valid handle object that can be closed.
-  bool IsCloseable() const;
-
-  Handle handle_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
-};
-
-// Allows a controller thread to pause execution of newly created
-// threads until notified.  Instances of this class must be created
-// and destroyed in the controller thread.
-//
-// This class is only for testing Google Test's own constructs. Do not
-// use it in user tests, either directly or indirectly.
-class GTEST_API_ Notification {
- public:
-  Notification();
-  void Notify();
-  void WaitForNotification();
-
- private:
-  AutoHandle event_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
-};
-# endif  // GTEST_HAS_NOTIFICATION_
-
-// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
-// defined, but we don't want to use MinGW's pthreads implementation, which
-// has conformance problems with some versions of the POSIX standard.
-# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
-
-// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
-// Consequently, it cannot select a correct instantiation of ThreadWithParam
-// in order to call its Run(). Introducing ThreadWithParamBase as a
-// non-templated base class for ThreadWithParam allows us to bypass this
-// problem.
-class ThreadWithParamBase {
- public:
-  virtual ~ThreadWithParamBase() {}
-  virtual void Run() = 0;
-};
-
-// pthread_create() accepts a pointer to a function type with the C linkage.
-// According to the Standard (7.5/1), function types with different linkages
-// are different even if they are otherwise identical.  Some compilers (for
-// example, SunStudio) treat them as different types.  Since class methods
-// cannot be defined with C-linkage we need to define a free C-function to
-// pass into pthread_create().
-extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
-  static_cast<ThreadWithParamBase*>(thread)->Run();
-  return nullptr;
-}
-
-// Helper class for testing Google Test's multi-threading constructs.
-// To use it, write:
-//
-//   void ThreadFunc(int param) { /* Do things with param */ }
-//   Notification thread_can_start;
-//   ...
-//   // The thread_can_start parameter is optional; you can supply NULL.
-//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
-//   thread_can_start.Notify();
-//
-// These classes are only for testing Google Test's own constructs. Do
-// not use them in user tests, either directly or indirectly.
-template <typename T>
-class ThreadWithParam : public ThreadWithParamBase {
- public:
-  typedef void UserThreadFunc(T);
-
-  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
-      : func_(func),
-        param_(param),
-        thread_can_start_(thread_can_start),
-        finished_(false) {
-    ThreadWithParamBase* const base = this;
-    // The thread can be created only after all fields except thread_
-    // have been initialized.
-    GTEST_CHECK_POSIX_SUCCESS_(
-        pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base));
-  }
-  ~ThreadWithParam() override { Join(); }
-
-  void Join() {
-    if (!finished_) {
-      GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr));
-      finished_ = true;
-    }
-  }
-
-  void Run() override {
-    if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification();
-    func_(param_);
-  }
-
- private:
-  UserThreadFunc* const func_;  // User-supplied thread function.
-  const T param_;  // User-supplied parameter to the thread function.
-  // When non-NULL, used to block execution until the controller thread
-  // notifies.
-  Notification* const thread_can_start_;
-  bool finished_;  // true iff we know that the thread function has finished.
-  pthread_t thread_;  // The native thread object.
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
-};
-# endif  // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
-         // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
-
-# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
-// Mutex and ThreadLocal have already been imported into the namespace.
-// Nothing to do here.
-
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-
-// Mutex implements mutex on Windows platforms.  It is used in conjunction
-// with class MutexLock:
-//
-//   Mutex mutex;
-//   ...
-//   MutexLock lock(&mutex);  // Acquires the mutex and releases it at the
-//                            // end of the current scope.
-//
-// A static Mutex *must* be defined or declared using one of the following
-// macros:
-//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
-//   GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
-//
-// (A non-static Mutex is defined/declared in the usual way).
-class GTEST_API_ Mutex {
- public:
-  enum MutexType { kStatic = 0, kDynamic = 1 };
-  // We rely on kStaticMutex being 0 as it is to what the linker initializes
-  // type_ in static mutexes.  critical_section_ will be initialized lazily
-  // in ThreadSafeLazyInit().
-  enum StaticConstructorSelector { kStaticMutex = 0 };
-
-  // This constructor intentionally does nothing.  It relies on type_ being
-  // statically initialized to 0 (effectively setting it to kStatic) and on
-  // ThreadSafeLazyInit() to lazily initialize the rest of the members.
-  explicit Mutex(StaticConstructorSelector /*dummy*/) {}
-
-  Mutex();
-  ~Mutex();
-
-  void Lock();
-
-  void Unlock();
-
-  // Does nothing if the current thread holds the mutex. Otherwise, crashes
-  // with high probability.
-  void AssertHeld();
-
- private:
-  // Initializes owner_thread_id_ and critical_section_ in static mutexes.
-  void ThreadSafeLazyInit();
-
-  // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503,
-  // we assume that 0 is an invalid value for thread IDs.
-  unsigned int owner_thread_id_;
-
-  // For static mutexes, we rely on these members being initialized to zeros
-  // by the linker.
-  MutexType type_;
-  long critical_section_init_phase_;  // NOLINT
-  GTEST_CRITICAL_SECTION* critical_section_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
-};
-
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-    extern ::testing::internal::Mutex mutex
-
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-    ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
-
-// We cannot name this class MutexLock because the ctor declaration would
-// conflict with a macro named MutexLock, which is defined on some
-// platforms. That macro is used as a defensive measure to prevent against
-// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
-// "MutexLock l(&mu)".  Hence the typedef trick below.
-class GTestMutexLock {
- public:
-  explicit GTestMutexLock(Mutex* mutex)
-      : mutex_(mutex) { mutex_->Lock(); }
-
-  ~GTestMutexLock() { mutex_->Unlock(); }
-
- private:
-  Mutex* const mutex_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
-};
-
-typedef GTestMutexLock MutexLock;
-
-// Base class for ValueHolder<T>.  Allows a caller to hold and delete a value
-// without knowing its type.
-class ThreadLocalValueHolderBase {
- public:
-  virtual ~ThreadLocalValueHolderBase() {}
-};
-
-// Provides a way for a thread to send notifications to a ThreadLocal
-// regardless of its parameter type.
-class ThreadLocalBase {
- public:
-  // Creates a new ValueHolder<T> object holding a default value passed to
-  // this ThreadLocal<T>'s constructor and returns it.  It is the caller's
-  // responsibility not to call this when the ThreadLocal<T> instance already
-  // has a value on the current thread.
-  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
-
- protected:
-  ThreadLocalBase() {}
-  virtual ~ThreadLocalBase() {}
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
-};
-
-// Maps a thread to a set of ThreadLocals that have values instantiated on that
-// thread and notifies them when the thread exits.  A ThreadLocal instance is
-// expected to persist until all threads it has values on have terminated.
-class GTEST_API_ ThreadLocalRegistry {
- public:
-  // Registers thread_local_instance as having value on the current thread.
-  // Returns a value that can be used to identify the thread from other threads.
-  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
-      const ThreadLocalBase* thread_local_instance);
-
-  // Invoked when a ThreadLocal instance is destroyed.
-  static void OnThreadLocalDestroyed(
-      const ThreadLocalBase* thread_local_instance);
-};
-
-class GTEST_API_ ThreadWithParamBase {
- public:
-  void Join();
-
- protected:
-  class Runnable {
-   public:
-    virtual ~Runnable() {}
-    virtual void Run() = 0;
-  };
-
-  ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
-  virtual ~ThreadWithParamBase();
-
- private:
-  AutoHandle thread_;
-};
-
-// Helper class for testing Google Test's multi-threading constructs.
-template <typename T>
-class ThreadWithParam : public ThreadWithParamBase {
- public:
-  typedef void UserThreadFunc(T);
-
-  ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
-      : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
-  }
-  virtual ~ThreadWithParam() {}
-
- private:
-  class RunnableImpl : public Runnable {
-   public:
-    RunnableImpl(UserThreadFunc* func, T param)
-        : func_(func),
-          param_(param) {
-    }
-    virtual ~RunnableImpl() {}
-    virtual void Run() {
-      func_(param_);
-    }
-
-   private:
-    UserThreadFunc* const func_;
-    const T param_;
-
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
-  };
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
-};
-
-// Implements thread-local storage on Windows systems.
-//
-//   // Thread 1
-//   ThreadLocal<int> tl(100);  // 100 is the default value for each thread.
-//
-//   // Thread 2
-//   tl.set(150);  // Changes the value for thread 2 only.
-//   EXPECT_EQ(150, tl.get());
-//
-//   // Thread 1
-//   EXPECT_EQ(100, tl.get());  // In thread 1, tl has the original value.
-//   tl.set(200);
-//   EXPECT_EQ(200, tl.get());
-//
-// The template type argument T must have a public copy constructor.
-// In addition, the default ThreadLocal constructor requires T to have
-// a public default constructor.
-//
-// The users of a TheadLocal instance have to make sure that all but one
-// threads (including the main one) using that instance have exited before
-// destroying it. Otherwise, the per-thread objects managed for them by the
-// ThreadLocal instance are not guaranteed to be destroyed on all platforms.
-//
-// Google Test only uses global ThreadLocal objects.  That means they
-// will die after main() has returned.  Therefore, no per-thread
-// object managed by Google Test will be leaked as long as all threads
-// using Google Test have exited when main() returns.
-template <typename T>
-class ThreadLocal : public ThreadLocalBase {
- public:
-  ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {}
-  explicit ThreadLocal(const T& value)
-      : default_factory_(new InstanceValueHolderFactory(value)) {}
-
-  ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
-
-  T* pointer() { return GetOrCreateValue(); }
-  const T* pointer() const { return GetOrCreateValue(); }
-  const T& get() const { return *pointer(); }
-  void set(const T& value) { *pointer() = value; }
-
- private:
-  // Holds a value of T.  Can be deleted via its base class without the caller
-  // knowing the type of T.
-  class ValueHolder : public ThreadLocalValueHolderBase {
-   public:
-    ValueHolder() : value_() {}
-    explicit ValueHolder(const T& value) : value_(value) {}
-
-    T* pointer() { return &value_; }
-
-   private:
-    T value_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
-  };
-
-
-  T* GetOrCreateValue() const {
-    return static_cast<ValueHolder*>(
-        ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
-  }
-
-  virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
-    return default_factory_->MakeNewHolder();
-  }
-
-  class ValueHolderFactory {
-   public:
-    ValueHolderFactory() {}
-    virtual ~ValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const = 0;
-
-   private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
-  };
-
-  class DefaultValueHolderFactory : public ValueHolderFactory {
-   public:
-    DefaultValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
-
-   private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
-  };
-
-  class InstanceValueHolderFactory : public ValueHolderFactory {
-   public:
-    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
-    virtual ValueHolder* MakeNewHolder() const {
-      return new ValueHolder(value_);
-    }
-
-   private:
-    const T value_;  // The value for each thread.
-
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
-  };
-
-  std::unique_ptr<ValueHolderFactory> default_factory_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
-};
-
-# elif GTEST_HAS_PTHREAD
-
-// MutexBase and Mutex implement mutex on pthreads-based platforms.
-class MutexBase {
- public:
-  // Acquires this mutex.
-  void Lock() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
-    owner_ = pthread_self();
-    has_owner_ = true;
-  }
-
-  // Releases this mutex.
-  void Unlock() {
-    // Since the lock is being released the owner_ field should no longer be
-    // considered valid. We don't protect writing to has_owner_ here, as it's
-    // the caller's responsibility to ensure that the current thread holds the
-    // mutex when this is called.
-    has_owner_ = false;
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
-  }
-
-  // Does nothing if the current thread holds the mutex. Otherwise, crashes
-  // with high probability.
-  void AssertHeld() const {
-    GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
-        << "The current thread is not holding the mutex @" << this;
-  }
-
-  // A static mutex may be used before main() is entered.  It may even
-  // be used before the dynamic initialization stage.  Therefore we
-  // must be able to initialize a static mutex object at link time.
-  // This means MutexBase has to be a POD and its member variables
-  // have to be public.
- public:
-  pthread_mutex_t mutex_;  // The underlying pthread mutex.
-  // has_owner_ indicates whether the owner_ field below contains a valid thread
-  // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
-  // accesses to the owner_ field should be protected by a check of this field.
-  // An alternative might be to memset() owner_ to all zeros, but there's no
-  // guarantee that a zero'd pthread_t is necessarily invalid or even different
-  // from pthread_self().
-  bool has_owner_;
-  pthread_t owner_;  // The thread holding the mutex.
-};
-
-// Forward-declares a static mutex.
-#  define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-     extern ::testing::internal::MutexBase mutex
-
-// Defines and statically (i.e. at link time) initializes a static mutex.
-// The initialization list here does not explicitly initialize each field,
-// instead relying on default initialization for the unspecified fields. In
-// particular, the owner_ field (a pthread_t) is not explicitly initialized.
-// This allows initialization to work whether pthread_t is a scalar or struct.
-// The flag -Wmissing-field-initializers must not be specified for this to work.
-#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
-  ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0}
-
-// The Mutex class can only be used for mutexes created at runtime. It
-// shares its API with MutexBase otherwise.
-class Mutex : public MutexBase {
- public:
-  Mutex() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
-    has_owner_ = false;
-  }
-  ~Mutex() {
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
-  }
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
-};
-
-// We cannot name this class MutexLock because the ctor declaration would
-// conflict with a macro named MutexLock, which is defined on some
-// platforms. That macro is used as a defensive measure to prevent against
-// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
-// "MutexLock l(&mu)".  Hence the typedef trick below.
-class GTestMutexLock {
- public:
-  explicit GTestMutexLock(MutexBase* mutex)
-      : mutex_(mutex) { mutex_->Lock(); }
-
-  ~GTestMutexLock() { mutex_->Unlock(); }
-
- private:
-  MutexBase* const mutex_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
-};
-
-typedef GTestMutexLock MutexLock;
-
-// Helpers for ThreadLocal.
-
-// pthread_key_create() requires DeleteThreadLocalValue() to have
-// C-linkage.  Therefore it cannot be templatized to access
-// ThreadLocal<T>.  Hence the need for class
-// ThreadLocalValueHolderBase.
-class ThreadLocalValueHolderBase {
- public:
-  virtual ~ThreadLocalValueHolderBase() {}
-};
-
-// Called by pthread to delete thread-local data stored by
-// pthread_setspecific().
-extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
-  delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
-}
-
-// Implements thread-local storage on pthreads-based systems.
-template <typename T>
-class GTEST_API_ ThreadLocal {
- public:
-  ThreadLocal()
-      : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
-  explicit ThreadLocal(const T& value)
-      : key_(CreateKey()),
-        default_factory_(new InstanceValueHolderFactory(value)) {}
-
-  ~ThreadLocal() {
-    // Destroys the managed object for the current thread, if any.
-    DeleteThreadLocalValue(pthread_getspecific(key_));
-
-    // Releases resources associated with the key.  This will *not*
-    // delete managed objects for other threads.
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
-  }
-
-  T* pointer() { return GetOrCreateValue(); }
-  const T* pointer() const { return GetOrCreateValue(); }
-  const T& get() const { return *pointer(); }
-  void set(const T& value) { *pointer() = value; }
-
- private:
-  // Holds a value of type T.
-  class ValueHolder : public ThreadLocalValueHolderBase {
-   public:
-    ValueHolder() : value_() {}
-    explicit ValueHolder(const T& value) : value_(value) {}
-
-    T* pointer() { return &value_; }
-
-   private:
-    T value_;
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
-  };
-
-  static pthread_key_t CreateKey() {
-    pthread_key_t key;
-    // When a thread exits, DeleteThreadLocalValue() will be called on
-    // the object managed for that thread.
-    GTEST_CHECK_POSIX_SUCCESS_(
-        pthread_key_create(&key, &DeleteThreadLocalValue));
-    return key;
-  }
-
-  T* GetOrCreateValue() const {
-    ThreadLocalValueHolderBase* const holder =
-        static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
-    if (holder != nullptr) {
-      return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
-    }
-
-    ValueHolder* const new_holder = default_factory_->MakeNewHolder();
-    ThreadLocalValueHolderBase* const holder_base = new_holder;
-    GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
-    return new_holder->pointer();
-  }
-
-  class ValueHolderFactory {
-   public:
-    ValueHolderFactory() {}
-    virtual ~ValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const = 0;
-
-   private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
-  };
-
-  class DefaultValueHolderFactory : public ValueHolderFactory {
-   public:
-    DefaultValueHolderFactory() {}
-    virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
-
-   private:
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
-  };
-
-  class InstanceValueHolderFactory : public ValueHolderFactory {
-   public:
-    explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
-    virtual ValueHolder* MakeNewHolder() const {
-      return new ValueHolder(value_);
-    }
-
-   private:
-    const T value_;  // The value for each thread.
-
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
-  };
-
-  // A key pthreads uses for looking up per-thread values.
-  const pthread_key_t key_;
-  std::unique_ptr<ValueHolderFactory> default_factory_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
-};
-
-# endif  // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
-
-#else  // GTEST_IS_THREADSAFE
-
-// A dummy implementation of synchronization primitives (mutex, lock,
-// and thread-local variable).  Necessary for compiling Google Test where
-// mutex is not supported - using Google Test in multiple threads is not
-// supported on such platforms.
-
-class Mutex {
- public:
-  Mutex() {}
-  void Lock() {}
-  void Unlock() {}
-  void AssertHeld() const {}
-};
-
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
-  extern ::testing::internal::Mutex mutex
-
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
-
-// We cannot name this class MutexLock because the ctor declaration would
-// conflict with a macro named MutexLock, which is defined on some
-// platforms. That macro is used as a defensive measure to prevent against
-// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
-// "MutexLock l(&mu)".  Hence the typedef trick below.
-class GTestMutexLock {
- public:
-  explicit GTestMutexLock(Mutex*) {}  // NOLINT
-};
-
-typedef GTestMutexLock MutexLock;
-
-template <typename T>
-class GTEST_API_ ThreadLocal {
- public:
-  ThreadLocal() : value_() {}
-  explicit ThreadLocal(const T& value) : value_(value) {}
-  T* pointer() { return &value_; }
-  const T* pointer() const { return &value_; }
-  const T& get() const { return value_; }
-  void set(const T& value) { value_ = value; }
- private:
-  T value_;
-};
-
-#endif  // GTEST_IS_THREADSAFE
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-GTEST_API_ size_t GetThreadCount();
-
-template <bool bool_value>
-struct bool_constant {
-  typedef bool_constant<bool_value> type;
-  static const bool value = bool_value;
-};
-template <bool bool_value> const bool bool_constant<bool_value>::value;
-
-typedef bool_constant<false> false_type;
-typedef bool_constant<true> true_type;
-
-template <typename T, typename U>
-struct is_same : public false_type {};
-
-template <typename T>
-struct is_same<T, T> : public true_type {};
-
-template <typename Iterator>
-struct IteratorTraits {
-  typedef typename Iterator::value_type value_type;
-};
-
-
-template <typename T>
-struct IteratorTraits<T*> {
-  typedef T value_type;
-};
-
-template <typename T>
-struct IteratorTraits<const T*> {
-  typedef T value_type;
-};
-
-#if GTEST_OS_WINDOWS
-# define GTEST_PATH_SEP_ "\\"
-# define GTEST_HAS_ALT_PATH_SEP_ 1
-// The biggest signed integer type the compiler supports.
-typedef __int64 BiggestInt;
-#else
-# define GTEST_PATH_SEP_ "/"
-# define GTEST_HAS_ALT_PATH_SEP_ 0
-typedef long long BiggestInt;  // NOLINT
-#endif  // GTEST_OS_WINDOWS
-
-// Utilities for char.
-
-// isspace(int ch) and friends accept an unsigned char or EOF.  char
-// may be signed, depending on the compiler (or compiler flags).
-// Therefore we need to cast a char to unsigned char before calling
-// isspace(), etc.
-
-inline bool IsAlpha(char ch) {
-  return isalpha(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsAlNum(char ch) {
-  return isalnum(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsDigit(char ch) {
-  return isdigit(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsLower(char ch) {
-  return islower(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsSpace(char ch) {
-  return isspace(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsUpper(char ch) {
-  return isupper(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsXDigit(char ch) {
-  return isxdigit(static_cast<unsigned char>(ch)) != 0;
-}
-inline bool IsXDigit(wchar_t ch) {
-  const unsigned char low_byte = static_cast<unsigned char>(ch);
-  return ch == low_byte && isxdigit(low_byte) != 0;
-}
-
-inline char ToLower(char ch) {
-  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
-}
-inline char ToUpper(char ch) {
-  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
-}
-
-inline std::string StripTrailingSpaces(std::string str) {
-  std::string::iterator it = str.end();
-  while (it != str.begin() && IsSpace(*--it))
-    it = str.erase(it);
-  return str;
-}
-
-// The testing::internal::posix namespace holds wrappers for common
-// POSIX functions.  These wrappers hide the differences between
-// Windows/MSVC and POSIX systems.  Since some compilers define these
-// standard functions as macros, the wrapper cannot have the same name
-// as the wrapped function.
-
-namespace posix {
-
-// Functions with a different name on Windows.
-
-#if GTEST_OS_WINDOWS
-
-typedef struct _stat StatStruct;
-
-# ifdef __BORLANDC__
-inline int IsATTY(int fd) { return isatty(fd); }
-inline int StrCaseCmp(const char* s1, const char* s2) {
-  return stricmp(s1, s2);
-}
-inline char* StrDup(const char* src) { return strdup(src); }
-# else  // !__BORLANDC__
-#  if GTEST_OS_WINDOWS_MOBILE
-inline int IsATTY(int /* fd */) { return 0; }
-#  else
-inline int IsATTY(int fd) { return _isatty(fd); }
-#  endif  // GTEST_OS_WINDOWS_MOBILE
-inline int StrCaseCmp(const char* s1, const char* s2) {
-  return _stricmp(s1, s2);
-}
-inline char* StrDup(const char* src) { return _strdup(src); }
-# endif  // __BORLANDC__
-
-# if GTEST_OS_WINDOWS_MOBILE
-inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
-// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
-// time and thus not defined there.
-# else
-inline int FileNo(FILE* file) { return _fileno(file); }
-inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
-inline int RmDir(const char* dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct& st) {
-  return (_S_IFDIR & st.st_mode) != 0;
-}
-# endif  // GTEST_OS_WINDOWS_MOBILE
-
-#else
-
-typedef struct stat StatStruct;
-
-inline int FileNo(FILE* file) { return fileno(file); }
-inline int IsATTY(int fd) { return isatty(fd); }
-inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
-inline int StrCaseCmp(const char* s1, const char* s2) {
-  return strcasecmp(s1, s2);
-}
-inline char* StrDup(const char* src) { return strdup(src); }
-inline int RmDir(const char* dir) { return rmdir(dir); }
-inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
-
-#endif  // GTEST_OS_WINDOWS
-
-// Functions deprecated by MSVC 8.0.
-
-GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
-
-inline const char* StrNCpy(char* dest, const char* src, size_t n) {
-  return strncpy(dest, src, n);
-}
-
-// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
-// StrError() aren't needed on Windows CE at this time and thus not
-// defined there.
-
-#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-inline int ChDir(const char* dir) {
-#if GTEST_OS_WINDOWS
-  return _chdir(dir);
-#else
-  return chdir(dir);
-#endif
-}
-#endif
-inline FILE* FOpen(const char* path, const char* mode) {
-  return fopen(path, mode);
-}
-#if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
-  return freopen(path, mode, stream);
-}
-inline FILE* FDOpen(int fd, const char* mode) {
-#if GTEST_OS_WINDOWS
-  return _fdopen(fd, mode);
-#else
-  return fdopen(fd, mode);
-#endif
-}
-#endif
-inline int FClose(FILE* fp) { return fclose(fp); }
-#if !GTEST_OS_WINDOWS_MOBILE
-inline int Read(int fd, void* buf, unsigned int count) {
-#if GTEST_OS_WINDOWS
-  return static_cast<int>(_read(fd, buf, count));
-#else
-  return static_cast<int>(read(fd, buf, count));
-#endif
-}
-inline int Write(int fd, const void* buf, unsigned int count) {
-#if GTEST_OS_WINDOWS
-  return static_cast<int>(_write(fd, buf, count));
-#else
-  return static_cast<int>(write(fd, buf, count));
-#endif
-}
-inline int Close(int fd) {
-#if GTEST_OS_WINDOWS
-  return _close(fd);
-#else
-  return close(fd);
-#endif
-}
-inline const char* StrError(int errnum) { return strerror(errnum); }
-#endif
-inline const char* GetEnv(const char* name) {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
-  // We are on Windows CE, which has no environment variables.
-  static_cast<void>(name);  // To prevent 'unused argument' warning.
-  return nullptr;
-#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
-  // Environment variables which we programmatically clear will be set to the
-  // empty string rather than unset (NULL).  Handle that case.
-  const char* const env = getenv(name);
-  return (env != nullptr && env[0] != '\0') ? env : nullptr;
-#else
-  return getenv(name);
-#endif
-}
-
-GTEST_DISABLE_MSC_DEPRECATED_POP_()
-
-#if GTEST_OS_WINDOWS_MOBILE
-// Windows CE has no C library. The abort() function is used in
-// several places in Google Test. This implementation provides a reasonable
-// imitation of standard behaviour.
-[[noreturn]] void Abort();
-#else
-[[noreturn]] inline void Abort() { abort(); }
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-}  // namespace posix
-
-// MSVC "deprecates" snprintf and issues warnings wherever it is used.  In
-// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
-// MSVC-based platforms.  We map the GTEST_SNPRINTF_ macro to the appropriate
-// function in order to achieve that.  We use macro definition here because
-// snprintf is a variadic function.
-#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
-// MSVC 2005 and above support variadic macros.
-# define GTEST_SNPRINTF_(buffer, size, format, ...) \
-     _snprintf_s(buffer, size, size, format, __VA_ARGS__)
-#elif defined(_MSC_VER)
-// Windows CE does not define _snprintf_s
-# define GTEST_SNPRINTF_ _snprintf
-#else
-# define GTEST_SNPRINTF_ snprintf
-#endif
-
-// The maximum number a BiggestInt can represent.  This definition
-// works no matter BiggestInt is represented in one's complement or
-// two's complement.
-//
-// We cannot rely on numeric_limits in STL, as __int64 and long long
-// are not part of standard C++ and numeric_limits doesn't need to be
-// defined for them.
-const BiggestInt kMaxBiggestInt =
-    ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
-
-// This template class serves as a compile-time function from size to
-// type.  It maps a size in bytes to a primitive type with that
-// size. e.g.
-//
-//   TypeWithSize<4>::UInt
-//
-// is typedef-ed to be unsigned int (unsigned integer made up of 4
-// bytes).
-//
-// Such functionality should belong to STL, but I cannot find it
-// there.
-//
-// Google Test uses this class in the implementation of floating-point
-// comparison.
-//
-// For now it only handles UInt (unsigned int) as that's all Google Test
-// needs.  Other types can be easily added in the future if need
-// arises.
-template <size_t size>
-class TypeWithSize {
- public:
-  // This prevents the user from using TypeWithSize<N> with incorrect
-  // values of N.
-  typedef void UInt;
-};
-
-// The specialization for size 4.
-template <>
-class TypeWithSize<4> {
- public:
-  // unsigned int has size 4 in both gcc and MSVC.
-  //
-  // As base/basictypes.h doesn't compile on Windows, we cannot use
-  // uint32, uint64, and etc here.
-  typedef int Int;
-  typedef unsigned int UInt;
-};
-
-// The specialization for size 8.
-template <>
-class TypeWithSize<8> {
- public:
-#if GTEST_OS_WINDOWS
-  typedef __int64 Int;
-  typedef unsigned __int64 UInt;
-#else
-  typedef long long Int;  // NOLINT
-  typedef unsigned long long UInt;  // NOLINT
-#endif  // GTEST_OS_WINDOWS
-};
-
-// Integer types of known sizes.
-typedef TypeWithSize<4>::Int Int32;
-typedef TypeWithSize<4>::UInt UInt32;
-typedef TypeWithSize<8>::Int Int64;
-typedef TypeWithSize<8>::UInt UInt64;
-typedef TypeWithSize<8>::Int TimeInMillis;  // Represents time in milliseconds.
-
-// Utilities for command line flags and environment variables.
-
-// Macro for referencing flags.
-#if !defined(GTEST_FLAG)
-# define GTEST_FLAG(name) FLAGS_gtest_##name
-#endif  // !defined(GTEST_FLAG)
-
-#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
-#endif  // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-
-#if !defined(GTEST_DECLARE_bool_)
-# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
-
-// Macros for declaring flags.
-# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-# define GTEST_DECLARE_int32_(name) \
-    GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
-# define GTEST_DECLARE_string_(name) \
-    GTEST_API_ extern ::std::string GTEST_FLAG(name)
-
-// Macros for defining flags.
-# define GTEST_DEFINE_bool_(name, default_val, doc) \
-    GTEST_API_ bool GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_int32_(name, default_val, doc) \
-    GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_string_(name, default_val, doc) \
-    GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
-
-#endif  // !defined(GTEST_DECLARE_bool_)
-
-// Thread annotations
-#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-# define GTEST_LOCK_EXCLUDED_(locks)
-#endif  // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-
-// Parses 'str' for a 32-bit signed integer.  If successful, writes the result
-// to *value and returns true; otherwise leaves *value unchanged and returns
-// false.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value);
-
-// Parses a bool/Int32/string from the environment variable
-// corresponding to the given Google Test flag.
-bool BoolFromGTestEnv(const char* flag, bool default_val);
-GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
-std::string OutputFlagAlsoCheckEnvVar();
-const char* StringFromGTestEnv(const char* flag, const char* default_val);
-
-}  // namespace internal
-}  // namespace testing
-
-#if !defined(GTEST_INTERNAL_DEPRECATED)
-
-// Internal Macro to mark an API deprecated, for googletest usage only
-// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or
-// GTEST_INTERNAL_DEPRECATED(message) <return_type> myFunction(); Every usage of
-// a deprecated entity will trigger a warning when compiled with
-// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler).
-// For msvc /W3 option will need to be used
-// Note that for 'other' compilers this macro evaluates to nothing to prevent
-// compilations errors.
-#if defined(_MSC_VER)
-#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message))
-#elif defined(__GNUC__)
-#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message)))
-#else
-#define GTEST_INTERNAL_DEPRECATED(message)
-#endif
-
-#endif  // !defined(GTEST_INTERNAL_DEPRECATED)
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-string.h b/deps/googletest/include/gtest/internal/gtest-string.h
deleted file mode 100644
index 884b1e16f..000000000
--- a/deps/googletest/include/gtest/internal/gtest-string.h
+++ /dev/null
@@ -1,170 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This header file declares the String class and functions used internally by
-// Google Test.  They are subject to change without notice. They should not used
-// by code external to Google Test.
-//
-// This header file is #included by gtest-internal.h.
-// It should not be #included by other files.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-
-#ifdef __BORLANDC__
-// string.h is not guaranteed to provide strcpy on C++ Builder.
-# include <mem.h>
-#endif
-
-#include <string.h>
-#include <string>
-
-#include "gtest/internal/gtest-port.h"
-
-namespace testing {
-namespace internal {
-
-// String - an abstract class holding static string utilities.
-class GTEST_API_ String {
- public:
-  // Static utility methods
-
-  // Clones a 0-terminated C string, allocating memory using new.  The
-  // caller is responsible for deleting the return value using
-  // delete[].  Returns the cloned string, or NULL if the input is
-  // NULL.
-  //
-  // This is different from strdup() in string.h, which allocates
-  // memory using malloc().
-  static const char* CloneCString(const char* c_str);
-
-#if GTEST_OS_WINDOWS_MOBILE
-  // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
-  // able to pass strings to Win32 APIs on CE we need to convert them
-  // to 'Unicode', UTF-16.
-
-  // Creates a UTF-16 wide string from the given ANSI string, allocating
-  // memory using new. The caller is responsible for deleting the return
-  // value using delete[]. Returns the wide string, or NULL if the
-  // input is NULL.
-  //
-  // The wide string is created using the ANSI codepage (CP_ACP) to
-  // match the behaviour of the ANSI versions of Win32 calls and the
-  // C runtime.
-  static LPCWSTR AnsiToUtf16(const char* c_str);
-
-  // Creates an ANSI string from the given wide string, allocating
-  // memory using new. The caller is responsible for deleting the return
-  // value using delete[]. Returns the ANSI string, or NULL if the
-  // input is NULL.
-  //
-  // The returned string is created using the ANSI codepage (CP_ACP) to
-  // match the behaviour of the ANSI versions of Win32 calls and the
-  // C runtime.
-  static const char* Utf16ToAnsi(LPCWSTR utf16_str);
-#endif
-
-  // Compares two C strings.  Returns true iff they have the same content.
-  //
-  // Unlike strcmp(), this function can handle NULL argument(s).  A
-  // NULL C string is considered different to any non-NULL C string,
-  // including the empty string.
-  static bool CStringEquals(const char* lhs, const char* rhs);
-
-  // Converts a wide C string to a String using the UTF-8 encoding.
-  // NULL will be converted to "(null)".  If an error occurred during
-  // the conversion, "(failed to convert from wide string)" is
-  // returned.
-  static std::string ShowWideCString(const wchar_t* wide_c_str);
-
-  // Compares two wide C strings.  Returns true iff they have the same
-  // content.
-  //
-  // Unlike wcscmp(), this function can handle NULL argument(s).  A
-  // NULL C string is considered different to any non-NULL C string,
-  // including the empty string.
-  static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
-
-  // Compares two C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike strcasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL C string,
-  // including the empty string.
-  static bool CaseInsensitiveCStringEquals(const char* lhs,
-                                           const char* rhs);
-
-  // Compares two wide C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike wcscasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL wide C string,
-  // including the empty string.
-  // NB: The implementations on different platforms slightly differ.
-  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
-  // environment variable. On GNU platform this method uses wcscasecmp
-  // which compares according to LC_CTYPE category of the current locale.
-  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
-  // current locale.
-  static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
-                                               const wchar_t* rhs);
-
-  // Returns true iff the given string ends with the given suffix, ignoring
-  // case. Any string is considered to end with an empty suffix.
-  static bool EndsWithCaseInsensitive(
-      const std::string& str, const std::string& suffix);
-
-  // Formats an int value as "%02d".
-  static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
-
-  // Formats an int value as "%X".
-  static std::string FormatHexInt(int value);
-
-  // Formats an int value as "%X".
-  static std::string FormatHexUInt32(UInt32 value);
-
-  // Formats a byte as "%02X".
-  static std::string FormatByte(unsigned char value);
-
- private:
-  String();  // Not meant to be instantiated.
-};  // class String
-
-// Gets the content of the stringstream's buffer as an std::string.  Each '\0'
-// character in the buffer is replaced with "\\0".
-GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-type-util.h b/deps/googletest/include/gtest/internal/gtest-type-util.h
deleted file mode 100644
index 4cd1cf3ce..000000000
--- a/deps/googletest/include/gtest/internal/gtest-type-util.h
+++ /dev/null
@@ -1,3347 +0,0 @@
-// This file was GENERATED by command:
-//     pump.py gtest-type-util.h.pump
-// DO NOT EDIT BY HAND!!!
-
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Type utilities needed for implementing typed and type-parameterized
-// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently we support at most 50 types in a list, and at most 50
-// type-parameterized tests in one type-parameterized test suite.
-// Please contact googletestframework@googlegroups.com if you need
-// more.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-
-#include "gtest/internal/gtest-port.h"
-
-// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
-// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-#  include <cxxabi.h>
-# elif defined(__HP_aCC)
-#  include <acxx_demangle.h>
-# endif  // GTEST_HASH_CXXABI_H_
-
-namespace testing {
-namespace internal {
-
-// Canonicalizes a given name with respect to the Standard C++ Library.
-// This handles removing the inline namespace within `std` that is
-// used by various standard libraries (e.g., `std::__1`).  Names outside
-// of namespace std are returned unmodified.
-inline std::string CanonicalizeForStdLibVersioning(std::string s) {
-  static const char prefix[] = "std::__";
-  if (s.compare(0, strlen(prefix), prefix) == 0) {
-    std::string::size_type end = s.find("::", strlen(prefix));
-    if (end != s.npos) {
-      // Erase everything between the initial `std` and the second `::`.
-      s.erase(strlen("std"), end - strlen("std"));
-    }
-  }
-  return s;
-}
-
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
-  int status = 0;
-  // gcc's implementation of typeid(T).name() mangles the type name,
-  // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
-  using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
-  char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
-  const std::string name_str(status == 0 ? readable_name : name);
-  free(readable_name);
-  return CanonicalizeForStdLibVersioning(name_str);
-#  else
-  return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
-
-  return "<type>";
-
-# endif  // GTEST_HAS_RTTI
-}
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// AssertyTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
-// type.  This can be used as a compile-time assertion to ensure that
-// two types are equal.
-
-template <typename T1, typename T2>
-struct AssertTypeEq;
-
-template <typename T>
-struct AssertTypeEq<T, T> {
-  typedef bool type;
-};
-
-// A unique type used as the default value for the arguments of class
-// template Types.  This allows us to simulate variadic templates
-// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
-// support directly.
-struct None {};
-
-// The following family of struct and struct templates are used to
-// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
-// represents a type list with N types (T1, T2, ..., and TN) in it.
-// Except for Types0, every struct in the family has two member types:
-// Head for the first type in the list, and Tail for the rest of the
-// list.
-
-// The empty type list.
-struct Types0 {};
-
-// Type lists of length 1, 2, 3, and so on.
-
-template <typename T1>
-struct Types1 {
-  typedef T1 Head;
-  typedef Types0 Tail;
-};
-template <typename T1, typename T2>
-struct Types2 {
-  typedef T1 Head;
-  typedef Types1<T2> Tail;
-};
-
-template <typename T1, typename T2, typename T3>
-struct Types3 {
-  typedef T1 Head;
-  typedef Types2<T2, T3> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types4 {
-  typedef T1 Head;
-  typedef Types3<T2, T3, T4> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types5 {
-  typedef T1 Head;
-  typedef Types4<T2, T3, T4, T5> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types6 {
-  typedef T1 Head;
-  typedef Types5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types7 {
-  typedef T1 Head;
-  typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types8 {
-  typedef T1 Head;
-  typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types9 {
-  typedef T1 Head;
-  typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types10 {
-  typedef T1 Head;
-  typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types11 {
-  typedef T1 Head;
-  typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types12 {
-  typedef T1 Head;
-  typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types13 {
-  typedef T1 Head;
-  typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types14 {
-  typedef T1 Head;
-  typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types15 {
-  typedef T1 Head;
-  typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types16 {
-  typedef T1 Head;
-  typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types17 {
-  typedef T1 Head;
-  typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types18 {
-  typedef T1 Head;
-  typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types19 {
-  typedef T1 Head;
-  typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types20 {
-  typedef T1 Head;
-  typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types21 {
-  typedef T1 Head;
-  typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types22 {
-  typedef T1 Head;
-  typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types23 {
-  typedef T1 Head;
-  typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types24 {
-  typedef T1 Head;
-  typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types25 {
-  typedef T1 Head;
-  typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types26 {
-  typedef T1 Head;
-  typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types27 {
-  typedef T1 Head;
-  typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types28 {
-  typedef T1 Head;
-  typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types29 {
-  typedef T1 Head;
-  typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types30 {
-  typedef T1 Head;
-  typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types31 {
-  typedef T1 Head;
-  typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types32 {
-  typedef T1 Head;
-  typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types33 {
-  typedef T1 Head;
-  typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types34 {
-  typedef T1 Head;
-  typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types35 {
-  typedef T1 Head;
-  typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types36 {
-  typedef T1 Head;
-  typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types37 {
-  typedef T1 Head;
-  typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types38 {
-  typedef T1 Head;
-  typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types39 {
-  typedef T1 Head;
-  typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types40 {
-  typedef T1 Head;
-  typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types41 {
-  typedef T1 Head;
-  typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types42 {
-  typedef T1 Head;
-  typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types43 {
-  typedef T1 Head;
-  typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types44 {
-  typedef T1 Head;
-  typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types45 {
-  typedef T1 Head;
-  typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types46 {
-  typedef T1 Head;
-  typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types47 {
-  typedef T1 Head;
-  typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types48 {
-  typedef T1 Head;
-  typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types49 {
-  typedef T1 Head;
-  typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49> Tail;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct Types50 {
-  typedef T1 Head;
-  typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-      T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-      T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-      T44, T45, T46, T47, T48, T49, T50> Tail;
-};
-
-
-}  // namespace internal
-
-// We don't want to require the users to write TypesN<...> directly,
-// as that would require them to count the length.  Types<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Types<int>
-// will appear as Types<int, None, None, ..., None> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Types<T1, ..., TN>, and Google Test will translate
-// that to TypesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Types template.
-template <typename T1 = internal::None, typename T2 = internal::None,
-    typename T3 = internal::None, typename T4 = internal::None,
-    typename T5 = internal::None, typename T6 = internal::None,
-    typename T7 = internal::None, typename T8 = internal::None,
-    typename T9 = internal::None, typename T10 = internal::None,
-    typename T11 = internal::None, typename T12 = internal::None,
-    typename T13 = internal::None, typename T14 = internal::None,
-    typename T15 = internal::None, typename T16 = internal::None,
-    typename T17 = internal::None, typename T18 = internal::None,
-    typename T19 = internal::None, typename T20 = internal::None,
-    typename T21 = internal::None, typename T22 = internal::None,
-    typename T23 = internal::None, typename T24 = internal::None,
-    typename T25 = internal::None, typename T26 = internal::None,
-    typename T27 = internal::None, typename T28 = internal::None,
-    typename T29 = internal::None, typename T30 = internal::None,
-    typename T31 = internal::None, typename T32 = internal::None,
-    typename T33 = internal::None, typename T34 = internal::None,
-    typename T35 = internal::None, typename T36 = internal::None,
-    typename T37 = internal::None, typename T38 = internal::None,
-    typename T39 = internal::None, typename T40 = internal::None,
-    typename T41 = internal::None, typename T42 = internal::None,
-    typename T43 = internal::None, typename T44 = internal::None,
-    typename T45 = internal::None, typename T46 = internal::None,
-    typename T47 = internal::None, typename T48 = internal::None,
-    typename T49 = internal::None, typename T50 = internal::None>
-struct Types {
-  typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
-};
-
-template <>
-struct Types<internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types0 type;
-};
-template <typename T1>
-struct Types<T1, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types1<T1> type;
-};
-template <typename T1, typename T2>
-struct Types<T1, T2, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types2<T1, T2> type;
-};
-template <typename T1, typename T2, typename T3>
-struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types3<T1, T2, T3> type;
-};
-template <typename T1, typename T2, typename T3, typename T4>
-struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types4<T1, T2, T3, T4> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types5<T1, T2, T3, T4, T5> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6>
-struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7>
-struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
-      T12> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
-      T26> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
-      T40> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None, internal::None> {
-  typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None, internal::None> {
-  typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    internal::None, internal::None, internal::None, internal::None,
-    internal::None> {
-  typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, internal::None, internal::None, internal::None, internal::None> {
-  typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, internal::None, internal::None, internal::None> {
-  typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, internal::None, internal::None> {
-  typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49>
-struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
-    T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
-    T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
-    T46, T47, T48, T49, internal::None> {
-  typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
-};
-
-namespace internal {
-
-# define GTEST_TEMPLATE_ template <typename T> class
-
-// The template "selector" struct TemplateSel<Tmpl> is used to
-// represent Tmpl, which must be a class template with one type
-// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
-// as the type Tmpl<T>.  This allows us to actually instantiate the
-// template "selected" by TemplateSel<Tmpl>.
-//
-// This trick is necessary for simulating typedef for class templates,
-// which C++ doesn't support directly.
-template <GTEST_TEMPLATE_ Tmpl>
-struct TemplateSel {
-  template <typename T>
-  struct Bind {
-    typedef Tmpl<T> type;
-  };
-};
-
-# define GTEST_BIND_(TmplSel, T) \
-  TmplSel::template Bind<T>::type
-
-// A unique struct template used as the default value for the
-// arguments of class template Templates.  This allows us to simulate
-// variadic templates (e.g. Templates<int>, Templates<int, double>,
-// and etc), which C++ doesn't support directly.
-template <typename T>
-struct NoneT {};
-
-// The following family of struct and struct templates are used to
-// represent template lists.  In particular, TemplatesN<T1, T2, ...,
-// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
-// for Templates0, every struct in the family has two member types:
-// Head for the selector of the first template in the list, and Tail
-// for the rest of the list.
-
-// The empty template list.
-struct Templates0 {};
-
-// Template lists of length 1, 2, 3, and so on.
-
-template <GTEST_TEMPLATE_ T1>
-struct Templates1 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates0 Tail;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates2 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates1<T2> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates3 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates2<T2, T3> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates4 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates3<T2, T3, T4> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates5 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates4<T2, T3, T4, T5> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates6 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates5<T2, T3, T4, T5, T6> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates7 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates8 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates9 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates10 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates11 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates12 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates13 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates14 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates15 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates16 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates17 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates18 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates19 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates20 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates21 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates22 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates23 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates24 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates25 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates26 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates27 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates28 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates29 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates30 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates31 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates32 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates33 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates34 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates35 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates36 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates37 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates38 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates39 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates40 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates41 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates42 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates43 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates44 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates45 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates46 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates47 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates48 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates49 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49> Tail;
-};
-
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
-struct Templates50 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-      T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-      T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
-      T43, T44, T45, T46, T47, T48, T49, T50> Tail;
-};
-
-
-// We don't want to require the users to write TemplatesN<...> directly,
-// as that would require them to count the length.  Templates<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Templates<list>
-// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Templates<T1, ..., TN>, and Google Test will translate
-// that to TemplatesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Templates template.
-template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
-    GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
-    GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
-    GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
-    GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
-    GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
-    GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
-    GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
-    GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
-    GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
-    GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
-    GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
-    GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
-    GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
-    GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
-    GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
-    GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
-    GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
-    GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
-    GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
-    GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
-    GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
-    GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
-    GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
-    GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
-struct Templates {
-  typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
-};
-
-template <>
-struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates0 type;
-};
-template <GTEST_TEMPLATE_ T1>
-struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates1<T1> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
-struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates2<T1, T2> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
-struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates3<T1, T2, T3> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4>
-struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates4<T1, T2, T3, T4> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
-struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates5<T1, T2, T3, T4, T5> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
-struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates6<T1, T2, T3, T4, T5, T6> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT> {
-  typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT> {
-  typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT> {
-  typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, NoneT, NoneT, NoneT, NoneT> {
-  typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, NoneT, NoneT, NoneT> {
-  typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, NoneT, NoneT> {
-  typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48> type;
-};
-template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
-    GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
-    GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
-    GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
-    GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
-    GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
-    GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
-    GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
-    GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
-    GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
-    GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
-    GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
-    GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
-    GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
-    GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
-    GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
-    GTEST_TEMPLATE_ T49>
-struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
-    T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
-    T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
-    T45, T46, T47, T48, T49, NoneT> {
-  typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-      T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
-      T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
-      T42, T43, T44, T45, T46, T47, T48, T49> type;
-};
-
-// The TypeList template makes it possible to use either a single type
-// or a Types<...> list in TYPED_TEST_SUITE() and
-// INSTANTIATE_TYPED_TEST_SUITE_P().
-
-template <typename T>
-struct TypeList {
-  typedef Types1<T> type;
-};
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8, typename T9, typename T10,
-    typename T11, typename T12, typename T13, typename T14, typename T15,
-    typename T16, typename T17, typename T18, typename T19, typename T20,
-    typename T21, typename T22, typename T23, typename T24, typename T25,
-    typename T26, typename T27, typename T28, typename T29, typename T30,
-    typename T31, typename T32, typename T33, typename T34, typename T35,
-    typename T36, typename T37, typename T38, typename T39, typename T40,
-    typename T41, typename T42, typename T43, typename T44, typename T45,
-    typename T46, typename T47, typename T48, typename T49, typename T50>
-struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
-    T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
-    T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
-    T44, T45, T46, T47, T48, T49, T50> > {
-  typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
-      T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
-      T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
-      T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
-};
-
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/deps/googletest/include/gtest/internal/gtest-type-util.h.pump b/deps/googletest/include/gtest/internal/gtest-type-util.h.pump
deleted file mode 100644
index eb014ee1b..000000000
--- a/deps/googletest/include/gtest/internal/gtest-type-util.h.pump
+++ /dev/null
@@ -1,314 +0,0 @@
-$$ -*- mode: c++; -*-
-$var n = 50  $$ Maximum length of type lists we want to support.
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-// Type utilities needed for implementing typed and type-parameterized
-// tests.  This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
-// Currently we support at most $n types in a list, and at most $n
-// type-parameterized tests in one type-parameterized test suite.
-// Please contact googletestframework@googlegroups.com if you need
-// more.
-
-// GOOGLETEST_CM0001 DO NOT DELETE
-
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-
-#include "gtest/internal/gtest-port.h"
-
-// #ifdef __GNUC__ is too general here.  It is possible to use gcc without using
-// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-#  include <cxxabi.h>
-# elif defined(__HP_aCC)
-#  include <acxx_demangle.h>
-# endif  // GTEST_HASH_CXXABI_H_
-
-namespace testing {
-namespace internal {
-
-// Canonicalizes a given name with respect to the Standard C++ Library.
-// This handles removing the inline namespace within `std` that is
-// used by various standard libraries (e.g., `std::__1`).  Names outside
-// of namespace std are returned unmodified.
-inline std::string CanonicalizeForStdLibVersioning(std::string s) {
-  static const char prefix[] = "std::__";
-  if (s.compare(0, strlen(prefix), prefix) == 0) {
-    std::string::size_type end = s.find("::", strlen(prefix));
-    if (end != s.npos) {
-      // Erase everything between the initial `std` and the second `::`.
-      s.erase(strlen("std"), end - strlen("std"));
-    }
-  }
-  return s;
-}
-
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
-  int status = 0;
-  // gcc's implementation of typeid(T).name() mangles the type name,
-  // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
-  using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
-  char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
-  const std::string name_str(status == 0 ? readable_name : name);
-  free(readable_name);
-  return CanonicalizeForStdLibVersioning(name_str);
-#  else
-  return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
-
-  return "<type>";
-
-# endif  // GTEST_HAS_RTTI
-}
-
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-// AssertyTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
-// type.  This can be used as a compile-time assertion to ensure that
-// two types are equal.
-
-template <typename T1, typename T2>
-struct AssertTypeEq;
-
-template <typename T>
-struct AssertTypeEq<T, T> {
-  typedef bool type;
-};
-
-// A unique type used as the default value for the arguments of class
-// template Types.  This allows us to simulate variadic templates
-// (e.g. Types<int>, Type<int, double>, and etc), which C++ doesn't
-// support directly.
-struct None {};
-
-// The following family of struct and struct templates are used to
-// represent type lists.  In particular, TypesN<T1, T2, ..., TN>
-// represents a type list with N types (T1, T2, ..., and TN) in it.
-// Except for Types0, every struct in the family has two member types:
-// Head for the first type in the list, and Tail for the rest of the
-// list.
-
-// The empty type list.
-struct Types0 {};
-
-// Type lists of length 1, 2, 3, and so on.
-
-template <typename T1>
-struct Types1 {
-  typedef T1 Head;
-  typedef Types0 Tail;
-};
-
-$range i 2..n
-
-$for i [[
-$range j 1..i
-$range k 2..i
-template <$for j, [[typename T$j]]>
-struct Types$i {
-  typedef T1 Head;
-  typedef Types$(i-1)<$for k, [[T$k]]> Tail;
-};
-
-
-]]
-
-}  // namespace internal
-
-// We don't want to require the users to write TypesN<...> directly,
-// as that would require them to count the length.  Types<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Types<int>
-// will appear as Types<int, None, None, ..., None> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Types<T1, ..., TN>, and Google Test will translate
-// that to TypesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Types template.
-
-$range i 1..n
-template <$for i, [[typename T$i = internal::None]]>
-struct Types {
-  typedef internal::Types$n<$for i, [[T$i]]> type;
-};
-
-template <>
-struct Types<$for i, [[internal::None]]> {
-  typedef internal::Types0 type;
-};
-
-$range i 1..n-1
-$for i [[
-$range j 1..i
-$range k i+1..n
-template <$for j, [[typename T$j]]>
-struct Types<$for j, [[T$j]]$for k[[, internal::None]]> {
-  typedef internal::Types$i<$for j, [[T$j]]> type;
-};
-
-]]
-
-namespace internal {
-
-# define GTEST_TEMPLATE_ template <typename T> class
-
-// The template "selector" struct TemplateSel<Tmpl> is used to
-// represent Tmpl, which must be a class template with one type
-// parameter, as a type.  TemplateSel<Tmpl>::Bind<T>::type is defined
-// as the type Tmpl<T>.  This allows us to actually instantiate the
-// template "selected" by TemplateSel<Tmpl>.
-//
-// This trick is necessary for simulating typedef for class templates,
-// which C++ doesn't support directly.
-template <GTEST_TEMPLATE_ Tmpl>
-struct TemplateSel {
-  template <typename T>
-  struct Bind {
-    typedef Tmpl<T> type;
-  };
-};
-
-# define GTEST_BIND_(TmplSel, T) \
-  TmplSel::template Bind<T>::type
-
-// A unique struct template used as the default value for the
-// arguments of class template Templates.  This allows us to simulate
-// variadic templates (e.g. Templates<int>, Templates<int, double>,
-// and etc), which C++ doesn't support directly.
-template <typename T>
-struct NoneT {};
-
-// The following family of struct and struct templates are used to
-// represent template lists.  In particular, TemplatesN<T1, T2, ...,
-// TN> represents a list of N templates (T1, T2, ..., and TN).  Except
-// for Templates0, every struct in the family has two member types:
-// Head for the selector of the first template in the list, and Tail
-// for the rest of the list.
-
-// The empty template list.
-struct Templates0 {};
-
-// Template lists of length 1, 2, 3, and so on.
-
-template <GTEST_TEMPLATE_ T1>
-struct Templates1 {
-  typedef TemplateSel<T1> Head;
-  typedef Templates0 Tail;
-};
-
-$range i 2..n
-
-$for i [[
-$range j 1..i
-$range k 2..i
-template <$for j, [[GTEST_TEMPLATE_ T$j]]>
-struct Templates$i {
-  typedef TemplateSel<T1> Head;
-  typedef Templates$(i-1)<$for k, [[T$k]]> Tail;
-};
-
-
-]]
-
-// We don't want to require the users to write TemplatesN<...> directly,
-// as that would require them to count the length.  Templates<...> is much
-// easier to write, but generates horrible messages when there is a
-// compiler error, as gcc insists on printing out each template
-// argument, even if it has the default value (this means Templates<list>
-// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
-// errors).
-//
-// Our solution is to combine the best part of the two approaches: a
-// user would write Templates<T1, ..., TN>, and Google Test will translate
-// that to TemplatesN<T1, ..., TN> internally to make error messages
-// readable.  The translation is done by the 'type' member of the
-// Templates template.
-
-$range i 1..n
-template <$for i, [[GTEST_TEMPLATE_ T$i = NoneT]]>
-struct Templates {
-  typedef Templates$n<$for i, [[T$i]]> type;
-};
-
-template <>
-struct Templates<$for i, [[NoneT]]> {
-  typedef Templates0 type;
-};
-
-$range i 1..n-1
-$for i [[
-$range j 1..i
-$range k i+1..n
-template <$for j, [[GTEST_TEMPLATE_ T$j]]>
-struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> {
-  typedef Templates$i<$for j, [[T$j]]> type;
-};
-
-]]
-
-// The TypeList template makes it possible to use either a single type
-// or a Types<...> list in TYPED_TEST_SUITE() and
-// INSTANTIATE_TYPED_TEST_SUITE_P().
-
-template <typename T>
-struct TypeList {
-  typedef Types1<T> type;
-};
-
-
-$range i 1..n
-template <$for i, [[typename T$i]]>
-struct TypeList<Types<$for i, [[T$i]]> > {
-  typedef typename Types<$for i, [[T$i]]>::type type;
-};
-
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
-}  // namespace internal
-}  // namespace testing
-
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/deps/googletest/src/gtest-all.cc b/deps/googletest/src/gtest-all.cc
deleted file mode 100644
index ad292905c..000000000
--- a/deps/googletest/src/gtest-all.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// Google C++ Testing and Mocking Framework (Google Test)
-//
-// Sometimes it's desirable to build Google Test by compiling a single file.
-// This file serves this purpose.
-
-// This line ensures that gtest.h can be compiled on its own, even
-// when it's fused.
-#include "gtest/gtest.h"
-
-// The following lines pull in the real gtest *.cc files.
-#include "src/gtest.cc"
-#include "src/gtest-death-test.cc"
-#include "src/gtest-filepath.cc"
-#include "src/gtest-matchers.cc"
-#include "src/gtest-port.cc"
-#include "src/gtest-printers.cc"
-#include "src/gtest-test-part.cc"
-#include "src/gtest-typed-test.cc"
diff --git a/deps/googletest/src/gtest-death-test.cc b/deps/googletest/src/gtest-death-test.cc
deleted file mode 100644
index 9bfe31534..000000000
--- a/deps/googletest/src/gtest-death-test.cc
+++ /dev/null
@@ -1,1643 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// This file implements death tests.
-
-#include "gtest/gtest-death-test.h"
-
-#include <utility>
-
-#include "gtest/internal/gtest-port.h"
-#include "gtest/internal/custom/gtest.h"
-
-#if GTEST_HAS_DEATH_TEST
-
-# if GTEST_OS_MAC
-#  include <crt_externs.h>
-# endif  // GTEST_OS_MAC
-
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
-
-# if GTEST_OS_LINUX
-#  include <signal.h>
-# endif  // GTEST_OS_LINUX
-
-# include <stdarg.h>
-
-# if GTEST_OS_WINDOWS
-#  include <windows.h>
-# else
-#  include <sys/mman.h>
-#  include <sys/wait.h>
-# endif  // GTEST_OS_WINDOWS
-
-# if GTEST_OS_QNX
-#  include <spawn.h>
-# endif  // GTEST_OS_QNX
-
-# if GTEST_OS_FUCHSIA
-#  include <lib/fdio/fd.h>
-#  include <lib/fdio/io.h>
-#  include <lib/fdio/spawn.h>
-#  include <lib/zx/port.h>
-#  include <lib/zx/process.h>
-#  include <lib/zx/socket.h>
-#  include <zircon/processargs.h>
-#  include <zircon/syscalls.h>
-#  include <zircon/syscalls/policy.h>
-#  include <zircon/syscalls/port.h>
-# endif  // GTEST_OS_FUCHSIA
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-#include "gtest/gtest-message.h"
-#include "gtest/internal/gtest-string.h"
-#include "src/gtest-internal-inl.h"
-
-namespace testing {
-
-// Constants.
-
-// The default death test style.
-//
-// This is defined in internal/gtest-port.h as "fast", but can be overridden by
-// a definition in internal/custom/gtest-port.h. The recommended value, which is
-// used internally at Google, is "threadsafe".
-static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
-
-GTEST_DEFINE_string_(
-    death_test_style,
-    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
-    "Indicates how to run a death test in a forked child process: "
-    "\"threadsafe\" (child process re-executes the test binary "
-    "from the beginning, running only the specific death test) or "
-    "\"fast\" (child process runs the death test immediately "
-    "after forking).");
-
-GTEST_DEFINE_bool_(
-    death_test_use_fork,
-    internal::BoolFromGTestEnv("death_test_use_fork", false),
-    "Instructs to use fork()/_exit() instead of clone() in death tests. "
-    "Ignored and always uses fork() on POSIX systems where clone() is not "
-    "implemented. Useful when running under valgrind or similar tools if "
-    "those do not support clone(). Valgrind 3.3.1 will just fail if "
-    "it sees an unsupported combination of clone() flags. "
-    "It is not recommended to use this flag w/o valgrind though it will "
-    "work in 99% of the cases. Once valgrind is fixed, this flag will "
-    "most likely be removed.");
-
-namespace internal {
-GTEST_DEFINE_string_(
-    internal_run_death_test, "",
-    "Indicates the file, line number, temporal index of "
-    "the single death test to run, and a file descriptor to "
-    "which a success code may be sent, all separated by "
-    "the '|' characters.  This flag is specified if and only if the current "
-    "process is a sub-process launched for running a thread-safe "
-    "death test.  FOR INTERNAL USE ONLY.");
-}  // namespace internal
-
-#if GTEST_HAS_DEATH_TEST
-
-namespace internal {
-
-// Valid only for fast death tests. Indicates the code is running in the
-// child process of a fast style death test.
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
-static bool g_in_fast_death_test_child = false;
-# endif
-
-// Returns a Boolean value indicating whether the caller is currently
-// executing in the context of the death test child process.  Tools such as
-// Valgrind heap checkers may need this to modify their behavior in death
-// tests.  IMPORTANT: This is an internal utility.  Using it may break the
-// implementation of death tests.  User code MUST NOT use it.
-bool InDeathTestChild() {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
-
-  // On Windows and Fuchsia, death tests are thread-safe regardless of the value
-  // of the death_test_style flag.
-  return !GTEST_FLAG(internal_run_death_test).empty();
-
-# else
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe")
-    return !GTEST_FLAG(internal_run_death_test).empty();
-  else
-    return g_in_fast_death_test_child;
-#endif
-}
-
-}  // namespace internal
-
-// ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
-
-// ExitedWithCode function-call operator.
-bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
-
-  return exit_status == exit_code_;
-
-# else
-
-  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
-
-# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
-}
-
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
-// KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
-
-// KilledBySignal function-call operator.
-bool KilledBySignal::operator()(int exit_status) const {
-#  if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
-  {
-    bool result;
-    if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
-      return result;
-    }
-  }
-#  endif  // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
-  return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
-}
-# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
-
-namespace internal {
-
-// Utilities needed for death tests.
-
-// Generates a textual description of a given exit code, in the format
-// specified by wait(2).
-static std::string ExitSummary(int exit_code) {
-  Message m;
-
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
-
-  m << "Exited with exit status " << exit_code;
-
-# else
-
-  if (WIFEXITED(exit_code)) {
-    m << "Exited with exit status " << WEXITSTATUS(exit_code);
-  } else if (WIFSIGNALED(exit_code)) {
-    m << "Terminated by signal " << WTERMSIG(exit_code);
-  }
-#  ifdef WCOREDUMP
-  if (WCOREDUMP(exit_code)) {
-    m << " (core dumped)";
-  }
-#  endif
-# endif  // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
-
-  return m.GetString();
-}
-
-// Returns true if exit_status describes a process that was terminated
-// by a signal, or exited normally with a nonzero exit code.
-bool ExitedUnsuccessfully(int exit_status) {
-  return !ExitedWithCode(0)(exit_status);
-}
-
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
-// Generates a textual failure message when a death test finds more than
-// one thread running, or cannot determine the number of threads, prior
-// to executing the given statement.  It is the responsibility of the
-// caller not to pass a thread_count of 1.
-static std::string DeathTestThreadWarning(size_t thread_count) {
-  Message msg;
-  msg << "Death tests use fork(), which is unsafe particularly"
-      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
-  if (thread_count == 0) {
-    msg << "couldn't detect the number of threads.";
-  } else {
-    msg << "detected " << thread_count << " threads.";
-  }
-  msg << " See "
-         "https://github.com/google/googletest/blob/master/googletest/docs/"
-         "advanced.md#death-tests-and-threads"
-      << " for more explanation and suggested solutions, especially if"
-      << " this is the last message you see before your test times out.";
-  return msg.GetString();
-}
-# endif  // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
-
-// Flag characters for reporting a death test that did not die.
-static const char kDeathTestLived = 'L';
-static const char kDeathTestReturned = 'R';
-static const char kDeathTestThrew = 'T';
-static const char kDeathTestInternalError = 'I';
-
-#if GTEST_OS_FUCHSIA
-
-// File descriptor used for the pipe in the child process.
-static const int kFuchsiaReadPipeFd = 3;
-
-#endif
-
-// An enumeration describing all of the possible ways that a death test can
-// conclude.  DIED means that the process died while executing the test
-// code; LIVED means that process lived beyond the end of the test code;
-// RETURNED means that the test statement attempted to execute a return
-// statement, which is not allowed; THREW means that the test statement
-// returned control by throwing an exception.  IN_PROGRESS means the test
-// has not yet concluded.
-enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
-
-// Routine for aborting the program which is safe to call from an
-// exec-style death test child process, in which case the error
-// message is propagated back to the parent process.  Otherwise, the
-// message is simply printed to stderr.  In either case, the program
-// then exits with status 1.
-static void DeathTestAbort(const std::string& message) {
-  // On a POSIX system, this function may be called from a threadsafe-style
-  // death test child process, which operates on a very small stack.  Use
-  // the heap for any additional non-minuscule memory requirements.
-  const InternalRunDeathTestFlag* const flag =
-      GetUnitTestImpl()->internal_run_death_test_flag();
-  if (flag != nullptr) {
-    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
-    fputc(kDeathTestInternalError, parent);
-    fprintf(parent, "%s", message.c_str());
-    fflush(parent);
-    _exit(1);
-  } else {
-    fprintf(stderr, "%s", message.c_str());
-    fflush(stderr);
-    posix::Abort();
-  }
-}
-
-// A replacement for CHECK that calls DeathTestAbort if the assertion
-// fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
-  do { \
-    if (!::testing::internal::IsTrue(expression)) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ +  ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression); \
-    } \
-  } while (::testing::internal::AlwaysFalse())
-
-// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
-// evaluating any system call that fulfills two conditions: it must return
-// -1 on failure, and set errno to EINTR when it is interrupted and
-// should be tried again.  The macro expands to a loop that repeatedly
-// evaluates the expression as long as it evaluates to -1 and sets
-// errno to EINTR.  If the expression evaluates to -1 but errno is
-// something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
-  do { \
-    int gtest_retval; \
-    do { \
-      gtest_retval = (expression); \
-    } while (gtest_retval == -1 && errno == EINTR); \
-    if (gtest_retval == -1) { \
-      DeathTestAbort( \
-          ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
-          + ::testing::internal::StreamableToString(__LINE__) + ": " \
-          + #expression + " != -1"); \
-    } \
-  } while (::testing::internal::AlwaysFalse())
-
-// Returns the message describing the last system error in errno.
-std::string GetLastErrnoDescription() {
-    return errno == 0 ? "" : posix::StrError(errno);
-}
-
-// This is called from a death test parent process to read a failure
-// message from the death test child process and log it with the FATAL
-// severity. On Windows, the message is read from a pipe handle. On other
-// platforms, it is read from a file descriptor.
-static void FailFromInternalError(int fd) {
-  Message error;
-  char buffer[256];
-  int num_read;
-
-  do {
-    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
-      buffer[num_read] = '\0';
-      error << buffer;
-    }
-  } while (num_read == -1 && errno == EINTR);
-
-  if (num_read == 0) {
-    GTEST_LOG_(FATAL) << error.GetString();
-  } else {
-    const int last_error = errno;
-    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
-                      << GetLastErrnoDescription() << " [" << last_error << "]";
-  }
-}
-
-// Death test constructor.  Increments the running death test count
-// for the current test.
-DeathTest::DeathTest() {
-  TestInfo* const info = GetUnitTestImpl()->current_test_info();
-  if (info == nullptr) {
-    DeathTestAbort("Cannot run a death test outside of a TEST or "
-                   "TEST_F construct");
-  }
-}
-
-// Creates and returns a death test by dispatching to the current
-// death test factory.
-bool DeathTest::Create(const char* statement,
-                       Matcher<const std::string&> matcher, const char* file,
-                       int line, DeathTest** test) {
-  return GetUnitTestImpl()->death_test_factory()->Create(
-      statement, std::move(matcher), file, line, test);
-}
-
-const char* DeathTest::LastMessage() {
-  return last_death_test_message_.c_str();
-}
-
-void DeathTest::set_last_death_test_message(const std::string& message) {
-  last_death_test_message_ = message;
-}
-
-std::string DeathTest::last_death_test_message_;
-
-// Provides cross platform implementation for some death functionality.
-class DeathTestImpl : public DeathTest {
- protected:
-  DeathTestImpl(const char* a_statement, Matcher<const std::string&> matcher)
-      : statement_(a_statement),
-        matcher_(std::move(matcher)),
-        spawned_(false),
-        status_(-1),
-        outcome_(IN_PROGRESS),
-        read_fd_(-1),
-        write_fd_(-1) {}
-
-  // read_fd_ is expected to be closed and cleared by a derived class.
-  ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
-
-  void Abort(AbortReason reason) override;
-  bool Passed(bool status_ok) override;
-
-  const char* statement() const { return statement_; }
-  bool spawned() const { return spawned_; }
-  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
-  int status() const { return status_; }
-  void set_status(int a_status) { status_ = a_status; }
-  DeathTestOutcome outcome() const { return outcome_; }
-  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
-  int read_fd() const { return read_fd_; }
-  void set_read_fd(int fd) { read_fd_ = fd; }
-  int write_fd() const { return write_fd_; }
-  void set_write_fd(int fd) { write_fd_ = fd; }
-
-  // Called in the parent process only. Reads the result code of the death
-  // test child process via a pipe, interprets it to set the outcome_
-  // member, and closes read_fd_.  Outputs diagnostics and terminates in
-  // case of unexpected codes.
-  void ReadAndInterpretStatusByte();
-
-  // Returns stderr output from the child process.
-  virtual std::string GetErrorLogs();
-
- private:
-  // The textual content of the code this object is testing.  This class
-  // doesn't own this string and should not attempt to delete it.
-  const char* const statement_;
-  // A matcher that's expected to match the stderr output by the child process.
-  Matcher<const std::string&> matcher_;
-  // True if the death test child process has been successfully spawned.
-  bool spawned_;
-  // The exit status of the child process.
-  int status_;
-  // How the death test concluded.
-  DeathTestOutcome outcome_;
-  // Descriptor to the read end of the pipe to the child process.  It is
-  // always -1 in the child process.  The child keeps its write end of the
-  // pipe in write_fd_.
-  int read_fd_;
-  // Descriptor to the child's write end of the pipe to the parent process.
-  // It is always -1 in the parent process.  The parent keeps its end of the
-  // pipe in read_fd_.
-  int write_fd_;
-};
-
-// Called in the parent process only. Reads the result code of the death
-// test child process via a pipe, interprets it to set the outcome_
-// member, and closes read_fd_.  Outputs diagnostics and terminates in
-// case of unexpected codes.
-void DeathTestImpl::ReadAndInterpretStatusByte() {
-  char flag;
-  int bytes_read;
-
-  // The read() here blocks until data is available (signifying the
-  // failure of the death test) or until the pipe is closed (signifying
-  // its success), so it's okay to call this in the parent before
-  // the child process has exited.
-  do {
-    bytes_read = posix::Read(read_fd(), &flag, 1);
-  } while (bytes_read == -1 && errno == EINTR);
-
-  if (bytes_read == 0) {
-    set_outcome(DIED);
-  } else if (bytes_read == 1) {
-    switch (flag) {
-      case kDeathTestReturned:
-        set_outcome(RETURNED);
-        break;
-      case kDeathTestThrew:
-        set_outcome(THREW);
-        break;
-      case kDeathTestLived:
-        set_outcome(LIVED);
-        break;
-      case kDeathTestInternalError:
-        FailFromInternalError(read_fd());  // Does not return.
-        break;
-      default:
-        GTEST_LOG_(FATAL) << "Death test child process reported "
-                          << "unexpected status byte ("
-                          << static_cast<unsigned int>(flag) << ")";
-    }
-  } else {
-    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
-                      << GetLastErrnoDescription();
-  }
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
-  set_read_fd(-1);
-}
-
-std::string DeathTestImpl::GetErrorLogs() {
-  return GetCapturedStderr();
-}
-
-// Signals that the death test code which should have exited, didn't.
-// Should be called only in a death test child process.
-// Writes a status byte to the child's status file descriptor, then
-// calls _exit(1).
-void DeathTestImpl::Abort(AbortReason reason) {
-  // The parent process considers the death test to be a failure if
-  // it finds any data in our pipe.  So, here we write a single flag byte
-  // to the pipe, then exit.
-  const char status_ch =
-      reason == TEST_DID_NOT_DIE ? kDeathTestLived :
-      reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
-
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
-  // We are leaking the descriptor here because on some platforms (i.e.,
-  // when built as Windows DLL), destructors of global objects will still
-  // run after calling _exit(). On such systems, write_fd_ will be
-  // indirectly closed from the destructor of UnitTestImpl, causing double
-  // close if it is also closed here. On debug configurations, double close
-  // may assert. As there are no in-process buffers to flush here, we are
-  // relying on the OS to close the descriptor after the process terminates
-  // when the destructors are not run.
-  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
-}
-
-// Returns an indented copy of stderr output for a death test.
-// This makes distinguishing death test output lines from regular log lines
-// much easier.
-static ::std::string FormatDeathTestOutput(const ::std::string& output) {
-  ::std::string ret;
-  for (size_t at = 0; ; ) {
-    const size_t line_end = output.find('\n', at);
-    ret += "[  DEATH   ] ";
-    if (line_end == ::std::string::npos) {
-      ret += output.substr(at);
-      break;
-    }
-    ret += output.substr(at, line_end + 1 - at);
-    at = line_end + 1;
-  }
-  return ret;
-}
-
-// Assesses the success or failure of a death test, using both private
-// members which have previously been set, and one argument:
-//
-// Private data members:
-//   outcome:  An enumeration describing how the death test
-//             concluded: DIED, LIVED, THREW, or RETURNED.  The death test
-//             fails in the latter three cases.
-//   status:   The exit status of the child process. On *nix, it is in the
-//             in the format specified by wait(2). On Windows, this is the
-//             value supplied to the ExitProcess() API or a numeric code
-//             of the exception that terminated the program.
-//   matcher_: A matcher that's expected to match the stderr output by the child
-//             process.
-//
-// Argument:
-//   status_ok: true if exit_status is acceptable in the context of
-//              this particular death test, which fails if it is false
-//
-// Returns true iff all of the above conditions are met.  Otherwise, the
-// first failing condition, in the order given above, is the one that is
-// reported. Also sets the last death test message string.
-bool DeathTestImpl::Passed(bool status_ok) {
-  if (!spawned())
-    return false;
-
-  const std::string error_message = GetErrorLogs();
-
-  bool success = false;
-  Message buffer;
-
-  buffer << "Death test: " << statement() << "\n";
-  switch (outcome()) {
-    case LIVED:
-      buffer << "    Result: failed to die.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case THREW:
-      buffer << "    Result: threw an exception.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case RETURNED:
-      buffer << "    Result: illegal return in test statement.\n"
-             << " Error msg:\n" << FormatDeathTestOutput(error_message);
-      break;
-    case DIED:
-      if (status_ok) {
-        if (matcher_.Matches(error_message)) {
-          success = true;
-        } else {
-          std::ostringstream stream;
-          matcher_.DescribeTo(&stream);
-          buffer << "    Result: died but not with expected error.\n"
-                 << "  Expected: " << stream.str() << "\n"
-                 << "Actual msg:\n"
-                 << FormatDeathTestOutput(error_message);
-        }
-      } else {
-        buffer << "    Result: died but not with expected exit code:\n"
-               << "            " << ExitSummary(status()) << "\n"
-               << "Actual msg:\n" << FormatDeathTestOutput(error_message);
-      }
-      break;
-    case IN_PROGRESS:
-    default:
-      GTEST_LOG_(FATAL)
-          << "DeathTest::Passed somehow called before conclusion of test";
-  }
-
-  DeathTest::set_last_death_test_message(buffer.GetString());
-  return success;
-}
-
-# if GTEST_OS_WINDOWS
-// WindowsDeathTest implements death tests on Windows. Due to the
-// specifics of starting new processes on Windows, death tests there are
-// always threadsafe, and Google Test considers the
-// --gtest_death_test_style=fast setting to be equivalent to
-// --gtest_death_test_style=threadsafe there.
-//
-// A few implementation notes:  Like the Linux version, the Windows
-// implementation uses pipes for child-to-parent communication. But due to
-// the specifics of pipes on Windows, some extra steps are required:
-//
-// 1. The parent creates a communication pipe and stores handles to both
-//    ends of it.
-// 2. The parent starts the child and provides it with the information
-//    necessary to acquire the handle to the write end of the pipe.
-// 3. The child acquires the write end of the pipe and signals the parent
-//    using a Windows event.
-// 4. Now the parent can release the write end of the pipe on its side. If
-//    this is done before step 3, the object's reference count goes down to
-//    0 and it is destroyed, preventing the child from acquiring it. The
-//    parent now has to release it, or read operations on the read end of
-//    the pipe will not return when the child terminates.
-// 5. The parent reads child's output through the pipe (outcome code and
-//    any possible error messages) from the pipe, and its stderr and then
-//    determines whether to fail the test.
-//
-// Note: to distinguish Win32 API calls from the local method and function
-// calls, the former are explicitly resolved in the global namespace.
-//
-class WindowsDeathTest : public DeathTestImpl {
- public:
-  WindowsDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
-                   const char* file, int line)
-      : DeathTestImpl(a_statement, std::move(matcher)),
-        file_(file),
-        line_(line) {}
-
-  // All of these virtual functions are inherited from DeathTest.
-  virtual int Wait();
-  virtual TestRole AssumeRole();
-
- private:
-  // The name of the file in which the death test is located.
-  const char* const file_;
-  // The line number on which the death test is located.
-  const int line_;
-  // Handle to the write end of the pipe to the child process.
-  AutoHandle write_handle_;
-  // Child process handle.
-  AutoHandle child_handle_;
-  // Event the child process uses to signal the parent that it has
-  // acquired the handle to the write end of the pipe. After seeing this
-  // event the parent can release its own handles to make sure its
-  // ReadFile() calls return when the child terminates.
-  AutoHandle event_handle_;
-};
-
-// Waits for the child in a death test to exit, returning its exit
-// status, or 0 if no child process exists.  As a side effect, sets the
-// outcome data member.
-int WindowsDeathTest::Wait() {
-  if (!spawned())
-    return 0;
-
-  // Wait until the child either signals that it has acquired the write end
-  // of the pipe or it dies.
-  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
-  switch (::WaitForMultipleObjects(2,
-                                   wait_handles,
-                                   FALSE,  // Waits for any of the handles.
-                                   INFINITE)) {
-    case WAIT_OBJECT_0:
-    case WAIT_OBJECT_0 + 1:
-      break;
-    default:
-      GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
-  }
-
-  // The child has acquired the write end of the pipe or exited.
-  // We release the handle on our side and continue.
-  write_handle_.Reset();
-  event_handle_.Reset();
-
-  ReadAndInterpretStatusByte();
-
-  // Waits for the child process to exit if it haven't already. This
-  // returns immediately if the child has already exited, regardless of
-  // whether previous calls to WaitForMultipleObjects synchronized on this
-  // handle or not.
-  GTEST_DEATH_TEST_CHECK_(
-      WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
-                                             INFINITE));
-  DWORD status_code;
-  GTEST_DEATH_TEST_CHECK_(
-      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
-  child_handle_.Reset();
-  set_status(static_cast<int>(status_code));
-  return status();
-}
-
-// The AssumeRole process for a Windows death test.  It creates a child
-// process with the same executable as the current process to run the
-// death test.  The child process is given the --gtest_filter and
-// --gtest_internal_run_death_test flags such that it knows to run the
-// current death test only.
-DeathTest::TestRole WindowsDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
-  const int death_test_index = info->result()->death_test_count();
-
-  if (flag != nullptr) {
-    // ParseInternalRunDeathTestFlag() has performed all the necessary
-    // processing.
-    set_write_fd(flag->write_fd());
-    return EXECUTE_TEST;
-  }
-
-  // WindowsDeathTest uses an anonymous pipe to communicate results of
-  // a death test.
-  SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
-                                                 nullptr, TRUE};
-  HANDLE read_handle, write_handle;
-  GTEST_DEATH_TEST_CHECK_(
-      ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
-                   0)  // Default buffer size.
-      != FALSE);
-  set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
-                                O_RDONLY));
-  write_handle_.Reset(write_handle);
-  event_handle_.Reset(::CreateEvent(
-      &handles_are_inheritable,
-      TRUE,       // The event will automatically reset to non-signaled state.
-      FALSE,      // The initial state is non-signalled.
-      nullptr));  // The even is unnamed.
-  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
-  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                  kFilterFlag + "=" + info->test_suite_name() +
-                                  "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
-      "=" + file_ + "|" + StreamableToString(line_) + "|" +
-      StreamableToString(death_test_index) + "|" +
-      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
-      // size_t has the same width as pointers on both 32-bit and 64-bit
-      // Windows platforms.
-      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
-      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
-      "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
-
-  char executable_path[_MAX_PATH + 1];  // NOLINT
-  GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
-                                                                executable_path,
-                                                                _MAX_PATH));
-
-  std::string command_line =
-      std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
-      internal_flag + "\"";
-
-  DeathTest::set_last_death_test_message("");
-
-  CaptureStderr();
-  // Flush the log buffers since the log streams are shared with the child.
-  FlushInfoLog();
-
-  // The child process will share the standard handles with the parent.
-  STARTUPINFOA startup_info;
-  memset(&startup_info, 0, sizeof(STARTUPINFO));
-  startup_info.dwFlags = STARTF_USESTDHANDLES;
-  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
-  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
-  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
-
-  PROCESS_INFORMATION process_info;
-  GTEST_DEATH_TEST_CHECK_(
-      ::CreateProcessA(
-          executable_path, const_cast<char*>(command_line.c_str()),
-          nullptr,  // Retuned process handle is not inheritable.
-          nullptr,  // Retuned thread handle is not inheritable.
-          TRUE,  // Child inherits all inheritable handles (for write_handle_).
-          0x0,   // Default creation flags.
-          nullptr,  // Inherit the parent's environment.
-          UnitTest::GetInstance()->original_working_dir(), &startup_info,
-          &process_info) != FALSE);
-  child_handle_.Reset(process_info.hProcess);
-  ::CloseHandle(process_info.hThread);
-  set_spawned(true);
-  return OVERSEE_TEST;
-}
-
-# elif GTEST_OS_FUCHSIA
-
-class FuchsiaDeathTest : public DeathTestImpl {
- public:
-  FuchsiaDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
-                   const char* file, int line)
-      : DeathTestImpl(a_statement, std::move(matcher)),
-        file_(file),
-        line_(line) {}
-
-  // All of these virtual functions are inherited from DeathTest.
-  int Wait() override;
-  TestRole AssumeRole() override;
-  std::string GetErrorLogs() override;
-
- private:
-  // The name of the file in which the death test is located.
-  const char* const file_;
-  // The line number on which the death test is located.
-  const int line_;
-  // The stderr data captured by the child process.
-  std::string captured_stderr_;
-
-  zx::process child_process_;
-  zx::port port_;
-  zx::socket stderr_socket_;
-};
-
-// Utility class for accumulating command-line arguments.
-class Arguments {
- public:
-  Arguments() { args_.push_back(nullptr); }
-
-  ~Arguments() {
-    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
-         ++i) {
-      free(*i);
-    }
-  }
-  void AddArgument(const char* argument) {
-    args_.insert(args_.end() - 1, posix::StrDup(argument));
-  }
-
-  template <typename Str>
-  void AddArguments(const ::std::vector<Str>& arguments) {
-    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end();
-         ++i) {
-      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
-    }
-  }
-  char* const* Argv() {
-    return &args_[0];
-  }
-
-  int size() {
-    return args_.size() - 1;
-  }
-
- private:
-  std::vector<char*> args_;
-};
-
-// Waits for the child in a death test to exit, returning its exit
-// status, or 0 if no child process exists.  As a side effect, sets the
-// outcome data member.
-int FuchsiaDeathTest::Wait() {
-  const int kProcessKey = 0;
-  const int kSocketKey = 1;
-
-  if (!spawned())
-    return 0;
-
-  // Register to wait for the child process to terminate.
-  zx_status_t status_zx;
-  status_zx = child_process_.wait_async(
-      port_, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
-  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
-  // Register to wait for the socket to be readable or closed.
-  status_zx = stderr_socket_.wait_async(
-      port_, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-      ZX_WAIT_ASYNC_ONCE);
-  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
-
-  bool process_terminated = false;
-  bool socket_closed = false;
-  do {
-    zx_port_packet_t packet = {};
-    status_zx = port_.wait(zx::time::infinite(), &packet);
-    GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
-
-    if (packet.key == kProcessKey) {
-      if (ZX_PKT_IS_EXCEPTION(packet.type)) {
-        // Process encountered an exception. Kill it directly rather than
-        // letting other handlers process the event. We will get a second
-        // kProcessKey event when the process actually terminates.
-        status_zx = child_process_.kill();
-        GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
-      } else {
-        // Process terminated.
-        GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
-        GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED);
-        process_terminated = true;
-      }
-    } else if (packet.key == kSocketKey) {
-      GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type));
-      if (packet.signal.observed & ZX_SOCKET_READABLE) {
-        // Read data from the socket.
-        constexpr size_t kBufferSize = 1024;
-        do {
-          size_t old_length = captured_stderr_.length();
-          size_t bytes_read = 0;
-          captured_stderr_.resize(old_length + kBufferSize);
-          status_zx = stderr_socket_.read(
-              0, &captured_stderr_.front() + old_length, kBufferSize,
-              &bytes_read);
-          captured_stderr_.resize(old_length + bytes_read);
-        } while (status_zx == ZX_OK);
-        if (status_zx == ZX_ERR_PEER_CLOSED) {
-          socket_closed = true;
-        } else {
-          GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);
-          status_zx = stderr_socket_.wait_async(
-              port_, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-              ZX_WAIT_ASYNC_ONCE);
-          GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
-        }
-      } else {
-        GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED);
-        socket_closed = true;
-      }
-    }
-  } while (!process_terminated && !socket_closed);
-
-  ReadAndInterpretStatusByte();
-
-  zx_info_process_t buffer;
-  status_zx = child_process_.get_info(
-      ZX_INFO_PROCESS, &buffer, sizeof(buffer), nullptr, nullptr);
-  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
-
-  GTEST_DEATH_TEST_CHECK_(buffer.exited);
-  set_status(buffer.return_code);
-  return status();
-}
-
-// The AssumeRole process for a Fuchsia death test.  It creates a child
-// process with the same executable as the current process to run the
-// death test.  The child process is given the --gtest_filter and
-// --gtest_internal_run_death_test flags such that it knows to run the
-// current death test only.
-DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
-  const int death_test_index = info->result()->death_test_count();
-
-  if (flag != nullptr) {
-    // ParseInternalRunDeathTestFlag() has performed all the necessary
-    // processing.
-    set_write_fd(kFuchsiaReadPipeFd);
-    return EXECUTE_TEST;
-  }
-
-  // Flush the log buffers since the log streams are shared with the child.
-  FlushInfoLog();
-
-  // Build the child process command line.
-  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                  kFilterFlag + "=" + info->test_suite_name() +
-                                  "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
-      + file_ + "|"
-      + StreamableToString(line_) + "|"
-      + StreamableToString(death_test_index);
-  Arguments args;
-  args.AddArguments(GetInjectableArgvs());
-  args.AddArgument(filter_flag.c_str());
-  args.AddArgument(internal_flag.c_str());
-
-  // Build the pipe for communication with the child.
-  zx_status_t status;
-  zx_handle_t child_pipe_handle;
-  int child_pipe_fd;
-  status = fdio_pipe_half2(&child_pipe_fd, &child_pipe_handle);
-  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
-  set_read_fd(child_pipe_fd);
-
-  // Set the pipe handle for the child.
-  fdio_spawn_action_t spawn_actions[2] = {};
-  fdio_spawn_action_t* add_handle_action = &spawn_actions[0];
-  add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE;
-  add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd);
-  add_handle_action->h.handle = child_pipe_handle;
-
-  // Create a socket pair will be used to receive the child process' stderr.
-  zx::socket stderr_producer_socket;
-  status =
-      zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
-  GTEST_DEATH_TEST_CHECK_(status >= 0);
-  int stderr_producer_fd = -1;
-  status =
-      fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd);
-  GTEST_DEATH_TEST_CHECK_(status >= 0);
-
-  // Make the stderr socket nonblocking.
-  GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0);
-
-  fdio_spawn_action_t* add_stderr_action = &spawn_actions[1];
-  add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD;
-  add_stderr_action->fd.local_fd = stderr_producer_fd;
-  add_stderr_action->fd.target_fd = STDERR_FILENO;
-
-  // Create a child job.
-  zx_handle_t child_job = ZX_HANDLE_INVALID;
-  status = zx_job_create(zx_job_default(), 0, & child_job);
-  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
-  zx_policy_basic_t policy;
-  policy.condition = ZX_POL_NEW_ANY;
-  policy.policy = ZX_POL_ACTION_ALLOW;
-  status = zx_job_set_policy(
-      child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1);
-  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
-
-  // Create an exception port and attach it to the |child_job|, to allow
-  // us to suppress the system default exception handler from firing.
-  status = zx::port::create(0, &port_);
-  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
-  status = zx_task_bind_exception_port(
-      child_job, port_.get(), 0 /* key */, 0 /*options */);
-  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
-
-  // Spawn the child process.
-  status = fdio_spawn_etc(
-      child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr,
-      2, spawn_actions, child_process_.reset_and_get_address(), nullptr);
-  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
-
-  set_spawned(true);
-  return OVERSEE_TEST;
-}
-
-std::string FuchsiaDeathTest::GetErrorLogs() {
-  return captured_stderr_;
-}
-
-#else  // We are neither on Windows, nor on Fuchsia.
-
-// ForkingDeathTest provides implementations for most of the abstract
-// methods of the DeathTest interface.  Only the AssumeRole method is
-// left undefined.
-class ForkingDeathTest : public DeathTestImpl {
- public:
-  ForkingDeathTest(const char* statement, Matcher<const std::string&> matcher);
-
-  // All of these virtual functions are inherited from DeathTest.
-  int Wait() override;
-
- protected:
-  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
-
- private:
-  // PID of child process during death test; 0 in the child process itself.
-  pid_t child_pid_;
-};
-
-// Constructs a ForkingDeathTest.
-ForkingDeathTest::ForkingDeathTest(const char* a_statement,
-                                   Matcher<const std::string&> matcher)
-    : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {}
-
-// Waits for the child in a death test to exit, returning its exit
-// status, or 0 if no child process exists.  As a side effect, sets the
-// outcome data member.
-int ForkingDeathTest::Wait() {
-  if (!spawned())
-    return 0;
-
-  ReadAndInterpretStatusByte();
-
-  int status_value;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
-  set_status(status_value);
-  return status_value;
-}
-
-// A concrete death test class that forks, then immediately runs the test
-// in the child process.
-class NoExecDeathTest : public ForkingDeathTest {
- public:
-  NoExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher)
-      : ForkingDeathTest(a_statement, std::move(matcher)) {}
-  TestRole AssumeRole() override;
-};
-
-// The AssumeRole process for a fork-and-run death test.  It implements a
-// straightforward fork, with a simple pipe to transmit the status byte.
-DeathTest::TestRole NoExecDeathTest::AssumeRole() {
-  const size_t thread_count = GetThreadCount();
-  if (thread_count != 1) {
-    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
-  }
-
-  int pipe_fd[2];
-  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
-
-  DeathTest::set_last_death_test_message("");
-  CaptureStderr();
-  // When we fork the process below, the log file buffers are copied, but the
-  // file descriptors are shared.  We flush all log files here so that closing
-  // the file descriptors in the child process doesn't throw off the
-  // synchronization between descriptors and buffers in the parent process.
-  // This is as close to the fork as possible to avoid a race condition in case
-  // there are multiple threads running before the death test, and another
-  // thread writes to the log file.
-  FlushInfoLog();
-
-  const pid_t child_pid = fork();
-  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
-  set_child_pid(child_pid);
-  if (child_pid == 0) {
-    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
-    set_write_fd(pipe_fd[1]);
-    // Redirects all logging to stderr in the child process to prevent
-    // concurrent writes to the log files.  We capture stderr in the parent
-    // process and append the child process' output to a log.
-    LogToStderr();
-    // Event forwarding to the listeners of event listener API mush be shut
-    // down in death test subprocesses.
-    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
-    g_in_fast_death_test_child = true;
-    return EXECUTE_TEST;
-  } else {
-    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
-    set_read_fd(pipe_fd[0]);
-    set_spawned(true);
-    return OVERSEE_TEST;
-  }
-}
-
-// A concrete death test class that forks and re-executes the main
-// program from the beginning, with command-line flags set that cause
-// only this specific death test to be run.
-class ExecDeathTest : public ForkingDeathTest {
- public:
-  ExecDeathTest(const char* a_statement, Matcher<const std::string&> matcher,
-                const char* file, int line)
-      : ForkingDeathTest(a_statement, std::move(matcher)),
-        file_(file),
-        line_(line) {}
-  TestRole AssumeRole() override;
-
- private:
-  static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
-    ::std::vector<std::string> args = GetInjectableArgvs();
-#  if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
-    ::std::vector<std::string> extra_args =
-        GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
-    args.insert(args.end(), extra_args.begin(), extra_args.end());
-#  endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
-    return args;
-  }
-  // The name of the file in which the death test is located.
-  const char* const file_;
-  // The line number on which the death test is located.
-  const int line_;
-};
-
-// Utility class for accumulating command-line arguments.
-class Arguments {
- public:
-  Arguments() { args_.push_back(nullptr); }
-
-  ~Arguments() {
-    for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
-         ++i) {
-      free(*i);
-    }
-  }
-  void AddArgument(const char* argument) {
-    args_.insert(args_.end() - 1, posix::StrDup(argument));
-  }
-
-  template <typename Str>
-  void AddArguments(const ::std::vector<Str>& arguments) {
-    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
-         i != arguments.end();
-         ++i) {
-      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
-    }
-  }
-  char* const* Argv() {
-    return &args_[0];
-  }
-
- private:
-  std::vector<char*> args_;
-};
-
-// A struct that encompasses the arguments to the child process of a
-// threadsafe-style death test process.
-struct ExecDeathTestArgs {
-  char* const* argv;  // Command-line arguments for the child's call to exec
-  int close_fd;       // File descriptor to close; the read end of a pipe
-};
-
-#  if GTEST_OS_MAC
-inline char** GetEnviron() {
-  // When Google Test is built as a framework on MacOS X, the environ variable
-  // is unavailable. Apple's documentation (man environ) recommends using
-  // _NSGetEnviron() instead.
-  return *_NSGetEnviron();
-}
-#  else
-// Some POSIX platforms expect you to declare environ. extern "C" makes
-// it reside in the global namespace.
-extern "C" char** environ;
-inline char** GetEnviron() { return environ; }
-#  endif  // GTEST_OS_MAC
-
-#  if !GTEST_OS_QNX
-// The main function for a threadsafe-style death test child process.
-// This function is called in a clone()-ed process and thus must avoid
-// any potentially unsafe operations like malloc or libc functions.
-static int ExecDeathTestChildMain(void* child_arg) {
-  ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
-
-  // We need to execute the test program in the same environment where
-  // it was originally invoked.  Therefore we change to the original
-  // working directory first.
-  const char* const original_dir =
-      UnitTest::GetInstance()->original_working_dir();
-  // We can safely call chdir() as it's a direct system call.
-  if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
-    return EXIT_FAILURE;
-  }
-
-  // We can safely call execve() as it's a direct system call.  We
-  // cannot use execvp() as it's a libc function and thus potentially
-  // unsafe.  Since execve() doesn't search the PATH, the user must
-  // invoke the test program via a valid path that contains at least
-  // one path separator.
-  execve(args->argv[0], args->argv, GetEnviron());
-  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
-                 original_dir + " failed: " +
-                 GetLastErrnoDescription());
-  return EXIT_FAILURE;
-}
-#  endif  // !GTEST_OS_QNX
-
-#  if GTEST_HAS_CLONE
-// Two utility routines that together determine the direction the stack
-// grows.
-// This could be accomplished more elegantly by a single recursive
-// function, but we want to guard against the unlikely possibility of
-// a smart compiler optimizing the recursion away.
-//
-// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
-// StackLowerThanAddress into StackGrowsDown, which then doesn't give
-// correct answer.
-static void StackLowerThanAddress(const void* ptr,
-                                  bool* result) GTEST_NO_INLINE_;
-// HWAddressSanitizer add a random tag to the MSB of the local variable address,
-// making comparison result unpredictable.
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-static void StackLowerThanAddress(const void* ptr, bool* result) {
-  int dummy;
-  *result = (&dummy < ptr);
-}
-
-// Make sure AddressSanitizer does not tamper with the stack here.
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-static bool StackGrowsDown() {
-  int dummy;
-  bool result;
-  StackLowerThanAddress(&dummy, &result);
-  return result;
-}
-#  endif  // GTEST_HAS_CLONE
-
-// Spawns a child process with the same executable as the current process in
-// a thread-safe manner and instructs it to run the death test.  The
-// implementation uses fork(2) + exec.  On systems where clone(2) is
-// available, it is used instead, being slightly more thread-safe.  On QNX,
-// fork supports only single-threaded environments, so this function uses
-// spawn(2) there instead.  The function dies with an error message if
-// anything goes wrong.
-static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
-  ExecDeathTestArgs args = { argv, close_fd };
-  pid_t child_pid = -1;
-
-#  if GTEST_OS_QNX
-  // Obtains the current directory and sets it to be closed in the child
-  // process.
-  const int cwd_fd = open(".", O_RDONLY);
-  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
-  // We need to execute the test program in the same environment where
-  // it was originally invoked.  Therefore we change to the original
-  // working directory first.
-  const char* const original_dir =
-      UnitTest::GetInstance()->original_working_dir();
-  // We can safely call chdir() as it's a direct system call.
-  if (chdir(original_dir) != 0) {
-    DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
-                   GetLastErrnoDescription());
-    return EXIT_FAILURE;
-  }
-
-  int fd_flags;
-  // Set close_fd to be closed after spawn.
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
-                                        fd_flags | FD_CLOEXEC));
-  struct inheritance inherit = {0};
-  // spawn is a system call.
-  child_pid =
-      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
-  // Restores the current working directory.
-  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
-
-#  else   // GTEST_OS_QNX
-#   if GTEST_OS_LINUX
-  // When a SIGPROF signal is received while fork() or clone() are executing,
-  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
-  // it after the call to fork()/clone() is complete.
-  struct sigaction saved_sigprof_action;
-  struct sigaction ignore_sigprof_action;
-  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
-  sigemptyset(&ignore_sigprof_action.sa_mask);
-  ignore_sigprof_action.sa_handler = SIG_IGN;
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
-      SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-#   endif  // GTEST_OS_LINUX
-
-#   if GTEST_HAS_CLONE
-  const bool use_fork = GTEST_FLAG(death_test_use_fork);
-
-  if (!use_fork) {
-    static const bool stack_grows_down = StackGrowsDown();
-    const auto stack_size = static_cast<size_t>(getpagesize());
-    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
-    void* const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
-                             MAP_ANON | MAP_PRIVATE, -1, 0);
-    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
-
-    // Maximum stack alignment in bytes:  For a downward-growing stack, this
-    // amount is subtracted from size of the stack space to get an address
-    // that is within the stack space and is aligned on all systems we care
-    // about.  As far as I know there is no ABI with stack alignment greater
-    // than 64.  We assume stack and stack_size already have alignment of
-    // kMaxStackAlignment.
-    const size_t kMaxStackAlignment = 64;
-    void* const stack_top =
-        static_cast<char*>(stack) +
-            (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
-    GTEST_DEATH_TEST_CHECK_(
-        static_cast<size_t>(stack_size) > kMaxStackAlignment &&
-        reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
-
-    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
-
-    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
-  }
-#   else
-  const bool use_fork = true;
-#   endif  // GTEST_HAS_CLONE
-
-  if (use_fork && (child_pid = fork()) == 0) {
-      ExecDeathTestChildMain(&args);
-      _exit(0);
-  }
-#  endif  // GTEST_OS_QNX
-#  if GTEST_OS_LINUX
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(
-      sigaction(SIGPROF, &saved_sigprof_action, nullptr));
-#  endif  // GTEST_OS_LINUX
-
-  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
-  return child_pid;
-}
-
-// The AssumeRole process for a fork-and-exec death test.  It re-executes the
-// main program from the beginning, setting the --gtest_filter
-// and --gtest_internal_run_death_test flags to cause only the current
-// death test to be re-run.
-DeathTest::TestRole ExecDeathTest::AssumeRole() {
-  const UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const TestInfo* const info = impl->current_test_info();
-  const int death_test_index = info->result()->death_test_count();
-
-  if (flag != nullptr) {
-    set_write_fd(flag->write_fd());
-    return EXECUTE_TEST;
-  }
-
-  int pipe_fd[2];
-  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
-  // Clear the close-on-exec flag on the write end of the pipe, lest
-  // it be closed when the child process does an exec:
-  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
-
-  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
-                                  kFilterFlag + "=" + info->test_suite_name() +
-                                  "." + info->name();
-  const std::string internal_flag =
-      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
-      + file_ + "|" + StreamableToString(line_) + "|"
-      + StreamableToString(death_test_index) + "|"
-      + StreamableToString(pipe_fd[1]);
-  Arguments args;
-  args.AddArguments(GetArgvsForDeathTestChildProcess());
-  args.AddArgument(filter_flag.c_str());
-  args.AddArgument(internal_flag.c_str());
-
-  DeathTest::set_last_death_test_message("");
-
-  CaptureStderr();
-  // See the comment in NoExecDeathTest::AssumeRole for why the next line
-  // is necessary.
-  FlushInfoLog();
-
-  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
-  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
-  set_child_pid(child_pid);
-  set_read_fd(pipe_fd[0]);
-  set_spawned(true);
-  return OVERSEE_TEST;
-}
-
-# endif  // !GTEST_OS_WINDOWS
-
-// Creates a concrete DeathTest-derived class that depends on the
-// --gtest_death_test_style flag, and sets the pointer pointed to
-// by the "test" argument to its address.  If the test should be
-// skipped, sets that pointer to NULL.  Returns true, unless the
-// flag is set to an invalid value.
-bool DefaultDeathTestFactory::Create(const char* statement,
-                                     Matcher<const std::string&> matcher,
-                                     const char* file, int line,
-                                     DeathTest** test) {
-  UnitTestImpl* const impl = GetUnitTestImpl();
-  const InternalRunDeathTestFlag* const flag =
-      impl->internal_run_death_test_flag();
-  const int death_test_index = impl->current_test_info()
-      ->increment_death_test_count();
-
-  if (flag != nullptr) {
-    if (death_test_index > flag->index()) {
-      DeathTest::set_last_death_test_message(
-          "Death test count (" + StreamableToString(death_test_index)
-          + ") somehow exceeded expected maximum ("
-          + StreamableToString(flag->index()) + ")");
-      return false;
-    }
-
-    if (!(flag->file() == file && flag->line() == line &&
-          flag->index() == death_test_index)) {
-      *test = nullptr;
-      return true;
-    }
-  }
-
-# if GTEST_OS_WINDOWS
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
-      GTEST_FLAG(death_test_style) == "fast") {
-    *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
-  }
-
-# elif GTEST_OS_FUCHSIA
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
-      GTEST_FLAG(death_test_style) == "fast") {
-    *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
-  }
-
-# else
-
-  if (GTEST_FLAG(death_test_style) == "threadsafe") {
-    *test = new ExecDeathTest(statement, std::move(matcher), file, line);
-  } else if (GTEST_FLAG(death_test_style) == "fast") {
-    *test = new NoExecDeathTest(statement, std::move(matcher));
-  }
-
-# endif  // GTEST_OS_WINDOWS
-
-  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
-    DeathTest::set_last_death_test_message(
-        "Unknown death test style \"" + GTEST_FLAG(death_test_style)
-        + "\" encountered");
-    return false;
-  }
-
-  return true;
-}
-
-# if GTEST_OS_WINDOWS
-// Recreates the pipe and event handles from the provided parameters,
-// signals the event, and returns a file descriptor wrapped around the pipe
-// handle. This function is called in the child process only.
-static int GetStatusFileDescriptor(unsigned int parent_process_id,
-                            size_t write_handle_as_size_t,
-                            size_t event_handle_as_size_t) {
-  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
-                                                   FALSE,  // Non-inheritable.
-                                                   parent_process_id));
-  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
-    DeathTestAbort("Unable to open parent process " +
-                   StreamableToString(parent_process_id));
-  }
-
-  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
-
-  const HANDLE write_handle =
-      reinterpret_cast<HANDLE>(write_handle_as_size_t);
-  HANDLE dup_write_handle;
-
-  // The newly initialized handle is accessible only in the parent
-  // process. To obtain one accessible within the child, we need to use
-  // DuplicateHandle.
-  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
-                         ::GetCurrentProcess(), &dup_write_handle,
-                         0x0,    // Requested privileges ignored since
-                                 // DUPLICATE_SAME_ACCESS is used.
-                         FALSE,  // Request non-inheritable handler.
-                         DUPLICATE_SAME_ACCESS)) {
-    DeathTestAbort("Unable to duplicate the pipe handle " +
-                   StreamableToString(write_handle_as_size_t) +
-                   " from the parent process " +
-                   StreamableToString(parent_process_id));
-  }
-
-  const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
-  HANDLE dup_event_handle;
-
-  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
-                         ::GetCurrentProcess(), &dup_event_handle,
-                         0x0,
-                         FALSE,
-                         DUPLICATE_SAME_ACCESS)) {
-    DeathTestAbort("Unable to duplicate the event handle " +
-                   StreamableToString(event_handle_as_size_t) +
-                   " from the parent process " +
-                   StreamableToString(parent_process_id));
-  }
-
-  const int write_fd =
-      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
-  if (write_fd == -1) {
-    DeathTestAbort("Unable to convert pipe handle " +
-                   StreamableToString(write_handle_as_size_t) +
-                   " to a file descriptor");
-  }
-
-  // Signals the parent that the write end of the pipe has been acquired
-  // so the parent can release its own write end.
-  ::SetEvent(dup_event_handle);
-
-  return write_fd;
-}
-# endif  // GTEST_OS_WINDOWS
-
-// Returns a newly created InternalRunDeathTestFlag object with fields
-// initialized from the GTEST_FLAG(internal_run_death_test) flag if
-// the flag is specified; otherwise returns NULL.
-InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
-  if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
-
-  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
-  // can use it here.
-  int line = -1;
-  int index = -1;
-  ::std::vector< ::std::string> fields;
-  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
-  int write_fd = -1;
-
-# if GTEST_OS_WINDOWS
-
-  unsigned int parent_process_id = 0;
-  size_t write_handle_as_size_t = 0;
-  size_t event_handle_as_size_t = 0;
-
-  if (fields.size() != 6
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &parent_process_id)
-      || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
-      || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
-                   GTEST_FLAG(internal_run_death_test));
-  }
-  write_fd = GetStatusFileDescriptor(parent_process_id,
-                                     write_handle_as_size_t,
-                                     event_handle_as_size_t);
-
-# elif GTEST_OS_FUCHSIA
-
-  if (fields.size() != 3
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
-        + GTEST_FLAG(internal_run_death_test));
-  }
-
-# else
-
-  if (fields.size() != 4
-      || !ParseNaturalNumber(fields[1], &line)
-      || !ParseNaturalNumber(fields[2], &index)
-      || !ParseNaturalNumber(fields[3], &write_fd)) {
-    DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
-        + GTEST_FLAG(internal_run_death_test));
-  }
-
-# endif  // GTEST_OS_WINDOWS
-
-  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
-}
-
-}  // namespace internal
-
-#endif  // GTEST_HAS_DEATH_TEST
-
-}  // namespace testing
diff --git a/deps/googletest/src/gtest-filepath.cc b/deps/googletest/src/gtest-filepath.cc
deleted file mode 100644
index 322fbb1b4..000000000
--- a/deps/googletest/src/gtest-filepath.cc
+++ /dev/null
@@ -1,379 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include "gtest/internal/gtest-filepath.h"
-
-#include <stdlib.h>
-#include "gtest/internal/gtest-port.h"
-#include "gtest/gtest-message.h"
-
-#if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>
-#elif GTEST_OS_WINDOWS
-# include <direct.h>
-# include <io.h>
-#else
-# include <limits.h>
-# include <climits>  // Some Linux distributions define PATH_MAX here.
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-#include "gtest/internal/gtest-string.h"
-
-#if GTEST_OS_WINDOWS
-# define GTEST_PATH_MAX_ _MAX_PATH
-#elif defined(PATH_MAX)
-# define GTEST_PATH_MAX_ PATH_MAX
-#elif defined(_XOPEN_PATH_MAX)
-# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
-#else
-# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
-#endif  // GTEST_OS_WINDOWS
-
-namespace testing {
-namespace internal {
-
-#if GTEST_OS_WINDOWS
-// On Windows, '\\' is the standard path separator, but many tools and the
-// Windows API also accept '/' as an alternate path separator. Unless otherwise
-// noted, a file path can contain either kind of path separators, or a mixture
-// of them.
-const char kPathSeparator = '\\';
-const char kAlternatePathSeparator = '/';
-const char kAlternatePathSeparatorString[] = "/";
-# if GTEST_OS_WINDOWS_MOBILE
-// Windows CE doesn't have a current directory. You should not use
-// the current directory in tests on Windows CE, but this at least
-// provides a reasonable fallback.
-const char kCurrentDirectoryString[] = "\\";
-// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
-const DWORD kInvalidFileAttributes = 0xffffffff;
-# else
-const char kCurrentDirectoryString[] = ".\\";
-# endif  // GTEST_OS_WINDOWS_MOBILE
-#else
-const char kPathSeparator = '/';
-const char kCurrentDirectoryString[] = "./";
-#endif  // GTEST_OS_WINDOWS
-
-// Returns whether the given character is a valid path separator.
-static bool IsPathSeparator(char c) {
-#if GTEST_HAS_ALT_PATH_SEP_
-  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
-#else
-  return c == kPathSeparator;
-#endif
-}
-
-// Returns the current working directory, or "" if unsuccessful.
-FilePath FilePath::GetCurrentDir() {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || ARDUINO
-  // Windows CE and Arduino don't have a current directory, so we just return
-  // something reasonable.
-  return FilePath(kCurrentDirectoryString);
-#elif GTEST_OS_WINDOWS
-  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
-  return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
-#else
-  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
-  char* result = getcwd(cwd, sizeof(cwd));
-# if GTEST_OS_NACL
-  // getcwd will likely fail in NaCl due to the sandbox, so return something
-  // reasonable. The user may have provided a shim implementation for getcwd,
-  // however, so fallback only when failure is detected.
-  return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
-# endif  // GTEST_OS_NACL
-  return FilePath(result == nullptr ? "" : cwd);
-#endif  // GTEST_OS_WINDOWS_MOBILE
-}
-
-// Returns a copy of the FilePath with the case-insensitive extension removed.
-// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
-// FilePath("dir/file"). If a case-insensitive extension is not
-// found, returns a copy of the original FilePath.
-FilePath FilePath::RemoveExtension(const char* extension) const {
-  const std::string dot_extension = std::string(".") + extension;
-  if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
-    return FilePath(pathname_.substr(
-        0, pathname_.length() - dot_extension.length()));
-  }
-  return *this;
-}
-
-// Returns a pointer to the last occurrence of a valid path separator in
-// the FilePath. On Windows, for example, both '/' and '\' are valid path
-// separators. Returns NULL if no path separator was found.
-const char* FilePath::FindLastPathSeparator() const {
-  const char* const last_sep = strrchr(c_str(), kPathSeparator);
-#if GTEST_HAS_ALT_PATH_SEP_
-  const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
-  // Comparing two pointers of which only one is NULL is undefined.
-  if (last_alt_sep != nullptr &&
-      (last_sep == nullptr || last_alt_sep > last_sep)) {
-    return last_alt_sep;
-  }
-#endif
-  return last_sep;
-}
-
-// Returns a copy of the FilePath with the directory part removed.
-// Example: FilePath("path/to/file").RemoveDirectoryName() returns
-// FilePath("file"). If there is no directory part ("just_a_file"), it returns
-// the FilePath unmodified. If there is no file part ("just_a_dir/") it
-// returns an empty FilePath ("").
-// On Windows platform, '\' is the path separator, otherwise it is '/'.
-FilePath FilePath::RemoveDirectoryName() const {
-  const char* const last_sep = FindLastPathSeparator();
-  return last_sep ? FilePath(last_sep + 1) : *this;
-}
-
-// RemoveFileName returns the directory path with the filename removed.
-// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
-// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
-// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
-// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
-// On Windows platform, '\' is the path separator, otherwise it is '/'.
-FilePath FilePath::RemoveFileName() const {
-  const char* const last_sep = FindLastPathSeparator();
-  std::string dir;
-  if (last_sep) {
-    dir = std::string(c_str(), static_cast<size_t>(last_sep + 1 - c_str()));
-  } else {
-    dir = kCurrentDirectoryString;
-  }
-  return FilePath(dir);
-}
-
-// Helper functions for naming files in a directory for xml output.
-
-// Given directory = "dir", base_name = "test", number = 0,
-// extension = "xml", returns "dir/test.xml". If number is greater
-// than zero (e.g., 12), returns "dir/test_12.xml".
-// On Windows platform, uses \ as the separator rather than /.
-FilePath FilePath::MakeFileName(const FilePath& directory,
-                                const FilePath& base_name,
-                                int number,
-                                const char* extension) {
-  std::string file;
-  if (number == 0) {
-    file = base_name.string() + "." + extension;
-  } else {
-    file = base_name.string() + "_" + StreamableToString(number)
-        + "." + extension;
-  }
-  return ConcatPaths(directory, FilePath(file));
-}
-
-// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
-// On Windows, uses \ as the separator rather than /.
-FilePath FilePath::ConcatPaths(const FilePath& directory,
-                               const FilePath& relative_path) {
-  if (directory.IsEmpty())
-    return relative_path;
-  const FilePath dir(directory.RemoveTrailingPathSeparator());
-  return FilePath(dir.string() + kPathSeparator + relative_path.string());
-}
-
-// Returns true if pathname describes something findable in the file-system,
-// either a file, directory, or whatever.
-bool FilePath::FileOrDirectoryExists() const {
-#if GTEST_OS_WINDOWS_MOBILE
-  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
-  const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
-  return attributes != kInvalidFileAttributes;
-#else
-  posix::StatStruct file_stat;
-  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
-#endif  // GTEST_OS_WINDOWS_MOBILE
-}
-
-// Returns true if pathname describes a directory in the file-system
-// that exists.
-bool FilePath::DirectoryExists() const {
-  bool result = false;
-#if GTEST_OS_WINDOWS
-  // Don't strip off trailing separator if path is a root directory on
-  // Windows (like "C:\\").
-  const FilePath& path(IsRootDirectory() ? *this :
-                                           RemoveTrailingPathSeparator());
-#else
-  const FilePath& path(*this);
-#endif
-
-#if GTEST_OS_WINDOWS_MOBILE
-  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
-  const DWORD attributes = GetFileAttributes(unicode);
-  delete [] unicode;
-  if ((attributes != kInvalidFileAttributes) &&
-      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
-    result = true;
-  }
-#else
-  posix::StatStruct file_stat;
-  result = posix::Stat(path.c_str(), &file_stat) == 0 &&
-      posix::IsDir(file_stat);
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-  return result;
-}
-
-// Returns true if pathname describes a root directory. (Windows has one
-// root directory per disk drive.)
-bool FilePath::IsRootDirectory() const {
-#if GTEST_OS_WINDOWS
-  return pathname_.length() == 3 && IsAbsolutePath();
-#else
-  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
-#endif
-}
-
-// Returns true if pathname describes an absolute path.
-bool FilePath::IsAbsolutePath() const {
-  const char* const name = pathname_.c_str();
-#if GTEST_OS_WINDOWS
-  return pathname_.length() >= 3 &&
-     ((name[0] >= 'a' && name[0] <= 'z') ||
-      (name[0] >= 'A' && name[0] <= 'Z')) &&
-     name[1] == ':' &&
-     IsPathSeparator(name[2]);
-#else
-  return IsPathSeparator(name[0]);
-#endif
-}
-
-// Returns a pathname for a file that does not currently exist. The pathname
-// will be directory/base_name.extension or
-// directory/base_name_<number>.extension if directory/base_name.extension
-// already exists. The number will be incremented until a pathname is found
-// that does not already exist.
-// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
-// There could be a race condition if two or more processes are calling this
-// function at the same time -- they could both pick the same filename.
-FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
-                                          const FilePath& base_name,
-                                          const char* extension) {
-  FilePath full_pathname;
-  int number = 0;
-  do {
-    full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
-  } while (full_pathname.FileOrDirectoryExists());
-  return full_pathname;
-}
-
-// Returns true if FilePath ends with a path separator, which indicates that
-// it is intended to represent a directory. Returns false otherwise.
-// This does NOT check that a directory (or file) actually exists.
-bool FilePath::IsDirectory() const {
-  return !pathname_.empty() &&
-         IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
-}
-
-// Create directories so that path exists. Returns true if successful or if
-// the directories already exist; returns false if unable to create directories
-// for any reason.
-bool FilePath::CreateDirectoriesRecursively() const {
-  if (!this->IsDirectory()) {
-    return false;
-  }
-
-  if (pathname_.length() == 0 || this->DirectoryExists()) {
-    return true;
-  }
-
-  const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
-  return parent.CreateDirectoriesRecursively() && this->CreateFolder();
-}
-
-// Create the directory so that path exists. Returns true if successful or
-// if the directory already exists; returns false if unable to create the
-// directory for any reason, including if the parent directory does not
-// exist. Not named "CreateDirectory" because that's a macro on Windows.
-bool FilePath::CreateFolder() const {
-#if GTEST_OS_WINDOWS_MOBILE
-  FilePath removed_sep(this->RemoveTrailingPathSeparator());
-  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
-  int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
-  delete [] unicode;
-#elif GTEST_OS_WINDOWS
-  int result = _mkdir(pathname_.c_str());
-#else
-  int result = mkdir(pathname_.c_str(), 0777);
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-  if (result == -1) {
-    return this->DirectoryExists();  // An error is OK if the directory exists.
-  }
-  return true;  // No error.
-}
-
-// If input name has a trailing separator character, remove it and return the
-// name, otherwise return the name string unmodified.
-// On Windows platform, uses \ as the separator, other platforms use /.
-FilePath FilePath::RemoveTrailingPathSeparator() const {
-  return IsDirectory()
-      ? FilePath(pathname_.substr(0, pathname_.length() - 1))
-      : *this;
-}
-
-// Removes any redundant separators that might be in the pathname.
-// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
-// redundancies that might be in a pathname involving "." or "..".
-void FilePath::Normalize() {
-  if (pathname_.c_str() == nullptr) {
-    pathname_ = "";
-    return;
-  }
-  const char* src = pathname_.c_str();
-  char* const dest = new char[pathname_.length() + 1];
-  char* dest_ptr = dest;
-  memset(dest_ptr, 0, pathname_.length() + 1);
-
-  while (*src != '\0') {
-    *dest_ptr = *src;
-    if (!IsPathSeparator(*src)) {
-      src++;
-    } else {
-#if GTEST_HAS_ALT_PATH_SEP_
-      if (*dest_ptr == kAlternatePathSeparator) {
-        *dest_ptr = kPathSeparator;
-      }
-#endif
-      while (IsPathSeparator(*src))
-        src++;
-    }
-    dest_ptr++;
-  }
-  *dest_ptr = '\0';
-  pathname_ = dest;
-  delete[] dest;
-}
-
-}  // namespace internal
-}  // namespace testing
diff --git a/deps/googletest/src/gtest-internal-inl.h b/deps/googletest/src/gtest-internal-inl.h
deleted file mode 100644
index bdd7e90a3..000000000
--- a/deps/googletest/src/gtest-internal-inl.h
+++ /dev/null
@@ -1,1210 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// Utility functions and classes used by the Google C++ testing framework.//
-// This file contains purely Google Test's internal implementation.  Please
-// DO NOT #INCLUDE IT IN A USER PROGRAM.
-
-#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
-#define GTEST_SRC_GTEST_INTERNAL_INL_H_
-
-#ifndef _WIN32_WCE
-# include <errno.h>
-#endif  // !_WIN32_WCE
-#include <stddef.h>
-#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
-#include <string.h>  // For memmove.
-
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "gtest/internal/gtest-port.h"
-
-#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-#endif
-
-#if GTEST_OS_WINDOWS
-# include <windows.h>  // NOLINT
-#endif  // GTEST_OS_WINDOWS
-
-#include "gtest/gtest.h"
-#include "gtest/gtest-spi.h"
-
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
-/* class A needs to have dll-interface to be used by clients of class B */)
-
-namespace testing {
-
-// Declares the flags.
-//
-// We don't want the users to modify this flag in the code, but want
-// Google Test's own unit tests to be able to access it. Therefore we
-// declare it here as opposed to in gtest.h.
-GTEST_DECLARE_bool_(death_test_use_fork);
-
-namespace internal {
-
-// The value of GetTestTypeId() as seen from within the Google Test
-// library.  This is solely for testing GetTestTypeId().
-GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
-
-// Names of the flags (needed for parsing Google Test flags).
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
-const char kBreakOnFailureFlag[] = "break_on_failure";
-const char kCatchExceptionsFlag[] = "catch_exceptions";
-const char kColorFlag[] = "color";
-const char kFilterFlag[] = "filter";
-const char kListTestsFlag[] = "list_tests";
-const char kOutputFlag[] = "output";
-const char kPrintTimeFlag[] = "print_time";
-const char kPrintUTF8Flag[] = "print_utf8";
-const char kRandomSeedFlag[] = "random_seed";
-const char kRepeatFlag[] = "repeat";
-const char kShuffleFlag[] = "shuffle";
-const char kStackTraceDepthFlag[] = "stack_trace_depth";
-const char kStreamResultToFlag[] = "stream_result_to";
-const char kThrowOnFailureFlag[] = "throw_on_failure";
-const char kFlagfileFlag[] = "flagfile";
-
-// A valid random seed must be in [1, kMaxRandomSeed].
-const int kMaxRandomSeed = 99999;
-
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
-GTEST_API_ extern bool g_help_flag;
-
-// Returns the current time in milliseconds.
-GTEST_API_ TimeInMillis GetTimeInMillis();
-
-// Returns true iff Google Test should use colors in the output.
-GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
-
-// Formats the given time in milliseconds as seconds.
-GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
-
-// Converts the given time in milliseconds to a date string in the ISO 8601
-// format, without the timezone information.  N.B.: due to the use the
-// non-reentrant localtime() function, this function is not thread safe.  Do
-// not use it in any code that can be called from multiple threads.
-GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
-
-// Parses a string for an Int32 flag, in the form of "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
-    const char* str, const char* flag, Int32* value);
-
-// Returns a random seed in range [1, kMaxRandomSeed] based on the
-// given --gtest_random_seed flag value.
-inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
-  const unsigned int raw_seed = (random_seed_flag == 0) ?
-      static_cast<unsigned int>(GetTimeInMillis()) :
-      static_cast<unsigned int>(random_seed_flag);
-
-  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
-  // it's easy to type.
-  const int normalized_seed =
-      static_cast<int>((raw_seed - 1U) %
-                       static_cast<unsigned int>(kMaxRandomSeed)) + 1;
-  return normalized_seed;
-}
-
-// Returns the first valid random seed after 'seed'.  The behavior is
-// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
-// considered to be 1.
-inline int GetNextRandomSeed(int seed) {
-  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
-      << "Invalid random seed " << seed << " - must be in [1, "
-      << kMaxRandomSeed << "].";
-  const int next_seed = seed + 1;
-  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
-}
-
-// This class saves the values of all Google Test flags in its c'tor, and
-// restores them in its d'tor.
-class GTestFlagSaver {
- public:
-  // The c'tor.
-  GTestFlagSaver() {
-    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
-    break_on_failure_ = GTEST_FLAG(break_on_failure);
-    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
-    color_ = GTEST_FLAG(color);
-    death_test_style_ = GTEST_FLAG(death_test_style);
-    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
-    filter_ = GTEST_FLAG(filter);
-    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
-    list_tests_ = GTEST_FLAG(list_tests);
-    output_ = GTEST_FLAG(output);
-    print_time_ = GTEST_FLAG(print_time);
-    print_utf8_ = GTEST_FLAG(print_utf8);
-    random_seed_ = GTEST_FLAG(random_seed);
-    repeat_ = GTEST_FLAG(repeat);
-    shuffle_ = GTEST_FLAG(shuffle);
-    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
-    stream_result_to_ = GTEST_FLAG(stream_result_to);
-    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
-  }
-
-  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
-  ~GTestFlagSaver() {
-    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
-    GTEST_FLAG(break_on_failure) = break_on_failure_;
-    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
-    GTEST_FLAG(color) = color_;
-    GTEST_FLAG(death_test_style) = death_test_style_;
-    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
-    GTEST_FLAG(filter) = filter_;
-    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
-    GTEST_FLAG(list_tests) = list_tests_;
-    GTEST_FLAG(output) = output_;
-    GTEST_FLAG(print_time) = print_time_;
-    GTEST_FLAG(print_utf8) = print_utf8_;
-    GTEST_FLAG(random_seed) = random_seed_;
-    GTEST_FLAG(repeat) = repeat_;
-    GTEST_FLAG(shuffle) = shuffle_;
-    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
-    GTEST_FLAG(stream_result_to) = stream_result_to_;
-    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
-  }
-
- private:
-  // Fields for saving the original values of flags.
-  bool also_run_disabled_tests_;
-  bool break_on_failure_;
-  bool catch_exceptions_;
-  std::string color_;
-  std::string death_test_style_;
-  bool death_test_use_fork_;
-  std::string filter_;
-  std::string internal_run_death_test_;
-  bool list_tests_;
-  std::string output_;
-  bool print_time_;
-  bool print_utf8_;
-  internal::Int32 random_seed_;
-  internal::Int32 repeat_;
-  bool shuffle_;
-  internal::Int32 stack_trace_depth_;
-  std::string stream_result_to_;
-  bool throw_on_failure_;
-} GTEST_ATTRIBUTE_UNUSED_;
-
-// Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
-// wide enough to contain a code point.
-// If the code_point is not a valid Unicode code point
-// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
-// to "(Invalid Unicode 0xXXXXXXXX)".
-GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
-
-// Converts a wide string to a narrow string in UTF-8 encoding.
-// The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
-//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
-// Parameter str points to a null-terminated wide string.
-// Parameter num_chars may additionally limit the number
-// of wchar_t characters processed. -1 is used when the entire string
-// should be processed.
-// If the string contains code points that are not valid Unicode code points
-// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
-// and contains invalid UTF-16 surrogate pairs, values in those pairs
-// will be encoded as individual Unicode characters from Basic Normal Plane.
-GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
-
-// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
-// if the variable is present. If a file already exists at this location, this
-// function will write over it. If the variable is present, but the file cannot
-// be created, prints an error and exits.
-void WriteToShardStatusFileIfNeeded();
-
-// Checks whether sharding is enabled by examining the relevant
-// environment variable values. If the variables are present,
-// but inconsistent (e.g., shard_index >= total_shards), prints
-// an error and exits. If in_subprocess_for_death_test, sharding is
-// disabled because it must only be applied to the original test
-// process. Otherwise, we could filter out death tests we intended to execute.
-GTEST_API_ bool ShouldShard(const char* total_shards_str,
-                            const char* shard_index_str,
-                            bool in_subprocess_for_death_test);
-
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error and
-// and aborts.
-GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
-
-// Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
-// method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
-    int total_shards, int shard_index, int test_id);
-
-// STL container utilities.
-
-// Returns the number of elements in the given container that satisfy
-// the given predicate.
-template <class Container, typename Predicate>
-inline int CountIf(const Container& c, Predicate predicate) {
-  // Implemented as an explicit loop since std::count_if() in libCstd on
-  // Solaris has a non-standard signature.
-  int count = 0;
-  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
-    if (predicate(*it))
-      ++count;
-  }
-  return count;
-}
-
-// Applies a function/functor to each element in the container.
-template <class Container, typename Functor>
-void ForEach(const Container& c, Functor functor) {
-  std::for_each(c.begin(), c.end(), functor);
-}
-
-// Returns the i-th element of the vector, or default_value if i is not
-// in range [0, v.size()).
-template <typename E>
-inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
-  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value
-                                                    : v[static_cast<size_t>(i)];
-}
-
-// Performs an in-place shuffle of a range of the vector's elements.
-// 'begin' and 'end' are element indices as an STL-style range;
-// i.e. [begin, end) are shuffled, where 'end' == size() means to
-// shuffle to the end of the vector.
-template <typename E>
-void ShuffleRange(internal::Random* random, int begin, int end,
-                  std::vector<E>* v) {
-  const int size = static_cast<int>(v->size());
-  GTEST_CHECK_(0 <= begin && begin <= size)
-      << "Invalid shuffle range start " << begin << ": must be in range [0, "
-      << size << "].";
-  GTEST_CHECK_(begin <= end && end <= size)
-      << "Invalid shuffle range finish " << end << ": must be in range ["
-      << begin << ", " << size << "].";
-
-  // Fisher-Yates shuffle, from
-  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
-  for (int range_width = end - begin; range_width >= 2; range_width--) {
-    const int last_in_range = begin + range_width - 1;
-    const int selected =
-        begin +
-        static_cast<int>(random->Generate(static_cast<UInt32>(range_width)));
-    std::swap((*v)[static_cast<size_t>(selected)],
-              (*v)[static_cast<size_t>(last_in_range)]);
-  }
-}
-
-// Performs an in-place shuffle of the vector's elements.
-template <typename E>
-inline void Shuffle(internal::Random* random, std::vector<E>* v) {
-  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
-}
-
-// A function for deleting an object.  Handy for being used as a
-// functor.
-template <typename T>
-static void Delete(T* x) {
-  delete x;
-}
-
-// A predicate that checks the key of a TestProperty against a known key.
-//
-// TestPropertyKeyIs is copyable.
-class TestPropertyKeyIs {
- public:
-  // Constructor.
-  //
-  // TestPropertyKeyIs has NO default constructor.
-  explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
-
-  // Returns true iff the test name of test property matches on key_.
-  bool operator()(const TestProperty& test_property) const {
-    return test_property.key() == key_;
-  }
-
- private:
-  std::string key_;
-};
-
-// Class UnitTestOptions.
-//
-// This class contains functions for processing options the user
-// specifies when running the tests.  It has only static members.
-//
-// In most cases, the user can specify an option using either an
-// environment variable or a command line flag.  E.g. you can set the
-// test filter using either GTEST_FILTER or --gtest_filter.  If both
-// the variable and the flag are present, the latter overrides the
-// former.
-class GTEST_API_ UnitTestOptions {
- public:
-  // Functions for processing the gtest_output flag.
-
-  // Returns the output format, or "" for normal printed output.
-  static std::string GetOutputFormat();
-
-  // Returns the absolute path of the requested output file, or the
-  // default (test_detail.xml in the original working directory) if
-  // none was explicitly specified.
-  static std::string GetAbsolutePathToOutputFile();
-
-  // Functions for processing the gtest_filter flag.
-
-  // Returns true iff the wildcard pattern matches the string.  The
-  // first ':' or '\0' character in pattern marks the end of it.
-  //
-  // This recursive algorithm isn't very efficient, but is clear and
-  // works well enough for matching test names, which are short.
-  static bool PatternMatchesString(const char *pattern, const char *str);
-
-  // Returns true iff the user-specified filter matches the test suite
-  // name and the test name.
-  static bool FilterMatchesTest(const std::string& test_suite_name,
-                                const std::string& test_name);
-
-#if GTEST_OS_WINDOWS
-  // Function for supporting the gtest_catch_exception flag.
-
-  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
-  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
-  // This function is useful as an __except condition.
-  static int GTestShouldProcessSEH(DWORD exception_code);
-#endif  // GTEST_OS_WINDOWS
-
-  // Returns true if "name" matches the ':' separated list of glob-style
-  // filters in "filter".
-  static bool MatchesFilter(const std::string& name, const char* filter);
-};
-
-// Returns the current application's name, removing directory path if that
-// is present.  Used by UnitTestOptions::GetOutputFile.
-GTEST_API_ FilePath GetCurrentExecutableName();
-
-// The role interface for getting the OS stack trace as a string.
-class OsStackTraceGetterInterface {
- public:
-  OsStackTraceGetterInterface() {}
-  virtual ~OsStackTraceGetterInterface() {}
-
-  // Returns the current OS stack trace as an std::string.  Parameters:
-  //
-  //   max_depth  - the maximum number of stack frames to be included
-  //                in the trace.
-  //   skip_count - the number of top frames to be skipped; doesn't count
-  //                against max_depth.
-  virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0;
-
-  // UponLeavingGTest() should be called immediately before Google Test calls
-  // user code. It saves some information about the current stack that
-  // CurrentStackTrace() will use to find and hide Google Test stack frames.
-  virtual void UponLeavingGTest() = 0;
-
-  // This string is inserted in place of stack frames that are part of
-  // Google Test's implementation.
-  static const char* const kElidedFramesMarker;
-
- private:
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
-};
-
-// A working implementation of the OsStackTraceGetterInterface interface.
-class OsStackTraceGetter : public OsStackTraceGetterInterface {
- public:
-  OsStackTraceGetter() {}
-
-  std::string CurrentStackTrace(int max_depth, int skip_count) override;
-  void UponLeavingGTest() override;
-
- private:
-#if GTEST_HAS_ABSL
-  Mutex mutex_;  // Protects all internal state.
-
-  // We save the stack frame below the frame that calls user code.
-  // We do this because the address of the frame immediately below
-  // the user code changes between the call to UponLeavingGTest()
-  // and any calls to the stack trace code from within the user code.
-  void* caller_frame_ = nullptr;
-#endif  // GTEST_HAS_ABSL
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
-};
-
-// Information about a Google Test trace point.
-struct TraceInfo {
-  const char* file;
-  int line;
-  std::string message;
-};
-
-// This is the default global test part result reporter used in UnitTestImpl.
-// This class should only be used by UnitTestImpl.
-class DefaultGlobalTestPartResultReporter
-  : public TestPartResultReporterInterface {
- public:
-  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
-  // Implements the TestPartResultReporterInterface. Reports the test part
-  // result in the current test.
-  void ReportTestPartResult(const TestPartResult& result) override;
-
- private:
-  UnitTestImpl* const unit_test_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
-};
-
-// This is the default per thread test part result reporter used in
-// UnitTestImpl. This class should only be used by UnitTestImpl.
-class DefaultPerThreadTestPartResultReporter
-    : public TestPartResultReporterInterface {
- public:
-  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
-  // Implements the TestPartResultReporterInterface. The implementation just
-  // delegates to the current global test part result reporter of *unit_test_.
-  void ReportTestPartResult(const TestPartResult& result) override;
-
- private:
-  UnitTestImpl* const unit_test_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
-};
-
-// The private implementation of the UnitTest class.  We don't protect
-// the methods under a mutex, as this class is not accessible by a
-// user and the UnitTest class that delegates work to this class does
-// proper locking.
-class GTEST_API_ UnitTestImpl {
- public:
-  explicit UnitTestImpl(UnitTest* parent);
-  virtual ~UnitTestImpl();
-
-  // There are two different ways to register your own TestPartResultReporter.
-  // You can register your own reporter to listen either only for test results
-  // from the current thread or for results from all threads.
-  // By default, each per-thread test result reporter just passes a new
-  // TestPartResult to the global test result reporter, which registers the
-  // test part result for the currently running test.
-
-  // Returns the global test part result reporter.
-  TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
-
-  // Sets the global test part result reporter.
-  void SetGlobalTestPartResultReporter(
-      TestPartResultReporterInterface* reporter);
-
-  // Returns the test part result reporter for the current thread.
-  TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
-
-  // Sets the test part result reporter for the current thread.
-  void SetTestPartResultReporterForCurrentThread(
-      TestPartResultReporterInterface* reporter);
-
-  // Gets the number of successful test suites.
-  int successful_test_suite_count() const;
-
-  // Gets the number of failed test suites.
-  int failed_test_suite_count() const;
-
-  // Gets the number of all test suites.
-  int total_test_suite_count() const;
-
-  // Gets the number of all test suites that contain at least one test
-  // that should run.
-  int test_suite_to_run_count() const;
-
-  // Gets the number of successful tests.
-  int successful_test_count() const;
-
-  // Gets the number of skipped tests.
-  int skipped_test_count() const;
-
-  // Gets the number of failed tests.
-  int failed_test_count() const;
-
-  // Gets the number of disabled tests that will be reported in the XML report.
-  int reportable_disabled_test_count() const;
-
-  // Gets the number of disabled tests.
-  int disabled_test_count() const;
-
-  // Gets the number of tests to be printed in the XML report.
-  int reportable_test_count() const;
-
-  // Gets the number of all tests.
-  int total_test_count() const;
-
-  // Gets the number of tests that should run.
-  int test_to_run_count() const;
-
-  // Gets the time of the test program start, in ms from the start of the
-  // UNIX epoch.
-  TimeInMillis start_timestamp() const { return start_timestamp_; }
-
-  // Gets the elapsed time, in milliseconds.
-  TimeInMillis elapsed_time() const { return elapsed_time_; }
-
-  // Returns true iff the unit test passed (i.e. all test suites passed).
-  bool Passed() const { return !Failed(); }
-
-  // Returns true iff the unit test failed (i.e. some test suite failed
-  // or something outside of all tests failed).
-  bool Failed() const {
-    return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed();
-  }
-
-  // Gets the i-th test suite among all the test suites. i can range from 0 to
-  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  const TestSuite* GetTestSuite(int i) const {
-    const int index = GetElementOr(test_suite_indices_, i, -1);
-    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(i)];
-  }
-
-  //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  const TestCase* GetTestCase(int i) const { return GetTestSuite(i); }
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  // Gets the i-th test suite among all the test suites. i can range from 0 to
-  // total_test_suite_count() - 1. If i is not in that range, returns NULL.
-  TestSuite* GetMutableSuiteCase(int i) {
-    const int index = GetElementOr(test_suite_indices_, i, -1);
-    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
-  }
-
-  // Provides access to the event listener list.
-  TestEventListeners* listeners() { return &listeners_; }
-
-  // Returns the TestResult for the test that's currently running, or
-  // the TestResult for the ad hoc test if no test is running.
-  TestResult* current_test_result();
-
-  // Returns the TestResult for the ad hoc test.
-  const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
-
-  // Sets the OS stack trace getter.
-  //
-  // Does nothing if the input and the current OS stack trace getter
-  // are the same; otherwise, deletes the old getter and makes the
-  // input the current getter.
-  void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
-
-  // Returns the current OS stack trace getter if it is not NULL;
-  // otherwise, creates an OsStackTraceGetter, makes it the current
-  // getter, and returns it.
-  OsStackTraceGetterInterface* os_stack_trace_getter();
-
-  // Returns the current OS stack trace as an std::string.
-  //
-  // The maximum number of stack frames to be included is specified by
-  // the gtest_stack_trace_depth flag.  The skip_count parameter
-  // specifies the number of top frames to be skipped, which doesn't
-  // count against the number of frames to be included.
-  //
-  // For example, if Foo() calls Bar(), which in turn calls
-  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
-  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
-  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
-
-  // Finds and returns a TestSuite with the given name.  If one doesn't
-  // exist, creates one and returns it.
-  //
-  // Arguments:
-  //
-  //   test_suite_name: name of the test suite
-  //   type_param:     the name of the test's type parameter, or NULL if
-  //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test suite
-  //   tear_down_tc:   pointer to the function that tears down the test suite
-  TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param,
-                          internal::SetUpTestSuiteFunc set_up_tc,
-                          internal::TearDownTestSuiteFunc tear_down_tc);
-
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-  TestCase* GetTestCase(const char* test_case_name, const char* type_param,
-                        internal::SetUpTestSuiteFunc set_up_tc,
-                        internal::TearDownTestSuiteFunc tear_down_tc) {
-    return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc);
-  }
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-  // Adds a TestInfo to the unit test.
-  //
-  // Arguments:
-  //
-  //   set_up_tc:    pointer to the function that sets up the test suite
-  //   tear_down_tc: pointer to the function that tears down the test suite
-  //   test_info:    the TestInfo object
-  void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
-                   internal::TearDownTestSuiteFunc tear_down_tc,
-                   TestInfo* test_info) {
-    // In order to support thread-safe death tests, we need to
-    // remember the original working directory when the test program
-    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
-    // the user may have changed the current directory before calling
-    // RUN_ALL_TESTS().  Therefore we capture the current directory in
-    // AddTestInfo(), which is called to register a TEST or TEST_F
-    // before main() is reached.
-    if (original_working_dir_.IsEmpty()) {
-      original_working_dir_.Set(FilePath::GetCurrentDir());
-      GTEST_CHECK_(!original_working_dir_.IsEmpty())
-          << "Failed to get the current working directory.";
-    }
-
-    GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
-                 set_up_tc, tear_down_tc)
-        ->AddTestInfo(test_info);
-  }
-
-  // Returns ParameterizedTestSuiteRegistry object used to keep track of
-  // value-parameterized tests and instantiate and register them.
-  internal::ParameterizedTestSuiteRegistry& parameterized_test_registry() {
-    return parameterized_test_registry_;
-  }
-
-  // Sets the TestSuite object for the test that's currently running.
-  void set_current_test_suite(TestSuite* a_current_test_suite) {
-    current_test_suite_ = a_current_test_suite;
-  }
-
-  // Sets the TestInfo object for the test that's currently running.  If
-  // current_test_info is NULL, the assertion results will be stored in
-  // ad_hoc_test_result_.
-  void set_current_test_info(TestInfo* a_current_test_info) {
-    current_test_info_ = a_current_test_info;
-  }
-
-  // Registers all parameterized tests defined using TEST_P and
-  // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter
-  // combination. This method can be called more then once; it has guards
-  // protecting from registering the tests more then once.  If
-  // value-parameterized tests are disabled, RegisterParameterizedTests is
-  // present but does nothing.
-  void RegisterParameterizedTests();
-
-  // Runs all tests in this UnitTest object, prints the result, and
-  // returns true if all tests are successful.  If any exception is
-  // thrown during a test, this test is considered to be failed, but
-  // the rest of the tests will still be run.
-  bool RunAllTests();
-
-  // Clears the results of all tests, except the ad hoc tests.
-  void ClearNonAdHocTestResult() {
-    ForEach(test_suites_, TestSuite::ClearTestSuiteResult);
-  }
-
-  // Clears the results of ad-hoc test assertions.
-  void ClearAdHocTestResult() {
-    ad_hoc_test_result_.Clear();
-  }
-
-  // Adds a TestProperty to the current TestResult object when invoked in a
-  // context of a test or a test suite, or to the global property set. If the
-  // result already contains a property with the same key, the value will be
-  // updated.
-  void RecordProperty(const TestProperty& test_property);
-
-  enum ReactionToSharding {
-    HONOR_SHARDING_PROTOCOL,
-    IGNORE_SHARDING_PROTOCOL
-  };
-
-  // Matches the full name of each test against the user-specified
-  // filter to decide whether the test should run, then records the
-  // result in each TestSuite and TestInfo object.
-  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
-  // based on sharding variables in the environment.
-  // Returns the number of tests that should run.
-  int FilterTests(ReactionToSharding shard_tests);
-
-  // Prints the names of the tests matching the user-specified filter flag.
-  void ListTestsMatchingFilter();
-
-  const TestSuite* current_test_suite() const { return current_test_suite_; }
-  TestInfo* current_test_info() { return current_test_info_; }
-  const TestInfo* current_test_info() const { return current_test_info_; }
-
-  // Returns the vector of environments that need to be set-up/torn-down
-  // before/after the tests are run.
-  std::vector<Environment*>& environments() { return environments_; }
-
-  // Getters for the per-thread Google Test trace stack.
-  std::vector<TraceInfo>& gtest_trace_stack() {
-    return *(gtest_trace_stack_.pointer());
-  }
-  const std::vector<TraceInfo>& gtest_trace_stack() const {
-    return gtest_trace_stack_.get();
-  }
-
-#if GTEST_HAS_DEATH_TEST
-  void InitDeathTestSubprocessControlInfo() {
-    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
-  }
-  // Returns a pointer to the parsed --gtest_internal_run_death_test
-  // flag, or NULL if that flag was not specified.
-  // This information is useful only in a death test child process.
-  // Must not be called before a call to InitGoogleTest.
-  const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
-    return internal_run_death_test_flag_.get();
-  }
-
-  // Returns a pointer to the current death test factory.
-  internal::DeathTestFactory* death_test_factory() {
-    return death_test_factory_.get();
-  }
-
-  void SuppressTestEventsIfInSubprocess();
-
-  friend class ReplaceDeathTestFactory;
-#endif  // GTEST_HAS_DEATH_TEST
-
-  // Initializes the event listener performing XML output as specified by
-  // UnitTestOptions. Must not be called before InitGoogleTest.
-  void ConfigureXmlOutput();
-
-#if GTEST_CAN_STREAM_RESULTS_
-  // Initializes the event listener for streaming test results to a socket.
-  // Must not be called before InitGoogleTest.
-  void ConfigureStreamingOutput();
-#endif
-
-  // Performs initialization dependent upon flag values obtained in
-  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
-  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
-  // this function is also called from RunAllTests.  Since this function can be
-  // called more than once, it has to be idempotent.
-  void PostFlagParsingInit();
-
-  // Gets the random seed used at the start of the current test iteration.
-  int random_seed() const { return random_seed_; }
-
-  // Gets the random number generator.
-  internal::Random* random() { return &random_; }
-
-  // Shuffles all test suites, and the tests within each test suite,
-  // making sure that death tests are still run first.
-  void ShuffleTests();
-
-  // Restores the test suites and tests to their order before the first shuffle.
-  void UnshuffleTests();
-
-  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
-  // UnitTest::Run() starts.
-  bool catch_exceptions() const { return catch_exceptions_; }
-
- private:
-  friend class ::testing::UnitTest;
-
-  // Used by UnitTest::Run() to capture the state of
-  // GTEST_FLAG(catch_exceptions) at the moment it starts.
-  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
-
-  // The UnitTest object that owns this implementation object.
-  UnitTest* const parent_;
-
-  // The working directory when the first TEST() or TEST_F() was
-  // executed.
-  internal::FilePath original_working_dir_;
-
-  // The default test part result reporters.
-  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
-  DefaultPerThreadTestPartResultReporter
-      default_per_thread_test_part_result_reporter_;
-
-  // Points to (but doesn't own) the global test part result reporter.
-  TestPartResultReporterInterface* global_test_part_result_repoter_;
-
-  // Protects read and write access to global_test_part_result_reporter_.
-  internal::Mutex global_test_part_result_reporter_mutex_;
-
-  // Points to (but doesn't own) the per-thread test part result reporter.
-  internal::ThreadLocal<TestPartResultReporterInterface*>
-      per_thread_test_part_result_reporter_;
-
-  // The vector of environments that need to be set-up/torn-down
-  // before/after the tests are run.
-  std::vector<Environment*> environments_;
-
-  // The vector of TestSuites in their original order.  It owns the
-  // elements in the vector.
-  std::vector<TestSuite*> test_suites_;
-
-  // Provides a level of indirection for the test suite list to allow
-  // easy shuffling and restoring the test suite order.  The i-th
-  // element of this vector is the index of the i-th test suite in the
-  // shuffled order.
-  std::vector<int> test_suite_indices_;
-
-  // ParameterizedTestRegistry object used to register value-parameterized
-  // tests.
-  internal::ParameterizedTestSuiteRegistry parameterized_test_registry_;
-
-  // Indicates whether RegisterParameterizedTests() has been called already.
-  bool parameterized_tests_registered_;
-
-  // Index of the last death test suite registered.  Initially -1.
-  int last_death_test_suite_;
-
-  // This points to the TestSuite for the currently running test.  It
-  // changes as Google Test goes through one test suite after another.
-  // When no test is running, this is set to NULL and Google Test
-  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestSuite* current_test_suite_;
-
-  // This points to the TestInfo for the currently running test.  It
-  // changes as Google Test goes through one test after another.  When
-  // no test is running, this is set to NULL and Google Test stores
-  // assertion results in ad_hoc_test_result_.  Initially NULL.
-  TestInfo* current_test_info_;
-
-  // Normally, a user only writes assertions inside a TEST or TEST_F,
-  // or inside a function called by a TEST or TEST_F.  Since Google
-  // Test keeps track of which test is current running, it can
-  // associate such an assertion with the test it belongs to.
-  //
-  // If an assertion is encountered when no TEST or TEST_F is running,
-  // Google Test attributes the assertion result to an imaginary "ad hoc"
-  // test, and records the result in ad_hoc_test_result_.
-  TestResult ad_hoc_test_result_;
-
-  // The list of event listeners that can be used to track events inside
-  // Google Test.
-  TestEventListeners listeners_;
-
-  // The OS stack trace getter.  Will be deleted when the UnitTest
-  // object is destructed.  By default, an OsStackTraceGetter is used,
-  // but the user can set this field to use a custom getter if that is
-  // desired.
-  OsStackTraceGetterInterface* os_stack_trace_getter_;
-
-  // True iff PostFlagParsingInit() has been called.
-  bool post_flag_parse_init_performed_;
-
-  // The random number seed used at the beginning of the test run.
-  int random_seed_;
-
-  // Our random number generator.
-  internal::Random random_;
-
-  // The time of the test program start, in ms from the start of the
-  // UNIX epoch.
-  TimeInMillis start_timestamp_;
-
-  // How long the test took to run, in milliseconds.
-  TimeInMillis elapsed_time_;
-
-#if GTEST_HAS_DEATH_TEST
-  // The decomposed components of the gtest_internal_run_death_test flag,
-  // parsed when RUN_ALL_TESTS is called.
-  std::unique_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
-  std::unique_ptr<internal::DeathTestFactory> death_test_factory_;
-#endif  // GTEST_HAS_DEATH_TEST
-
-  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
-  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
-
-  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
-  // starts.
-  bool catch_exceptions_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
-};  // class UnitTestImpl
-
-// Convenience function for accessing the global UnitTest
-// implementation object.
-inline UnitTestImpl* GetUnitTestImpl() {
-  return UnitTest::GetInstance()->impl();
-}
-
-#if GTEST_USES_SIMPLE_RE
-
-// Internal helper functions for implementing the simple regular
-// expression matcher.
-GTEST_API_ bool IsInSet(char ch, const char* str);
-GTEST_API_ bool IsAsciiDigit(char ch);
-GTEST_API_ bool IsAsciiPunct(char ch);
-GTEST_API_ bool IsRepeat(char ch);
-GTEST_API_ bool IsAsciiWhiteSpace(char ch);
-GTEST_API_ bool IsAsciiWordChar(char ch);
-GTEST_API_ bool IsValidEscape(char ch);
-GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
-GTEST_API_ bool ValidateRegex(const char* regex);
-GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char ch, char repeat, const char* regex, const char* str);
-GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
-
-#endif  // GTEST_USES_SIMPLE_RE
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
-GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
-
-#if GTEST_HAS_DEATH_TEST
-
-// Returns the message describing the last system error, regardless of the
-// platform.
-GTEST_API_ std::string GetLastErrnoDescription();
-
-// Attempts to parse a string into a positive integer pointed to by the
-// number parameter.  Returns true if that is possible.
-// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
-// it here.
-template <typename Integer>
-bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
-  // Fail fast if the given string does not begin with a digit;
-  // this bypasses strtoXXX's "optional leading whitespace and plus
-  // or minus sign" semantics, which are undesirable here.
-  if (str.empty() || !IsDigit(str[0])) {
-    return false;
-  }
-  errno = 0;
-
-  char* end;
-  // BiggestConvertible is the largest integer type that system-provided
-  // string-to-number conversion routines can return.
-
-# if GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  // MSVC and C++ Builder define __int64 instead of the standard long long.
-  typedef unsigned __int64 BiggestConvertible;
-  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
-
-# else
-
-  typedef unsigned long long BiggestConvertible;  // NOLINT
-  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
-
-# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
-
-  const bool parse_success = *end == '\0' && errno == 0;
-
-  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
-
-  const Integer result = static_cast<Integer>(parsed);
-  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
-    *number = result;
-    return true;
-  }
-  return false;
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-// TestResult contains some private methods that should be hidden from
-// Google Test user but are required for testing. This class allow our tests
-// to access them.
-//
-// This class is supplied only for the purpose of testing Google Test's own
-// constructs. Do not use it in user tests, either directly or indirectly.
-class TestResultAccessor {
- public:
-  static void RecordProperty(TestResult* test_result,
-                             const std::string& xml_element,
-                             const TestProperty& property) {
-    test_result->RecordProperty(xml_element, property);
-  }
-
-  static void ClearTestPartResults(TestResult* test_result) {
-    test_result->ClearTestPartResults();
-  }
-
-  static const std::vector<testing::TestPartResult>& test_part_results(
-      const TestResult& test_result) {
-    return test_result.test_part_results();
-  }
-};
-
-#if GTEST_CAN_STREAM_RESULTS_
-
-// Streams test results to the given port on the given host machine.
-class StreamingListener : public EmptyTestEventListener {
- public:
-  // Abstract base class for writing strings to a socket.
-  class AbstractSocketWriter {
-   public:
-    virtual ~AbstractSocketWriter() {}
-
-    // Sends a string to the socket.
-    virtual void Send(const std::string& message) = 0;
-
-    // Closes the socket.
-    virtual void CloseConnection() {}
-
-    // Sends a string and a newline to the socket.
-    void SendLn(const std::string& message) { Send(message + "\n"); }
-  };
-
-  // Concrete class for actually writing strings to a socket.
-  class SocketWriter : public AbstractSocketWriter {
-   public:
-    SocketWriter(const std::string& host, const std::string& port)
-        : sockfd_(-1), host_name_(host), port_num_(port) {
-      MakeConnection();
-    }
-
-    ~SocketWriter() override {
-      if (sockfd_ != -1)
-        CloseConnection();
-    }
-
-    // Sends a string to the socket.
-    void Send(const std::string& message) override {
-      GTEST_CHECK_(sockfd_ != -1)
-          << "Send() can be called only when there is a connection.";
-
-      const auto len = static_cast<size_t>(message.length());
-      if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
-        GTEST_LOG_(WARNING)
-            << "stream_result_to: failed to stream to "
-            << host_name_ << ":" << port_num_;
-      }
-    }
-
-   private:
-    // Creates a client socket and connects to the server.
-    void MakeConnection();
-
-    // Closes the socket.
-    void CloseConnection() override {
-      GTEST_CHECK_(sockfd_ != -1)
-          << "CloseConnection() can be called only when there is a connection.";
-
-      close(sockfd_);
-      sockfd_ = -1;
-    }
-
-    int sockfd_;  // socket file descriptor
-    const std::string host_name_;
-    const std::string port_num_;
-
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
-  };  // class SocketWriter
-
-  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
-  static std::string UrlEncode(const char* str);
-
-  StreamingListener(const std::string& host, const std::string& port)
-      : socket_writer_(new SocketWriter(host, port)) {
-    Start();
-  }
-
-  explicit StreamingListener(AbstractSocketWriter* socket_writer)
-      : socket_writer_(socket_writer) { Start(); }
-
-  void OnTestProgramStart(const UnitTest& /* unit_test */) override {
-    SendLn("event=TestProgramStart");
-  }
-
-  void OnTestProgramEnd(const UnitTest& unit_test) override {
-    // Note that Google Test current only report elapsed time for each
-    // test iteration, not for the entire test program.
-    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
-
-    // Notify the streaming server to stop.
-    socket_writer_->CloseConnection();
-  }
-
-  void OnTestIterationStart(const UnitTest& /* unit_test */,
-                            int iteration) override {
-    SendLn("event=TestIterationStart&iteration=" +
-           StreamableToString(iteration));
-  }
-
-  void OnTestIterationEnd(const UnitTest& unit_test,
-                          int /* iteration */) override {
-    SendLn("event=TestIterationEnd&passed=" +
-           FormatBool(unit_test.Passed()) + "&elapsed_time=" +
-           StreamableToString(unit_test.elapsed_time()) + "ms");
-  }
-
-  // Note that "event=TestCaseStart" is a wire format and has to remain
-  // "case" for compatibility
-  void OnTestCaseStart(const TestCase& test_case) override {
-    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
-  }
-
-  // Note that "event=TestCaseEnd" is a wire format and has to remain
-  // "case" for compatibility
-  void OnTestCaseEnd(const TestCase& test_case) override {
-    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
-           "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
-           "ms");
-  }
-
-  void OnTestStart(const TestInfo& test_info) override {
-    SendLn(std::string("event=TestStart&name=") + test_info.name());
-  }
-
-  void OnTestEnd(const TestInfo& test_info) override {
-    SendLn("event=TestEnd&passed=" +
-           FormatBool((test_info.result())->Passed()) +
-           "&elapsed_time=" +
-           StreamableToString((test_info.result())->elapsed_time()) + "ms");
-  }
-
-  void OnTestPartResult(const TestPartResult& test_part_result) override {
-    const char* file_name = test_part_result.file_name();
-    if (file_name == nullptr) file_name = "";
-    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
-           "&line=" + StreamableToString(test_part_result.line_number()) +
-           "&message=" + UrlEncode(test_part_result.message()));
-  }
-
- private:
-  // Sends the given message and a newline to the socket.
-  void SendLn(const std::string& message) { socket_writer_->SendLn(message); }
-
-  // Called at the start of streaming to notify the receiver what
-  // protocol we are using.
-  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
-
-  std::string FormatBool(bool value) { return value ? "1" : "0"; }
-
-  const std::unique_ptr<AbstractSocketWriter> socket_writer_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
-};  // class StreamingListener
-
-#endif  // GTEST_CAN_STREAM_RESULTS_
-
-}  // namespace internal
-}  // namespace testing
-
-GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
-
-#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/deps/googletest/src/gtest-matchers.cc b/deps/googletest/src/gtest-matchers.cc
deleted file mode 100644
index 7d2fb6851..000000000
--- a/deps/googletest/src/gtest-matchers.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// The Google C++ Testing and Mocking Framework (Google Test)
-//
-// This file implements just enough of the matcher interface to allow
-// EXPECT_DEATH and friends to accept a matcher argument.
-
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
-#include "gtest/gtest-matchers.h"
-
-#include <string>
-
-namespace testing {
-
-// Constructs a matcher that matches a const std::string& whose value is
-// equal to s.
-Matcher<const std::string&>::Matcher(const std::string& s) { *this = Eq(s); }
-
-// Constructs a matcher that matches a const std::string& whose value is
-// equal to s.
-Matcher<const std::string&>::Matcher(const char* s) {
-  *this = Eq(std::string(s));
-}
-
-// Constructs a matcher that matches a std::string whose value is equal to
-// s.
-Matcher<std::string>::Matcher(const std::string& s) { *this = Eq(s); }
-
-// Constructs a matcher that matches a std::string whose value is equal to
-// s.
-Matcher<std::string>::Matcher(const char* s) { *this = Eq(std::string(s)); }
-
-#if GTEST_HAS_ABSL
-// Constructs a matcher that matches a const absl::string_view& whose value is
-// equal to s.
-Matcher<const absl::string_view&>::Matcher(const std::string& s) {
-  *this = Eq(s);
-}
-
-// Constructs a matcher that matches a const absl::string_view& whose value is
-// equal to s.
-Matcher<const absl::string_view&>::Matcher(const char* s) {
-  *this = Eq(std::string(s));
-}
-
-// Constructs a matcher that matches a const absl::string_view& whose value is
-// equal to s.
-Matcher<const absl::string_view&>::Matcher(absl::string_view s) {
-  *this = Eq(std::string(s));
-}
-
-// Constructs a matcher that matches a absl::string_view whose value is equal to
-// s.
-Matcher<absl::string_view>::Matcher(const std::string& s) { *this = Eq(s); }
-
-// Constructs a matcher that matches a absl::string_view whose value is equal to
-// s.
-Matcher<absl::string_view>::Matcher(const char* s) {
-  *this = Eq(std::string(s));
-}
-
-// Constructs a matcher that matches a absl::string_view whose value is equal to
-// s.
-Matcher<absl::string_view>::Matcher(absl::string_view s) {
-  *this = Eq(std::string(s));
-}
-#endif  // GTEST_HAS_ABSL
-
-}  // namespace testing
diff --git a/deps/googletest/src/gtest-port.cc b/deps/googletest/src/gtest-port.cc
deleted file mode 100644
index 2cba2693e..000000000
--- a/deps/googletest/src/gtest-port.cc
+++ /dev/null
@@ -1,1404 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-#include "gtest/internal/gtest-port.h"
-
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fstream>
-#include <memory>
-
-#if GTEST_OS_WINDOWS
-# include <windows.h>
-# include <io.h>
-# include <sys/stat.h>
-# include <map>  // Used in ThreadLocal.
-# ifdef _MSC_VER
-#  include <crtdbg.h>
-# endif  // _MSC_VER
-#else
-# include <unistd.h>
-#endif  // GTEST_OS_WINDOWS
-
-#if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
-#endif  // GTEST_OS_MAC
-
-#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
-    GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# include <sys/sysctl.h>
-# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
-#  include <sys/user.h>
-# endif
-#endif
-
-#if GTEST_OS_QNX
-# include <devctl.h>
-# include <fcntl.h>
-# include <sys/procfs.h>
-#endif  // GTEST_OS_QNX
-
-#if GTEST_OS_AIX
-# include <procinfo.h>
-# include <sys/types.h>
-#endif  // GTEST_OS_AIX
-
-#if GTEST_OS_FUCHSIA
-# include <zircon/process.h>
-# include <zircon/syscalls.h>
-#endif  // GTEST_OS_FUCHSIA
-
-#include "gtest/gtest-spi.h"
-#include "gtest/gtest-message.h"
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
-#include "src/gtest-internal-inl.h"
-
-namespace testing {
-namespace internal {
-
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
-const int kStdOutFileno = 1;
-const int kStdErrFileno = 2;
-#else
-const int kStdOutFileno = STDOUT_FILENO;
-const int kStdErrFileno = STDERR_FILENO;
-#endif  // _MSC_VER
-
-#if GTEST_OS_LINUX
-
-namespace {
-template <typename T>
-T ReadProcFileField(const std::string& filename, int field) {
-  std::string dummy;
-  std::ifstream file(filename.c_str());
-  while (field-- > 0) {
-    file >> dummy;
-  }
-  T output = 0;
-  file >> output;
-  return output;
-}
-}  // namespace
-
-// Returns the number of active threads, or 0 when there is an error.
-size_t GetThreadCount() {
-  const std::string filename =
-      (Message() << "/proc/" << getpid() << "/stat").GetString();
-  return ReadProcFileField<size_t>(filename, 19);
-}
-
-#elif GTEST_OS_MAC
-
-size_t GetThreadCount() {
-  const task_t task = mach_task_self();
-  mach_msg_type_number_t thread_count;
-  thread_act_array_t thread_list;
-  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
-  if (status == KERN_SUCCESS) {
-    // task_threads allocates resources in thread_list and we need to free them
-    // to avoid leaks.
-    vm_deallocate(task,
-                  reinterpret_cast<vm_address_t>(thread_list),
-                  sizeof(thread_t) * thread_count);
-    return static_cast<size_t>(thread_count);
-  } else {
-    return 0;
-  }
-}
-
-#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
-      GTEST_OS_NETBSD
-
-#if GTEST_OS_NETBSD
-#undef KERN_PROC
-#define KERN_PROC KERN_PROC2
-#define kinfo_proc kinfo_proc2
-#endif
-
-#if GTEST_OS_DRAGONFLY
-#define KP_NLWP(kp) (kp.kp_nthreads)
-#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
-#define KP_NLWP(kp) (kp.ki_numthreads)
-#elif GTEST_OS_NETBSD
-#define KP_NLWP(kp) (kp.p_nlwps)
-#endif
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-size_t GetThreadCount() {
-  int mib[] = {
-    CTL_KERN,
-    KERN_PROC,
-    KERN_PROC_PID,
-    getpid(),
-#if GTEST_OS_NETBSD
-    sizeof(struct kinfo_proc),
-    1,
-#endif
-  };
-  u_int miblen = sizeof(mib) / sizeof(mib[0]);
-  struct kinfo_proc info;
-  size_t size = sizeof(info);
-  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
-    return 0;
-  }
-  return static_cast<size_t>(KP_NLWP(info));
-}
-#elif GTEST_OS_OPENBSD
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-size_t GetThreadCount() {
-  int mib[] = {
-    CTL_KERN,
-    KERN_PROC,
-    KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
-    getpid(),
-    sizeof(struct kinfo_proc),
-    0,
-  };
-  u_int miblen = sizeof(mib) / sizeof(mib[0]);
-
-  // get number of structs
-  size_t size;
-  if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
-    return 0;
-  }
-  mib[5] = size / mib[4];
-
-  // populate array of structs
-  struct kinfo_proc info[mib[5]];
-  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
-    return 0;
-  }
-
-  // exclude empty members
-  int nthreads = 0;
-  for (int i = 0; i < size / mib[4]; i++) {
-    if (info[i].p_tid != -1)
-      nthreads++;
-  }
-  return nthreads;
-}
-
-#elif GTEST_OS_QNX
-
-// Returns the number of threads running in the process, or 0 to indicate that
-// we cannot detect it.
-size_t GetThreadCount() {
-  const int fd = open("/proc/self/as", O_RDONLY);
-  if (fd < 0) {
-    return 0;
-  }
-  procfs_info process_info;
-  const int status =
-      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr);
-  close(fd);
-  if (status == EOK) {
-    return static_cast<size_t>(process_info.num_threads);
-  } else {
-    return 0;
-  }
-}
-
-#elif GTEST_OS_AIX
-
-size_t GetThreadCount() {
-  struct procentry64 entry;
-  pid_t pid = getpid();
-  int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1);
-  if (status == 1) {
-    return entry.pi_thcount;
-  } else {
-    return 0;
-  }
-}
-
-#elif GTEST_OS_FUCHSIA
-
-size_t GetThreadCount() {
-  int dummy_buffer;
-  size_t avail;
-  zx_status_t status = zx_object_get_info(
-      zx_process_self(),
-      ZX_INFO_PROCESS_THREADS,
-      &dummy_buffer,
-      0,
-      nullptr,
-      &avail);
-  if (status == ZX_OK) {
-    return avail;
-  } else {
-    return 0;
-  }
-}
-
-#else
-
-size_t GetThreadCount() {
-  // There's no portable way to detect the number of threads, so we just
-  // return 0 to indicate that we cannot detect it.
-  return 0;
-}
-
-#endif  // GTEST_OS_LINUX
-
-#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
-
-void SleepMilliseconds(int n) {
-  ::Sleep(n);
-}
-
-AutoHandle::AutoHandle()
-    : handle_(INVALID_HANDLE_VALUE) {}
-
-AutoHandle::AutoHandle(Handle handle)
-    : handle_(handle) {}
-
-AutoHandle::~AutoHandle() {
-  Reset();
-}
-
-AutoHandle::Handle AutoHandle::Get() const {
-  return handle_;
-}
-
-void AutoHandle::Reset() {
-  Reset(INVALID_HANDLE_VALUE);
-}
-
-void AutoHandle::Reset(HANDLE handle) {
-  // Resetting with the same handle we already own is invalid.
-  if (handle_ != handle) {
-    if (IsCloseable()) {
-      ::CloseHandle(handle_);
-    }
-    handle_ = handle;
-  } else {
-    GTEST_CHECK_(!IsCloseable())
-        << "Resetting a valid handle to itself is likely a programmer error "
-            "and thus not allowed.";
-  }
-}
-
-bool AutoHandle::IsCloseable() const {
-  // Different Windows APIs may use either of these values to represent an
-  // invalid handle.
-  return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
-}
-
-Notification::Notification()
-    : event_(::CreateEvent(nullptr,     // Default security attributes.
-                           TRUE,        // Do not reset automatically.
-                           FALSE,       // Initially unset.
-                           nullptr)) {  // Anonymous event.
-  GTEST_CHECK_(event_.Get() != nullptr);
-}
-
-void Notification::Notify() {
-  GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
-}
-
-void Notification::WaitForNotification() {
-  GTEST_CHECK_(
-      ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
-}
-
-Mutex::Mutex()
-    : owner_thread_id_(0),
-      type_(kDynamic),
-      critical_section_init_phase_(0),
-      critical_section_(new CRITICAL_SECTION) {
-  ::InitializeCriticalSection(critical_section_);
-}
-
-Mutex::~Mutex() {
-  // Static mutexes are leaked intentionally. It is not thread-safe to try
-  // to clean them up.
-  if (type_ == kDynamic) {
-    ::DeleteCriticalSection(critical_section_);
-    delete critical_section_;
-    critical_section_ = nullptr;
-  }
-}
-
-void Mutex::Lock() {
-  ThreadSafeLazyInit();
-  ::EnterCriticalSection(critical_section_);
-  owner_thread_id_ = ::GetCurrentThreadId();
-}
-
-void Mutex::Unlock() {
-  ThreadSafeLazyInit();
-  // We don't protect writing to owner_thread_id_ here, as it's the
-  // caller's responsibility to ensure that the current thread holds the
-  // mutex when this is called.
-  owner_thread_id_ = 0;
-  ::LeaveCriticalSection(critical_section_);
-}
-
-// Does nothing if the current thread holds the mutex. Otherwise, crashes
-// with high probability.
-void Mutex::AssertHeld() {
-  ThreadSafeLazyInit();
-  GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId())
-      << "The current thread is not holding the mutex @" << this;
-}
-
-namespace {
-
-#ifdef _MSC_VER
-// Use the RAII idiom to flag mem allocs that are intentionally never
-// deallocated. The motivation is to silence the false positive mem leaks
-// that are reported by the debug version of MS's CRT which can only detect
-// if an alloc is missing a matching deallocation.
-// Example:
-//    MemoryIsNotDeallocated memory_is_not_deallocated;
-//    critical_section_ = new CRITICAL_SECTION;
-//
-class MemoryIsNotDeallocated
-{
- public:
-  MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
-    old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
-    // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
-    // doesn't report mem leak if there's no matching deallocation.
-    _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
-  }
-
-  ~MemoryIsNotDeallocated() {
-    // Restore the original _CRTDBG_ALLOC_MEM_DF flag
-    _CrtSetDbgFlag(old_crtdbg_flag_);
-  }
-
- private:
-  int old_crtdbg_flag_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated);
-};
-#endif  // _MSC_VER
-
-}  // namespace
-
-// Initializes owner_thread_id_ and critical_section_ in static mutexes.
-void Mutex::ThreadSafeLazyInit() {
-  // Dynamic mutexes are initialized in the constructor.
-  if (type_ == kStatic) {
-    switch (
-        ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) {
-      case 0:
-        // If critical_section_init_phase_ was 0 before the exchange, we
-        // are the first to test it and need to perform the initialization.
-        owner_thread_id_ = 0;
-        {
-          // Use RAII to flag that following mem alloc is never deallocated.
-#ifdef _MSC_VER
-          MemoryIsNotDeallocated memory_is_not_deallocated;
-#endif  // _MSC_VER
-          critical_section_ = new CRITICAL_SECTION;
-        }
-        ::InitializeCriticalSection(critical_section_);
-        // Updates the critical_section_init_phase_ to 2 to signal
-        // initialization complete.
-        GTEST_CHECK_(::InterlockedCompareExchange(
-                          &critical_section_init_phase_, 2L, 1L) ==
-                      1L);
-        break;
-      case 1:
-        // Somebody else is already initializing the mutex; spin until they
-        // are done.
-        while (::InterlockedCompareExchange(&critical_section_init_phase_,
-                                            2L,
-                                            2L) != 2L) {
-          // Possibly yields the rest of the thread's time slice to other
-          // threads.
-          ::Sleep(0);
-        }
-        break;
-
-      case 2:
-        break;  // The mutex is already initialized and ready for use.
-
-      default:
-        GTEST_CHECK_(false)
-            << "Unexpected value of critical_section_init_phase_ "
-            << "while initializing a static mutex.";
-    }
-  }
-}
-
-namespace {
-
-class ThreadWithParamSupport : public ThreadWithParamBase {
- public:
-  static HANDLE CreateThread(Runnable* runnable,
-                             Notification* thread_can_start) {
-    ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
-    DWORD thread_id;
-    HANDLE thread_handle = ::CreateThread(
-        nullptr,  // Default security.
-        0,        // Default stack size.
-        &ThreadWithParamSupport::ThreadMain,
-        param,        // Parameter to ThreadMainStatic
-        0x0,          // Default creation flags.
-        &thread_id);  // Need a valid pointer for the call to work under Win98.
-    GTEST_CHECK_(thread_handle != nullptr)
-        << "CreateThread failed with error " << ::GetLastError() << ".";
-    if (thread_handle == nullptr) {
-      delete param;
-    }
-    return thread_handle;
-  }
-
- private:
-  struct ThreadMainParam {
-    ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
-        : runnable_(runnable),
-          thread_can_start_(thread_can_start) {
-    }
-    std::unique_ptr<Runnable> runnable_;
-    // Does not own.
-    Notification* thread_can_start_;
-  };
-
-  static DWORD WINAPI ThreadMain(void* ptr) {
-    // Transfers ownership.
-    std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
-    if (param->thread_can_start_ != nullptr)
-      param->thread_can_start_->WaitForNotification();
-    param->runnable_->Run();
-    return 0;
-  }
-
-  // Prohibit instantiation.
-  ThreadWithParamSupport();
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
-};
-
-}  // namespace
-
-ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
-                                         Notification* thread_can_start)
-      : thread_(ThreadWithParamSupport::CreateThread(runnable,
-                                                     thread_can_start)) {
-}
-
-ThreadWithParamBase::~ThreadWithParamBase() {
-  Join();
-}
-
-void ThreadWithParamBase::Join() {
-  GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
-      << "Failed to join the thread with error " << ::GetLastError() << ".";
-}
-
-// Maps a thread to a set of ThreadIdToThreadLocals that have values
-// instantiated on that thread and notifies them when the thread exits.  A
-// ThreadLocal instance is expected to persist until all threads it has
-// values on have terminated.
-class ThreadLocalRegistryImpl {
- public:
-  // Registers thread_local_instance as having value on the current thread.
-  // Returns a value that can be used to identify the thread from other threads.
-  static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
-      const ThreadLocalBase* thread_local_instance) {
-    DWORD current_thread = ::GetCurrentThreadId();
-    MutexLock lock(&mutex_);
-    ThreadIdToThreadLocals* const thread_to_thread_locals =
-        GetThreadLocalsMapLocked();
-    ThreadIdToThreadLocals::iterator thread_local_pos =
-        thread_to_thread_locals->find(current_thread);
-    if (thread_local_pos == thread_to_thread_locals->end()) {
-      thread_local_pos = thread_to_thread_locals->insert(
-          std::make_pair(current_thread, ThreadLocalValues())).first;
-      StartWatcherThreadFor(current_thread);
-    }
-    ThreadLocalValues& thread_local_values = thread_local_pos->second;
-    ThreadLocalValues::iterator value_pos =
-        thread_local_values.find(thread_local_instance);
-    if (value_pos == thread_local_values.end()) {
-      value_pos =
-          thread_local_values
-              .insert(std::make_pair(
-                  thread_local_instance,
-                  std::shared_ptr<ThreadLocalValueHolderBase>(
-                      thread_local_instance->NewValueForCurrentThread())))
-              .first;
-    }
-    return value_pos->second.get();
-  }
-
-  static void OnThreadLocalDestroyed(
-      const ThreadLocalBase* thread_local_instance) {
-    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
-    // Clean up the ThreadLocalValues data structure while holding the lock, but
-    // defer the destruction of the ThreadLocalValueHolderBases.
-    {
-      MutexLock lock(&mutex_);
-      ThreadIdToThreadLocals* const thread_to_thread_locals =
-          GetThreadLocalsMapLocked();
-      for (ThreadIdToThreadLocals::iterator it =
-          thread_to_thread_locals->begin();
-          it != thread_to_thread_locals->end();
-          ++it) {
-        ThreadLocalValues& thread_local_values = it->second;
-        ThreadLocalValues::iterator value_pos =
-            thread_local_values.find(thread_local_instance);
-        if (value_pos != thread_local_values.end()) {
-          value_holders.push_back(value_pos->second);
-          thread_local_values.erase(value_pos);
-          // This 'if' can only be successful at most once, so theoretically we
-          // could break out of the loop here, but we don't bother doing so.
-        }
-      }
-    }
-    // Outside the lock, let the destructor for 'value_holders' deallocate the
-    // ThreadLocalValueHolderBases.
-  }
-
-  static void OnThreadExit(DWORD thread_id) {
-    GTEST_CHECK_(thread_id != 0) << ::GetLastError();
-    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
-    // Clean up the ThreadIdToThreadLocals data structure while holding the
-    // lock, but defer the destruction of the ThreadLocalValueHolderBases.
-    {
-      MutexLock lock(&mutex_);
-      ThreadIdToThreadLocals* const thread_to_thread_locals =
-          GetThreadLocalsMapLocked();
-      ThreadIdToThreadLocals::iterator thread_local_pos =
-          thread_to_thread_locals->find(thread_id);
-      if (thread_local_pos != thread_to_thread_locals->end()) {
-        ThreadLocalValues& thread_local_values = thread_local_pos->second;
-        for (ThreadLocalValues::iterator value_pos =
-            thread_local_values.begin();
-            value_pos != thread_local_values.end();
-            ++value_pos) {
-          value_holders.push_back(value_pos->second);
-        }
-        thread_to_thread_locals->erase(thread_local_pos);
-      }
-    }
-    // Outside the lock, let the destructor for 'value_holders' deallocate the
-    // ThreadLocalValueHolderBases.
-  }
-
- private:
-  // In a particular thread, maps a ThreadLocal object to its value.
-  typedef std::map<const ThreadLocalBase*,
-                   std::shared_ptr<ThreadLocalValueHolderBase> >
-      ThreadLocalValues;
-  // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
-  // thread's ID.
-  typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
-
-  // Holds the thread id and thread handle that we pass from
-  // StartWatcherThreadFor to WatcherThreadFunc.
-  typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
-
-  static void StartWatcherThreadFor(DWORD thread_id) {
-    // The returned handle will be kept in thread_map and closed by
-    // watcher_thread in WatcherThreadFunc.
-    HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
-                                 FALSE,
-                                 thread_id);
-    GTEST_CHECK_(thread != nullptr);
-    // We need to pass a valid thread ID pointer into CreateThread for it
-    // to work correctly under Win98.
-    DWORD watcher_thread_id;
-    HANDLE watcher_thread = ::CreateThread(
-        nullptr,  // Default security.
-        0,        // Default stack size
-        &ThreadLocalRegistryImpl::WatcherThreadFunc,
-        reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
-        CREATE_SUSPENDED, &watcher_thread_id);
-    GTEST_CHECK_(watcher_thread != nullptr);
-    // Give the watcher thread the same priority as ours to avoid being
-    // blocked by it.
-    ::SetThreadPriority(watcher_thread,
-                        ::GetThreadPriority(::GetCurrentThread()));
-    ::ResumeThread(watcher_thread);
-    ::CloseHandle(watcher_thread);
-  }
-
-  // Monitors exit from a given thread and notifies those
-  // ThreadIdToThreadLocals about thread termination.
-  static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
-    const ThreadIdAndHandle* tah =
-        reinterpret_cast<const ThreadIdAndHandle*>(param);
-    GTEST_CHECK_(
-        ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
-    OnThreadExit(tah->first);
-    ::CloseHandle(tah->second);
-    delete tah;
-    return 0;
-  }
-
-  // Returns map of thread local instances.
-  static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
-    mutex_.AssertHeld();
-#ifdef _MSC_VER
-    MemoryIsNotDeallocated memory_is_not_deallocated;
-#endif  // _MSC_VER
-    static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals();
-    return map;
-  }
-
-  // Protects access to GetThreadLocalsMapLocked() and its return value.
-  static Mutex mutex_;
-  // Protects access to GetThreadMapLocked() and its return value.
-  static Mutex thread_map_mutex_;
-};
-
-Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
-
-ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
-      const ThreadLocalBase* thread_local_instance) {
-  return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
-      thread_local_instance);
-}
-
-void ThreadLocalRegistry::OnThreadLocalDestroyed(
-      const ThreadLocalBase* thread_local_instance) {
-  ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
-}
-
-#endif  // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
-
-#if GTEST_USES_POSIX_RE
-
-// Implements RE.  Currently only needed for death tests.
-
-RE::~RE() {
-  if (is_valid_) {
-    // regfree'ing an invalid regex might crash because the content
-    // of the regex is undefined. Since the regex's are essentially
-    // the same, one cannot be valid (or invalid) without the other
-    // being so too.
-    regfree(&partial_regex_);
-    regfree(&full_regex_);
-  }
-  free(const_cast<char*>(pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  if (!re.is_valid_) return false;
-
-  regmatch_t match;
-  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  if (!re.is_valid_) return false;
-
-  regmatch_t match;
-  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = posix::StrDup(regex);
-
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match.
-  const size_t full_regex_len = strlen(regex) + 10;
-  char* const full_pattern = new char[full_regex_len];
-
-  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
-  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
-  // We want to call regcomp(&partial_regex_, ...) even if the
-  // previous expression returns false.  Otherwise partial_regex_ may
-  // not be properly initialized can may cause trouble when it's
-  // freed.
-  //
-  // Some implementation of POSIX regex (e.g. on at least some
-  // versions of Cygwin) doesn't accept the empty string as a valid
-  // regex.  We change it to an equivalent form "()" to be safe.
-  if (is_valid_) {
-    const char* const partial_regex = (*regex == '\0') ? "()" : regex;
-    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
-  }
-  EXPECT_TRUE(is_valid_)
-      << "Regular expression \"" << regex
-      << "\" is not a valid POSIX Extended regular expression.";
-
-  delete[] full_pattern;
-}
-
-#elif GTEST_USES_SIMPLE_RE
-
-// Returns true iff ch appears anywhere in str (excluding the
-// terminating '\0' character).
-bool IsInSet(char ch, const char* str) {
-  return ch != '\0' && strchr(str, ch) != nullptr;
-}
-
-// Returns true iff ch belongs to the given classification.  Unlike
-// similar functions in <ctype.h>, these aren't affected by the
-// current locale.
-bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
-bool IsAsciiPunct(char ch) {
-  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
-}
-bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
-bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
-bool IsAsciiWordChar(char ch) {
-  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
-      ('0' <= ch && ch <= '9') || ch == '_';
-}
-
-// Returns true iff "\\c" is a supported escape sequence.
-bool IsValidEscape(char c) {
-  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
-}
-
-// Returns true iff the given atom (specified by escaped and pattern)
-// matches ch.  The result is undefined if the atom is invalid.
-bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
-  if (escaped) {  // "\\p" where p is pattern_char.
-    switch (pattern_char) {
-      case 'd': return IsAsciiDigit(ch);
-      case 'D': return !IsAsciiDigit(ch);
-      case 'f': return ch == '\f';
-      case 'n': return ch == '\n';
-      case 'r': return ch == '\r';
-      case 's': return IsAsciiWhiteSpace(ch);
-      case 'S': return !IsAsciiWhiteSpace(ch);
-      case 't': return ch == '\t';
-      case 'v': return ch == '\v';
-      case 'w': return IsAsciiWordChar(ch);
-      case 'W': return !IsAsciiWordChar(ch);
-    }
-    return IsAsciiPunct(pattern_char) && pattern_char == ch;
-  }
-
-  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
-}
-
-// Helper function used by ValidateRegex() to format error messages.
-static std::string FormatRegexSyntaxError(const char* regex, int index) {
-  return (Message() << "Syntax error at index " << index
-          << " in simple regular expression \"" << regex << "\": ").GetString();
-}
-
-// Generates non-fatal failures and returns false if regex is invalid;
-// otherwise returns true.
-bool ValidateRegex(const char* regex) {
-  if (regex == nullptr) {
-    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
-    return false;
-  }
-
-  bool is_valid = true;
-
-  // True iff ?, *, or + can follow the previous atom.
-  bool prev_repeatable = false;
-  for (int i = 0; regex[i]; i++) {
-    if (regex[i] == '\\') {  // An escape sequence
-      i++;
-      if (regex[i] == '\0') {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
-                      << "'\\' cannot appear at the end.";
-        return false;
-      }
-
-      if (!IsValidEscape(regex[i])) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
-                      << "invalid escape sequence \"\\" << regex[i] << "\".";
-        is_valid = false;
-      }
-      prev_repeatable = true;
-    } else {  // Not an escape sequence.
-      const char ch = regex[i];
-
-      if (ch == '^' && i > 0) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'^' can only appear at the beginning.";
-        is_valid = false;
-      } else if (ch == '$' && regex[i + 1] != '\0') {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'$' can only appear at the end.";
-        is_valid = false;
-      } else if (IsInSet(ch, "()[]{}|")) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' is unsupported.";
-        is_valid = false;
-      } else if (IsRepeat(ch) && !prev_repeatable) {
-        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
-                      << "'" << ch << "' can only follow a repeatable token.";
-        is_valid = false;
-      }
-
-      prev_repeatable = !IsInSet(ch, "^$?*+");
-    }
-  }
-
-  return is_valid;
-}
-
-// Matches a repeated regex atom followed by a valid simple regular
-// expression.  The regex atom is defined as c if escaped is false,
-// or \c otherwise.  repeat is the repetition meta character (?, *,
-// or +).  The behavior is undefined if str contains too many
-// characters to be indexable by size_t, in which case the test will
-// probably time out anyway.  We are fine with this limitation as
-// std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
-    bool escaped, char c, char repeat, const char* regex,
-    const char* str) {
-  const size_t min_count = (repeat == '+') ? 1 : 0;
-  const size_t max_count = (repeat == '?') ? 1 :
-      static_cast<size_t>(-1) - 1;
-  // We cannot call numeric_limits::max() as it conflicts with the
-  // max() macro on Windows.
-
-  for (size_t i = 0; i <= max_count; ++i) {
-    // We know that the atom matches each of the first i characters in str.
-    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
-      // We have enough matches at the head, and the tail matches too.
-      // Since we only care about *whether* the pattern matches str
-      // (as opposed to *how* it matches), there is no need to find a
-      // greedy match.
-      return true;
-    }
-    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
-      return false;
-  }
-  return false;
-}
-
-// Returns true iff regex matches a prefix of str.  regex must be a
-// valid simple regular expression and not start with "^", or the
-// result is undefined.
-bool MatchRegexAtHead(const char* regex, const char* str) {
-  if (*regex == '\0')  // An empty regex matches a prefix of anything.
-    return true;
-
-  // "$" only matches the end of a string.  Note that regex being
-  // valid guarantees that there's nothing after "$" in it.
-  if (*regex == '$')
-    return *str == '\0';
-
-  // Is the first thing in regex an escape sequence?
-  const bool escaped = *regex == '\\';
-  if (escaped)
-    ++regex;
-  if (IsRepeat(regex[1])) {
-    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
-    // here's an indirect recursion.  It terminates as the regex gets
-    // shorter in each recursion.
-    return MatchRepetitionAndRegexAtHead(
-        escaped, regex[0], regex[1], regex + 2, str);
-  } else {
-    // regex isn't empty, isn't "$", and doesn't start with a
-    // repetition.  We match the first atom of regex with the first
-    // character of str and recurse.
-    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
-        MatchRegexAtHead(regex + 1, str + 1);
-  }
-}
-
-// Returns true iff regex matches any substring of str.  regex must be
-// a valid simple regular expression, or the result is undefined.
-//
-// The algorithm is recursive, but the recursion depth doesn't exceed
-// the regex length, so we won't need to worry about running out of
-// stack space normally.  In rare cases the time complexity can be
-// exponential with respect to the regex length + the string length,
-// but usually it's must faster (often close to linear).
-bool MatchRegexAnywhere(const char* regex, const char* str) {
-  if (regex == nullptr || str == nullptr) return false;
-
-  if (*regex == '^')
-    return MatchRegexAtHead(regex + 1, str);
-
-  // A successful match can be anywhere in str.
-  do {
-    if (MatchRegexAtHead(regex, str))
-      return true;
-  } while (*str++ != '\0');
-  return false;
-}
-
-// Implements the RE class.
-
-RE::~RE() {
-  free(const_cast<char*>(pattern_));
-  free(const_cast<char*>(full_pattern_));
-}
-
-// Returns true iff regular expression re matches the entire str.
-bool RE::FullMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
-}
-
-// Returns true iff regular expression re matches a substring of str
-// (including str itself).
-bool RE::PartialMatch(const char* str, const RE& re) {
-  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
-}
-
-// Initializes an RE from its string representation.
-void RE::Init(const char* regex) {
-  pattern_ = full_pattern_ = nullptr;
-  if (regex != nullptr) {
-    pattern_ = posix::StrDup(regex);
-  }
-
-  is_valid_ = ValidateRegex(regex);
-  if (!is_valid_) {
-    // No need to calculate the full pattern when the regex is invalid.
-    return;
-  }
-
-  const size_t len = strlen(regex);
-  // Reserves enough bytes to hold the regular expression used for a
-  // full match: we need space to prepend a '^', append a '$', and
-  // terminate the string with '\0'.
-  char* buffer = static_cast<char*>(malloc(len + 3));
-  full_pattern_ = buffer;
-
-  if (*regex != '^')
-    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
-
-  // We don't use snprintf or strncpy, as they trigger a warning when
-  // compiled with VC++ 8.0.
-  memcpy(buffer, regex, len);
-  buffer += len;
-
-  if (len == 0 || regex[len - 1] != '$')
-    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
-
-  *buffer = '\0';
-}
-
-#endif  // GTEST_USES_POSIX_RE
-
-const char kUnknownFile[] = "unknown file";
-
-// Formats a source file path and a line number as they would appear
-// in an error message from the compiler used to compile this code.
-GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
-  const std::string file_name(file == nullptr ? kUnknownFile : file);
-
-  if (line < 0) {
-    return file_name + ":";
-  }
-#ifdef _MSC_VER
-  return file_name + "(" + StreamableToString(line) + "):";
-#else
-  return file_name + ":" + StreamableToString(line) + ":";
-#endif  // _MSC_VER
-}
-
-// Formats a file location for compiler-independent XML output.
-// Although this function is not platform dependent, we put it next to
-// FormatFileLocation in order to contrast the two functions.
-// Note that FormatCompilerIndependentFileLocation() does NOT append colon
-// to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
-    const char* file, int line) {
-  const std::string file_name(file == nullptr ? kUnknownFile : file);
-
-  if (line < 0)
-    return file_name;
-  else
-    return file_name + ":" + StreamableToString(line);
-}
-
-GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
-    : severity_(severity) {
-  const char* const marker =
-      severity == GTEST_INFO ?    "[  INFO ]" :
-      severity == GTEST_WARNING ? "[WARNING]" :
-      severity == GTEST_ERROR ?   "[ ERROR ]" : "[ FATAL ]";
-  GetStream() << ::std::endl << marker << " "
-              << FormatFileLocation(file, line).c_str() << ": ";
-}
-
-// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
-GTestLog::~GTestLog() {
-  GetStream() << ::std::endl;
-  if (severity_ == GTEST_FATAL) {
-    fflush(stderr);
-    posix::Abort();
-  }
-}
-
-// Disable Microsoft deprecation warnings for POSIX functions called from
-// this class (creat, dup, dup2, and close)
-GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
-
-#if GTEST_HAS_STREAM_REDIRECTION
-
-// Object that captures an output stream (stdout/stderr).
-class CapturedStream {
- public:
-  // The ctor redirects the stream to a temporary file.
-# if GTEST_OS_WINDOWS
-  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(_dup(fd)) {
-    char temp_dir_path[MAX_PATH + 1] = { '\0' };  // NOLINT
-    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
-
-    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
-    const UINT success = ::GetTempFileNameA(temp_dir_path,
-                                            "gtest_redir",
-                                            0,  // Generate unique file name.
-                                            temp_file_path);
-    GTEST_CHECK_(success != 0)
-        << "Unable to create a temporary file in " << temp_dir_path;
-    const int captured_fd = _creat(temp_file_path, _S_IREAD | _S_IWRITE);
-    GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
-                                    << temp_file_path;
-    filename_ = temp_file_path;
-# else
-    // There's no guarantee that a test has write access to the current
-    // directory, so we create the temporary file in the /tmp directory
-    // instead. We use /tmp on most systems, and /sdcard on Android.
-    // That's because Android doesn't have /tmp.
-  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-#  if GTEST_OS_LINUX_ANDROID
-    // Note: Android applications are expected to call the framework's
-    // Context.getExternalStorageDirectory() method through JNI to get
-    // the location of the world-writable SD Card directory. However,
-    // this requires a Context handle, which cannot be retrieved
-    // globally from native code. Doing so also precludes running the
-    // code as part of a regular standalone executable, which doesn't
-    // run in a Dalvik process (e.g. when running it through 'adb shell').
-    //
-    // The location /sdcard is directly accessible from native code
-    // and is the only location (unofficially) supported by the Android
-    // team. It's generally a symlink to the real SD Card mount point
-    // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
-    // other OEM-customized locations. Never rely on these, and always
-    // use /sdcard.
-    char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
-#  else
-    char name_template[] = "/tmp/captured_stream.XXXXXX";
-#  endif  // GTEST_OS_LINUX_ANDROID
-    const int captured_fd = mkstemp(name_template);
-    filename_ = name_template;
-# endif  // GTEST_OS_WINDOWS
-    fflush(nullptr);
-#if GTEST_OS_WINDOWS
-    _dup2(captured_fd, fd_);
-    _close(captured_fd);
-#else
-    dup2(captured_fd, fd_);
-    close(captured_fd);
-#endif
-  }
-
-  ~CapturedStream() {
-    remove(filename_.c_str());
-  }
-
-  std::string GetCapturedString() {
-    if (uncaptured_fd_ != -1) {
-      // Restores the original stream.
-      fflush(nullptr);
-#if GTEST_OS_WINDOWS
-      _dup2(uncaptured_fd_, fd_);
-      _close(uncaptured_fd_);
-#else
-      dup2(uncaptured_fd_, fd_);
-      close(uncaptured_fd_);
-#endif
-      uncaptured_fd_ = -1;
-    }
-
-    FILE* const file = posix::FOpen(filename_.c_str(), "r");
-    const std::string content = ReadEntireFile(file);
-    posix::FClose(file);
-    return content;
-  }
-
- private:
-  const int fd_;  // A stream to capture.
-  int uncaptured_fd_;
-  // Name of the temporary file holding the stderr output.
-  ::std::string filename_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
-};
-
-GTEST_DISABLE_MSC_DEPRECATED_POP_()
-
-static CapturedStream* g_captured_stderr = nullptr;
-static CapturedStream* g_captured_stdout = nullptr;
-
-// Starts capturing an output stream (stdout/stderr).
-static void CaptureStream(int fd, const char* stream_name,
-                          CapturedStream** stream) {
-  if (*stream != nullptr) {
-    GTEST_LOG_(FATAL) << "Only one " << stream_name
-                      << " capturer can exist at a time.";
-  }
-  *stream = new CapturedStream(fd);
-}
-
-// Stops capturing the output stream and returns the captured string.
-static std::string GetCapturedStream(CapturedStream** captured_stream) {
-  const std::string content = (*captured_stream)->GetCapturedString();
-
-  delete *captured_stream;
-  *captured_stream = nullptr;
-
-  return content;
-}
-
-// Starts capturing stdout.
-void CaptureStdout() {
-  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
-}
-
-// Starts capturing stderr.
-void CaptureStderr() {
-  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
-}
-
-// Stops capturing stdout and returns the captured string.
-std::string GetCapturedStdout() {
-  return GetCapturedStream(&g_captured_stdout);
-}
-
-// Stops capturing stderr and returns the captured string.
-std::string GetCapturedStderr() {
-  return GetCapturedStream(&g_captured_stderr);
-}
-
-#endif  // GTEST_HAS_STREAM_REDIRECTION
-
-
-
-
-
-size_t GetFileSize(FILE* file) {
-  fseek(file, 0, SEEK_END);
-  return static_cast<size_t>(ftell(file));
-}
-
-std::string ReadEntireFile(FILE* file) {
-  const size_t file_size = GetFileSize(file);
-  char* const buffer = new char[file_size];
-
-  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
-  size_t bytes_read = 0;       // # of bytes read so far
-
-  fseek(file, 0, SEEK_SET);
-
-  // Keeps reading the file until we cannot read further or the
-  // pre-determined file size is reached.
-  do {
-    bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
-    bytes_read += bytes_last_read;
-  } while (bytes_last_read > 0 && bytes_read < file_size);
-
-  const std::string content(buffer, bytes_read);
-  delete[] buffer;
-
-  return content;
-}
-
-#if GTEST_HAS_DEATH_TEST
-static const std::vector<std::string>* g_injected_test_argvs =
-    nullptr;  // Owned.
-
-std::vector<std::string> GetInjectableArgvs() {
-  if (g_injected_test_argvs != nullptr) {
-    return *g_injected_test_argvs;
-  }
-  return GetArgvs();
-}
-
-void SetInjectableArgvs(const std::vector<std::string>* new_argvs) {
-  if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs;
-  g_injected_test_argvs = new_argvs;
-}
-
-void SetInjectableArgvs(const std::vector<std::string>& new_argvs) {
-  SetInjectableArgvs(
-      new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
-}
-
-void ClearInjectableArgvs() {
-  delete g_injected_test_argvs;
-  g_injected_test_argvs = nullptr;
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-#if GTEST_OS_WINDOWS_MOBILE
-namespace posix {
-void Abort() {
-  DebugBreak();
-  TerminateProcess(GetCurrentProcess(), 1);
-}
-}  // namespace posix
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-// Returns the name of the environment variable corresponding to the
-// given flag.  For example, FlagToEnvVar("foo") will return
-// "GTEST_FOO" in the open-source version.
-static std::string FlagToEnvVar(const char* flag) {
-  const std::string full_flag =
-      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
-
-  Message env_var;
-  for (size_t i = 0; i != full_flag.length(); i++) {
-    env_var << ToUpper(full_flag.c_str()[i]);
-  }
-
-  return env_var.GetString();
-}
-
-// Parses 'str' for a 32-bit signed integer.  If successful, writes
-// the result to *value and returns true; otherwise leaves *value
-// unchanged and returns false.
-bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
-  // Parses the environment variable as a decimal integer.
-  char* end = nullptr;
-  const long long_value = strtol(str, &end, 10);  // NOLINT
-
-  // Has strtol() consumed all characters in the string?
-  if (*end != '\0') {
-    // No - an invalid character was encountered.
-    Message msg;
-    msg << "WARNING: " << src_text
-        << " is expected to be a 32-bit integer, but actually"
-        << " has value \"" << str << "\".\n";
-    printf("%s", msg.GetString().c_str());
-    fflush(stdout);
-    return false;
-  }
-
-  // Is the parsed value in the range of an Int32?
-  const Int32 result = static_cast<Int32>(long_value);
-  if (long_value == LONG_MAX || long_value == LONG_MIN ||
-      // The parsed value overflows as a long.  (strtol() returns
-      // LONG_MAX or LONG_MIN when the input overflows.)
-      result != long_value
-      // The parsed value overflows as an Int32.
-      ) {
-    Message msg;
-    msg << "WARNING: " << src_text
-        << " is expected to be a 32-bit integer, but actually"
-        << " has value " << str << ", which overflows.\n";
-    printf("%s", msg.GetString().c_str());
-    fflush(stdout);
-    return false;
-  }
-
-  *value = result;
-  return true;
-}
-
-// Reads and returns the Boolean environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-//
-// The value is considered true iff it's not "0".
-bool BoolFromGTestEnv(const char* flag, bool default_value) {
-#if defined(GTEST_GET_BOOL_FROM_ENV_)
-  return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
-#else
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  return string_value == nullptr ? default_value
-                                 : strcmp(string_value, "0") != 0;
-#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
-}
-
-// Reads and returns a 32-bit integer stored in the environment
-// variable corresponding to the given flag; if it isn't set or
-// doesn't represent a valid 32-bit integer, returns default_value.
-Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
-#if defined(GTEST_GET_INT32_FROM_ENV_)
-  return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
-#else
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const string_value = posix::GetEnv(env_var.c_str());
-  if (string_value == nullptr) {
-    // The environment variable is not set.
-    return default_value;
-  }
-
-  Int32 result = default_value;
-  if (!ParseInt32(Message() << "Environment variable " << env_var,
-                  string_value, &result)) {
-    printf("The default value %s is used.\n",
-           (Message() << default_value).GetString().c_str());
-    fflush(stdout);
-    return default_value;
-  }
-
-  return result;
-#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
-}
-
-// As a special case for the 'output' flag, if GTEST_OUTPUT is not
-// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
-// system.  The value of XML_OUTPUT_FILE is a filename without the
-// "xml:" prefix of GTEST_OUTPUT.
-// Note that this is meant to be called at the call site so it does
-// not check that the flag is 'output'
-// In essence this checks an env variable called XML_OUTPUT_FILE
-// and if it is set we prepend "xml:" to its value, if it not set we return ""
-std::string OutputFlagAlsoCheckEnvVar(){
-  std::string default_value_for_output_flag = "";
-  const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
-  if (nullptr != xml_output_file_env) {
-    default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
-  }
-  return default_value_for_output_flag;
-}
-
-// Reads and returns the string environment variable corresponding to
-// the given flag; if it's not set, returns default_value.
-const char* StringFromGTestEnv(const char* flag, const char* default_value) {
-#if defined(GTEST_GET_STRING_FROM_ENV_)
-  return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
-#else
-  const std::string env_var = FlagToEnvVar(flag);
-  const char* const value = posix::GetEnv(env_var.c_str());
-  return value == nullptr ? default_value : value;
-#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
-}
-
-}  // namespace internal
-}  // namespace testing
diff --git a/deps/googletest/src/gtest-printers.cc b/deps/googletest/src/gtest-printers.cc
deleted file mode 100644
index 40a8817e1..000000000
--- a/deps/googletest/src/gtest-printers.cc
+++ /dev/null
@@ -1,441 +0,0 @@
-// Copyright 2007, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-// Google Test - The Google C++ Testing and Mocking Framework
-//
-// This file implements a universal value printer that can print a
-// value of any type T:
-//
-//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
-//
-// It uses the << operator when possible, and prints the bytes in the
-// object otherwise.  A user can override its behavior for a class
-// type Foo by defining either operator<<(::std::ostream&, const Foo&)
-// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
-// defines Foo.
-
-#include "gtest/gtest-printers.h"
-#include <stdio.h>
-#include <cctype>
-#include <cwchar>
-#include <ostream>  // NOLINT
-#include <string>
-#include "gtest/internal/gtest-port.h"
-#include "src/gtest-internal-inl.h"
-
-namespace testing {
-
-namespace {
-
-using ::std::ostream;
-
-// Prints a segment of bytes in the given object.
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
-                                size_t count, ostream* os) {
-  char text[5] = "";
-  for (size_t i = 0; i != count; i++) {
-    const size_t j = start + i;
-    if (i != 0) {
-      // Organizes the bytes into groups of 2 for easy parsing by
-      // human.
-      if ((j % 2) == 0)
-        *os << ' ';
-      else
-        *os << '-';
-    }
-    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
-    *os << text;
-  }
-}
-
-// Prints the bytes in the given value to the given ostream.
-void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
-                              ostream* os) {
-  // Tells the user how big the object is.
-  *os << count << "-byte object <";
-
-  const size_t kThreshold = 132;
-  const size_t kChunkSize = 64;
-  // If the object size is bigger than kThreshold, we'll have to omit
-  // some details by printing only the first and the last kChunkSize
-  // bytes.
-  if (count < kThreshold) {
-    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
-  } else {
-    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
-    *os << " ... ";
-    // Rounds up to 2-byte boundary.
-    const size_t resume_pos = (count - kChunkSize + 1)/2*2;
-    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
-  }
-  *os << ">";
-}
-
-}  // namespace
-
-namespace internal2 {
-
-// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
-// given object.  The delegation simplifies the implementation, which
-// uses the << operator and thus is easier done outside of the
-// ::testing::internal namespace, which contains a << operator that
-// sometimes conflicts with the one in STL.
-void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
-                          ostream* os) {
-  PrintBytesInObjectToImpl(obj_bytes, count, os);
-}
-
-}  // namespace internal2
-
-namespace internal {
-
-// Depending on the value of a char (or wchar_t), we print it in one
-// of three formats:
-//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
-//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
-//   - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat {
-  kAsIs,
-  kHexEscape,
-  kSpecialEscape
-};
-
-// Returns true if c is a printable ASCII character.  We test the
-// value of c directly instead of calling isprint(), which is buggy on
-// Windows Mobile.
-inline bool IsPrintableAscii(wchar_t c) {
-  return 0x20 <= c && c <= 0x7E;
-}
-
-// Prints a wide or narrow char c as a character literal without the
-// quotes, escaping it when necessary; returns how c was formatted.
-// The template argument UnsignedChar is the unsigned version of Char,
-// which is the type of c.
-template <typename UnsignedChar, typename Char>
-static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
-  switch (static_cast<wchar_t>(c)) {
-    case L'\0':
-      *os << "\\0";
-      break;
-    case L'\'':
-      *os << "\\'";
-      break;
-    case L'\\':
-      *os << "\\\\";
-      break;
-    case L'\a':
-      *os << "\\a";
-      break;
-    case L'\b':
-      *os << "\\b";
-      break;
-    case L'\f':
-      *os << "\\f";
-      break;
-    case L'\n':
-      *os << "\\n";
-      break;
-    case L'\r':
-      *os << "\\r";
-      break;
-    case L'\t':
-      *os << "\\t";
-      break;
-    case L'\v':
-      *os << "\\v";
-      break;
-    default:
-      if (IsPrintableAscii(c)) {
-        *os << static_cast<char>(c);
-        return kAsIs;
-      } else {
-        ostream::fmtflags flags = os->flags();
-        *os << "\\x" << std::hex << std::uppercase
-            << static_cast<int>(static_cast<UnsignedChar>(c));
-        os->flags(flags);
-        return kHexEscape;
-      }
-  }
-  return kSpecialEscape;
-}
-
-// Prints a wchar_t c as if it's part of a string literal, escaping it when
-// necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
-  switch (c) {
-    case L'\'':
-      *os << "'";
-      return kAsIs;
-    case L'"':
-      *os << "\\\"";
-      return kSpecialEscape;
-    default:
-      return PrintAsCharLiteralTo<wchar_t>(c, os);
-  }
-}
-
-// Prints a char c as if it's part of a string literal, escaping it when
-// necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
-  return PrintAsStringLiteralTo(
-      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
-}
-
-// Prints a wide or narrow character c and its code.  '\0' is printed
-// as "'\\0'", other unprintable characters are also properly escaped
-// using the standard C++ escape sequence.  The template argument
-// UnsignedChar is the unsigned version of Char, which is the type of c.
-template <typename UnsignedChar, typename Char>
-void PrintCharAndCodeTo(Char c, ostream* os) {
-  // First, print c as a literal in the most readable form we can find.
-  *os << ((sizeof(c) > 1) ? "L'" : "'");
-  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
-  *os << "'";
-
-  // To aid user debugging, we also print c's code in decimal, unless
-  // it's 0 (in which case c was printed as '\\0', making the code
-  // obvious).
-  if (c == 0)
-    return;
-  *os << " (" << static_cast<int>(c);
-
-  // For more convenience, we print c's code again in hexadecimal,
-  // unless c was already printed in the form '\x##' or the code is in
-  // [1, 9].
-  if (format == kHexEscape || (1 <= c && c <= 9)) {
-    // Do nothing.
-  } else {
-    *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
-  }
-  *os << ")";
-}
-
-void PrintTo(unsigned char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-void PrintTo(signed char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-
-// Prints a wchar_t as a symbol if it is printable or as its internal
-// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
-void PrintTo(wchar_t wc, ostream* os) {
-  PrintCharAndCodeTo<wchar_t>(wc, os);
-}
-
-// Prints the given array of characters to the ostream.  CharType must be either
-// char or wchar_t.
-// The array starts at begin, the length is len, it may include '\0' characters
-// and may not be NUL-terminated.
-template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static CharFormat PrintCharsAsStringTo(
-    const CharType* begin, size_t len, ostream* os) {
-  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
-  *os << kQuoteBegin;
-  bool is_previous_hex = false;
-  CharFormat print_format = kAsIs;
-  for (size_t index = 0; index < len; ++index) {
-    const CharType cur = begin[index];
-    if (is_previous_hex && IsXDigit(cur)) {
-      // Previous character is of '\x..' form and this character can be
-      // interpreted as another hexadecimal digit in its number. Break string to
-      // disambiguate.
-      *os << "\" " << kQuoteBegin;
-    }
-    is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
-    // Remember if any characters required hex escaping.
-    if (is_previous_hex) {
-      print_format = kHexEscape;
-    }
-  }
-  *os << "\"";
-  return print_format;
-}
-
-// Prints a (const) char/wchar_t array of 'len' elements, starting at address
-// 'begin'.  CharType must be either char or wchar_t.
-template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static void UniversalPrintCharArray(
-    const CharType* begin, size_t len, ostream* os) {
-  // The code
-  //   const char kFoo[] = "foo";
-  // generates an array of 4, not 3, elements, with the last one being '\0'.
-  //
-  // Therefore when printing a char array, we don't print the last element if
-  // it's '\0', such that the output matches the string literal as it's
-  // written in the source code.
-  if (len > 0 && begin[len - 1] == '\0') {
-    PrintCharsAsStringTo(begin, len - 1, os);
-    return;
-  }
-
-  // If, however, the last element in the array is not '\0', e.g.
-  //    const char kFoo[] = { 'f', 'o', 'o' };
-  // we must print the entire array.  We also print a message to indicate
-  // that the array is not NUL-terminated.
-  PrintCharsAsStringTo(begin, len, os);
-  *os << " (no terminating NUL)";
-}
-
-// Prints a (const) char array of 'len' elements, starting at address 'begin'.
-void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
-  UniversalPrintCharArray(begin, len, os);
-}
-
-// Prints a (const) wchar_t array of 'len' elements, starting at address
-// 'begin'.
-void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
-  UniversalPrintCharArray(begin, len, os);
-}
-
-// Prints the given C string to the ostream.
-void PrintTo(const char* s, ostream* os) {
-  if (s == nullptr) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, strlen(s), os);
-  }
-}
-
-// MSVC compiler can be configured to define whar_t as a typedef
-// of unsigned short. Defining an overload for const wchar_t* in that case
-// would cause pointers to unsigned shorts be printed as wide strings,
-// possibly accessing more memory than intended and causing invalid
-// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when
-// wchar_t is implemented as a native type.
-#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
-// Prints the given wide C string to the ostream.
-void PrintTo(const wchar_t* s, ostream* os) {
-  if (s == nullptr) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, wcslen(s), os);
-  }
-}
-#endif  // wchar_t is native
-
-namespace {
-
-bool ContainsUnprintableControlCodes(const char* str, size_t length) {
-  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
-
-  for (size_t i = 0; i < length; i++) {
-    unsigned char ch = *s++;
-    if (std::iscntrl(ch)) {
-        switch (ch) {
-        case '\t':
-        case '\n':
-        case '\r':
-          break;
-        default:
-          return true;
-        }
-      }
-  }
-  return false;
-}
-
-bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
-
-bool IsValidUTF8(const char* str, size_t length) {
-  const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
-
-  for (size_t i = 0; i < length;) {
-    unsigned char lead = s[i++];
-
-    if (lead <= 0x7f) {
-      continue;  // single-byte character (ASCII) 0..7F
-    }
-    if (lead < 0xc2) {
-      return false;  // trail byte or non-shortest form
-    } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
-      ++i;  // 2-byte character
-    } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
-               IsUTF8TrailByte(s[i]) &&
-               IsUTF8TrailByte(s[i + 1]) &&
-               // check for non-shortest form and surrogate
-               (lead != 0xe0 || s[i] >= 0xa0) &&
-               (lead != 0xed || s[i] < 0xa0)) {
-      i += 2;  // 3-byte character
-    } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
-               IsUTF8TrailByte(s[i]) &&
-               IsUTF8TrailByte(s[i + 1]) &&
-               IsUTF8TrailByte(s[i + 2]) &&
-               // check for non-shortest form
-               (lead != 0xf0 || s[i] >= 0x90) &&
-               (lead != 0xf4 || s[i] < 0x90)) {
-      i += 3;  // 4-byte character
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
-  if (!ContainsUnprintableControlCodes(str, length) &&
-      IsValidUTF8(str, length)) {
-    *os << "\n    As Text: \"" << str << "\"";
-  }
-}
-
-}  // anonymous namespace
-
-void PrintStringTo(const ::std::string& s, ostream* os) {
-  if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
-    if (GTEST_FLAG(print_utf8)) {
-      ConditionalPrintAsText(s.data(), s.size(), os);
-    }
-  }
-}
-
-#if GTEST_HAS_STD_WSTRING
-void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
-  PrintCharsAsStringTo(s.data(), s.size(), os);
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-}  // namespace internal
-
-}  // namespace testing
diff --git a/deps/googletest/src/gtest-test-part.cc b/deps/googletest/src/gtest-test-part.cc
deleted file mode 100644
index 178317a6b..000000000
--- a/deps/googletest/src/gtest-test-part.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-
-#include "gtest/gtest-test-part.h"
-#include "src/gtest-internal-inl.h"
-
-namespace testing {
-
-using internal::GetUnitTestImpl;
-
-// Gets the summary of the failure message by omitting the stack trace
-// in it.
-std::string TestPartResult::ExtractSummary(const char* message) {
-  const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
-  return stack_trace == nullptr ? message : std::string(message, stack_trace);
-}
-
-// Prints a TestPartResult object.
-std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
-  return os << result.file_name() << ":" << result.line_number() << ": "
-            << (result.type() == TestPartResult::kSuccess
-                    ? "Success"
-                    : result.type() == TestPartResult::kSkip
-                          ? "Skipped"
-                          : result.type() == TestPartResult::kFatalFailure
-                                ? "Fatal failure"
-                                : "Non-fatal failure")
-            << ":\n"
-            << result.message() << std::endl;
-}
-
-// Appends a TestPartResult to the array.
-void TestPartResultArray::Append(const TestPartResult& result) {
-  array_.push_back(result);
-}
-
-// Returns the TestPartResult at the given index (0-based).
-const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
-  if (index < 0 || index >= size()) {
-    printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
-    internal::posix::Abort();
-  }
-
-  return array_[static_cast<size_t>(index)];
-}
-
-// Returns the number of TestPartResult objects in the array.
-int TestPartResultArray::size() const {
-  return static_cast<int>(array_.size());
-}
-
-namespace internal {
-
-HasNewFatalFailureHelper::HasNewFatalFailureHelper()
-    : has_new_fatal_failure_(false),
-      original_reporter_(GetUnitTestImpl()->
-                         GetTestPartResultReporterForCurrentThread()) {
-  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
-}
-
-HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
-  GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
-      original_reporter_);
-}
-
-void HasNewFatalFailureHelper::ReportTestPartResult(
-    const TestPartResult& result) {
-  if (result.fatally_failed())
-    has_new_fatal_failure_ = true;
-  original_reporter_->ReportTestPartResult(result);
-}
-
-}  // namespace internal
-
-}  // namespace testing
diff --git a/deps/googletest/src/gtest-typed-test.cc b/deps/googletest/src/gtest-typed-test.cc
deleted file mode 100644
index 8677caf73..000000000
--- a/deps/googletest/src/gtest-typed-test.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-// Copyright 2008 Google Inc.
-// All Rights Reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-#include "gtest/gtest-typed-test.h"
-
-#include "gtest/gtest.h"
-
-namespace testing {
-namespace internal {
-
-#if GTEST_HAS_TYPED_TEST_P
-
-// Skips to the first non-space char in str. Returns an empty string if str
-// contains only whitespace characters.
-static const char* SkipSpaces(const char* str) {
-  while (IsSpace(*str))
-    str++;
-  return str;
-}
-
-static std::vector<std::string> SplitIntoTestNames(const char* src) {
-  std::vector<std::string> name_vec;
-  src = SkipSpaces(src);
-  for (; src != nullptr; src = SkipComma(src)) {
-    name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src)));
-  }
-  return name_vec;
-}
-
-// Verifies that registered_tests match the test names in
-// registered_tests_; returns registered_tests if successful, or
-// aborts the program otherwise.
-const char* TypedTestSuitePState::VerifyRegisteredTestNames(
-    const char* file, int line, const char* registered_tests) {
-  typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
-  registered_ = true;
-
-  std::vector<std::string> name_vec = SplitIntoTestNames(registered_tests);
-
-  Message errors;
-
-  std::set<std::string> tests;
-  for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
-       name_it != name_vec.end(); ++name_it) {
-    const std::string& name = *name_it;
-    if (tests.count(name) != 0) {
-      errors << "Test " << name << " is listed more than once.\n";
-      continue;
-    }
-
-    bool found = false;
-    for (RegisteredTestIter it = registered_tests_.begin();
-         it != registered_tests_.end();
-         ++it) {
-      if (name == it->first) {
-        found = true;
-        break;
-      }
-    }
-
-    if (found) {
-      tests.insert(name);
-    } else {
-      errors << "No test named " << name
-             << " can be found in this test suite.\n";
-    }
-  }
-
-  for (RegisteredTestIter it = registered_tests_.begin();
-       it != registered_tests_.end();
-       ++it) {
-    if (tests.count(it->first) == 0) {
-      errors << "You forgot to list test " << it->first << ".\n";
-    }
-  }
-
-  const std::string& errors_str = errors.GetString();
-  if (errors_str != "") {
-    fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
-            errors_str.c_str());
-    fflush(stderr);
-    posix::Abort();
-  }
-
-  return registered_tests;
-}
-
-#endif  // GTEST_HAS_TYPED_TEST_P
-
-}  // namespace internal
-}  // namespace testing
diff --git a/deps/googletest/src/gtest.cc b/deps/googletest/src/gtest.cc
deleted file mode 100644
index d874d9aca..000000000
--- a/deps/googletest/src/gtest.cc
+++ /dev/null
@@ -1,6124 +0,0 @@
-// Copyright 2005, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-//
-// The Google C++ Testing and Mocking Framework (Google Test)
-
-#include "gtest/gtest.h"
-#include "gtest/internal/custom/gtest.h"
-#include "gtest/gtest-spi.h"
-
-#include <ctype.h>
-#include <math.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <wchar.h>
-#include <wctype.h>
-
-#include <algorithm>
-#include <iomanip>
-#include <limits>
-#include <list>
-#include <map>
-#include <ostream>  // NOLINT
-#include <sstream>
-#include <vector>
-
-#if GTEST_OS_LINUX
-
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
-# include <fcntl.h>  // NOLINT
-# include <limits.h>  // NOLINT
-# include <sched.h>  // NOLINT
-// Declares vsnprintf().  This header is not available on Windows.
-# include <strings.h>  // NOLINT
-# include <sys/mman.h>  // NOLINT
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-# include <string>
-
-#elif GTEST_OS_ZOS
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-# include <sys/time.h>  // NOLINT
-
-// On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h>  // NOLINT
-
-#elif GTEST_OS_WINDOWS_MOBILE  // We are on Windows CE.
-
-# include <windows.h>  // NOLINT
-# undef min
-
-#elif GTEST_OS_WINDOWS  // We are on Windows proper.
-
-# include <io.h>  // NOLINT
-# include <sys/timeb.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
-# include <sys/stat.h>  // NOLINT
-
-# if GTEST_OS_WINDOWS_MINGW
-// MinGW has gettimeofday() but not _ftime64().
-#  define GTEST_HAS_GETTIMEOFDAY_ 1
-#  include <sys/time.h>  // NOLINT
-# endif  // GTEST_OS_WINDOWS_MINGW
-
-// cpplint thinks that the header is already included, so we want to
-// silence it.
-# include <windows.h>  // NOLINT
-# undef min
-
-#else
-
-// Assume other platforms have gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
-// cpplint thinks that the header is already included, so we want to
-// silence it.
-# include <sys/time.h>  // NOLINT
-# include <unistd.h>  // NOLINT
-
-#endif  // GTEST_OS_LINUX
-
-#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
-#endif
-
-#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h>  // NOLINT
-# include <netdb.h>  // NOLINT
-# include <sys/socket.h>  // NOLINT
-# include <sys/types.h>  // NOLINT
-#endif
-
-#include "src/gtest-internal-inl.h"
-
-#if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
-#endif  // GTEST_OS_WINDOWS
-
-#if GTEST_OS_MAC
-#ifndef GTEST_OS_IOS
-#include <crt_externs.h>
-#endif
-#endif
-
-#if GTEST_HAS_ABSL
-#include "absl/debugging/failure_signal_handler.h"
-#include "absl/debugging/stacktrace.h"
-#include "absl/debugging/symbolize.h"
-#include "absl/strings/str_cat.h"
-#endif  // GTEST_HAS_ABSL
-
-namespace testing {
-
-using internal::CountIf;
-using internal::ForEach;
-using internal::GetElementOr;
-using internal::Shuffle;
-
-// Constants.
-
-// A test whose test suite name or test name matches this filter is
-// disabled and not run.
-static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
-
-// A test suite whose name matches this filter is considered a death
-// test suite and will be run before test suites whose name doesn't
-// match this filter.
-static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*";
-
-// A test filter that matches everything.
-static const char kUniversalFilter[] = "*";
-
-// The default output format.
-static const char kDefaultOutputFormat[] = "xml";
-// The default output file.
-static const char kDefaultOutputFile[] = "test_detail";
-
-// The environment variable name for the test shard index.
-static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
-// The environment variable name for the total number of test shards.
-static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
-// The environment variable name for the test shard status file.
-static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
-
-namespace internal {
-
-// The text used in failure messages to indicate the start of the
-// stack trace.
-const char kStackTraceMarker[] = "\nStack trace:\n";
-
-// g_help_flag is true iff the --help flag or an equivalent form is
-// specified on the command line.
-bool g_help_flag = false;
-
-// Utility function to Open File for Writing
-static FILE* OpenFileForWriting(const std::string& output_file) {
-  FILE* fileout = nullptr;
-  FilePath output_file_path(output_file);
-  FilePath output_dir(output_file_path.RemoveFileName());
-
-  if (output_dir.CreateDirectoriesRecursively()) {
-    fileout = posix::FOpen(output_file.c_str(), "w");
-  }
-  if (fileout == nullptr) {
-    GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\"";
-  }
-  return fileout;
-}
-
-}  // namespace internal
-
-// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY
-// environment variable.
-static const char* GetDefaultFilter() {
-  const char* const testbridge_test_only =
-      internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY");
-  if (testbridge_test_only != nullptr) {
-    return testbridge_test_only;
-  }
-  return kUniversalFilter;
-}
-
-GTEST_DEFINE_bool_(
-    also_run_disabled_tests,
-    internal::BoolFromGTestEnv("also_run_disabled_tests", false),
-    "Run disabled tests too, in addition to the tests normally being run.");
-
-GTEST_DEFINE_bool_(
-    break_on_failure,
-    internal::BoolFromGTestEnv("break_on_failure", false),
-    "True iff a failed assertion should be a debugger break-point.");
-
-GTEST_DEFINE_bool_(
-    catch_exceptions,
-    internal::BoolFromGTestEnv("catch_exceptions", true),
-    "True iff " GTEST_NAME_
-    " should catch exceptions and treat them as test failures.");
-
-GTEST_DEFINE_string_(
-    color,
-    internal::StringFromGTestEnv("color", "auto"),
-    "Whether to use colors in the output.  Valid values: yes, no, "
-    "and auto.  'auto' means to use colors if the output is "
-    "being sent to a terminal and the TERM environment variable "
-    "is set to a terminal type that supports colors.");
-
-GTEST_DEFINE_string_(
-    filter,
-    internal::StringFromGTestEnv("filter", GetDefaultFilter()),
-    "A colon-separated list of glob (not regex) patterns "
-    "for filtering the tests to run, optionally followed by a "
-    "'-' and a : separated list of negative patterns (tests to "
-    "exclude).  A test is run if it matches one of the positive "
-    "patterns and does not match any of the negative patterns.");
-
-GTEST_DEFINE_bool_(
-    install_failure_signal_handler,
-    internal::BoolFromGTestEnv("install_failure_signal_handler", false),
-    "If true and supported on the current platform, " GTEST_NAME_ " should "
-    "install a signal handler that dumps debugging information when fatal "
-    "signals are raised.");
-
-GTEST_DEFINE_bool_(list_tests, false,
-                   "List all tests without running them.");
-
-// The net priority order after flag processing is thus:
-//   --gtest_output command line flag
-//   GTEST_OUTPUT environment variable
-//   XML_OUTPUT_FILE environment variable
-//   ''
-GTEST_DEFINE_string_(
-    output,
-    internal::StringFromGTestEnv("output",
-      internal::OutputFlagAlsoCheckEnvVar().c_str()),
-    "A format (defaults to \"xml\" but can be specified to be \"json\"), "
-    "optionally followed by a colon and an output file name or directory. "
-    "A directory is indicated by a trailing pathname separator. "
-    "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
-    "If a directory is specified, output files will be created "
-    "within that directory, with file-names based on the test "
-    "executable's name and, if necessary, made unique by adding "
-    "digits.");
-
-GTEST_DEFINE_bool_(
-    print_time,
-    internal::BoolFromGTestEnv("print_time", true),
-    "True iff " GTEST_NAME_
-    " should display elapsed time in text output.");
-
-GTEST_DEFINE_bool_(
-    print_utf8,
-    internal::BoolFromGTestEnv("print_utf8", true),
-    "True iff " GTEST_NAME_
-    " prints UTF8 characters as text.");
-
-GTEST_DEFINE_int32_(
-    random_seed,
-    internal::Int32FromGTestEnv("random_seed", 0),
-    "Random number seed to use when shuffling test orders.  Must be in range "
-    "[1, 99999], or 0 to use a seed based on the current time.");
-
-GTEST_DEFINE_int32_(
-    repeat,
-    internal::Int32FromGTestEnv("repeat", 1),
-    "How many times to repeat each test.  Specify a negative number "
-    "for repeating forever.  Useful for shaking out flaky tests.");
-
-GTEST_DEFINE_bool_(
-    show_internal_stack_frames, false,
-    "True iff " GTEST_NAME_ " should include internal stack frames when "
-    "printing test failure stack traces.");
-
-GTEST_DEFINE_bool_(
-    shuffle,
-    internal::BoolFromGTestEnv("shuffle", false),
-    "True iff " GTEST_NAME_
-    " should randomize tests' order on every run.");
-
-GTEST_DEFINE_int32_(
-    stack_trace_depth,
-    internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
-    "The maximum number of stack frames to print when an "
-    "assertion fails.  The valid range is 0 through 100, inclusive.");
-
-GTEST_DEFINE_string_(
-    stream_result_to,
-    internal::StringFromGTestEnv("stream_result_to", ""),
-    "This flag specifies the host name and the port number on which to stream "
-    "test results. Example: \"localhost:555\". The flag is effective only on "
-    "Linux.");
-
-GTEST_DEFINE_bool_(
-    throw_on_failure,
-    internal::BoolFromGTestEnv("throw_on_failure", false),
-    "When this flag is specified, a failed assertion will throw an exception "
-    "if exceptions are enabled or exit the program with a non-zero code "
-    "otherwise. For use with an external test framework.");
-
-#if GTEST_USE_OWN_FLAGFILE_FLAG_
-GTEST_DEFINE_string_(
-    flagfile,
-    internal::StringFromGTestEnv("flagfile", ""),
-    "This flag specifies the flagfile to read command-line flags from.");
-#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
-
-namespace internal {
-
-// Generates a random number from [0, range), using a Linear
-// Congruential Generator (LCG).  Crashes if 'range' is 0 or greater
-// than kMaxRange.
-UInt32 Random::Generate(UInt32 range) {
-  // These constants are the same as are used in glibc's rand(3).
-  // Use wider types than necessary to prevent unsigned overflow diagnostics.
-  state_ = static_cast<UInt32>(1103515245ULL*state_ + 12345U) % kMaxRange;
-
-  GTEST_CHECK_(range > 0)
-      << "Cannot generate a number in the range [0, 0).";
-  GTEST_CHECK_(range <= kMaxRange)
-      << "Generation of a number in [0, " << range << ") was requested, "
-      << "but this can only generate numbers in [0, " << kMaxRange << ").";
-
-  // Converting via modulus introduces a bit of downward bias, but
-  // it's simple, and a linear congruential generator isn't too good
-  // to begin with.
-  return state_ % range;
-}
-
-// GTestIsInitialized() returns true iff the user has initialized
-// Google Test.  Useful for catching the user mistake of not initializing
-// Google Test before calling RUN_ALL_TESTS().
-static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
-
-// Iterates over a vector of TestSuites, keeping a running sum of the
-// results of calling a given int-returning method on each.
-// Returns the sum.
-static int SumOverTestSuiteList(const std::vector<TestSuite*>& case_list,
-                                int (TestSuite::*method)() const) {
-  int sum = 0;
-  for (size_t i = 0; i < case_list.size(); i++) {
-    sum += (case_list[i]->*method)();
-  }
-  return sum;
-}
-
-// Returns true iff the test suite passed.
-static bool TestSuitePassed(const TestSuite* test_suite) {
-  return test_suite->should_run() && test_suite->Passed();
-}
-
-// Returns true iff the test suite failed.
-static bool TestSuiteFailed(const TestSuite* test_suite) {
-  return test_suite->should_run() && test_suite->Failed();
-}
-
-// Returns true iff test_suite contains at least one test that should
-// run.
-static bool ShouldRunTestSuite(const TestSuite* test_suite) {
-  return test_suite->should_run();
-}
-
-// AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
-                           const char* file,
-                           int line,
-                           const char* message)
-    : data_(new AssertHelperData(type, file, line, message)) {
-}
-
-AssertHelper::~AssertHelper() {
-  delete data_;
-}
-
-// Message assignment, for assertion streaming support.
-void AssertHelper::operator=(const Message& message) const {
-  UnitTest::GetInstance()->
-    AddTestPartResult(data_->type, data_->file, data_->line,
-                      AppendUserMessage(data_->message, message),
-                      UnitTest::GetInstance()->impl()
-                      ->CurrentOsStackTraceExceptTop(1)
-                      // Skips the stack frame for this function itself.
-                      );  // NOLINT
-}
-
-// A copy of all command line arguments.  Set by InitGoogleTest().
-static ::std::vector<std::string> g_argvs;
-
-::std::vector<std::string> GetArgvs() {
-#if defined(GTEST_CUSTOM_GET_ARGVS_)
-  // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
-  // ::string. This code converts it to the appropriate type.
-  const auto& custom = GTEST_CUSTOM_GET_ARGVS_();
-  return ::std::vector<std::string>(custom.begin(), custom.end());
-#else   // defined(GTEST_CUSTOM_GET_ARGVS_)
-  return g_argvs;
-#endif  // defined(GTEST_CUSTOM_GET_ARGVS_)
-}
-
-// Returns the current application's name, removing directory path if that
-// is present.
-FilePath GetCurrentExecutableName() {
-  FilePath result;
-
-#if GTEST_OS_WINDOWS || GTEST_OS_OS2
-  result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe"));
-#else
-  result.Set(FilePath(GetArgvs()[0]));
-#endif  // GTEST_OS_WINDOWS
-
-  return result.RemoveDirectoryName();
-}
-
-// Functions for processing the gtest_output flag.
-
-// Returns the output format, or "" for normal printed output.
-std::string UnitTestOptions::GetOutputFormat() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-  const char* const colon = strchr(gtest_output_flag, ':');
-  return (colon == nullptr)
-             ? std::string(gtest_output_flag)
-             : std::string(gtest_output_flag,
-                           static_cast<size_t>(colon - gtest_output_flag));
-}
-
-// Returns the name of the requested output file, or the default if none
-// was explicitly specified.
-std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
-  const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
-
-  std::string format = GetOutputFormat();
-  if (format.empty())
-    format = std::string(kDefaultOutputFormat);
-
-  const char* const colon = strchr(gtest_output_flag, ':');
-  if (colon == nullptr)
-    return internal::FilePath::MakeFileName(
-        internal::FilePath(
-            UnitTest::GetInstance()->original_working_dir()),
-        internal::FilePath(kDefaultOutputFile), 0,
-        format.c_str()).string();
-
-  internal::FilePath output_name(colon + 1);
-  if (!output_name.IsAbsolutePath())
-    output_name = internal::FilePath::ConcatPaths(
-        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
-        internal::FilePath(colon + 1));
-
-  if (!output_name.IsDirectory())
-    return output_name.string();
-
-  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
-      output_name, internal::GetCurrentExecutableName(),
-      GetOutputFormat().c_str()));
-  return result.string();
-}
-
-// Returns true iff the wildcard pattern matches the string.  The
-// first ':' or '\0' character in pattern marks the end of it.
-//
-// This recursive algorithm isn't very efficient, but is clear and
-// works well enough for matching test names, which are short.
-bool UnitTestOptions::PatternMatchesString(const char *pattern,
-                                           const char *str) {
-  switch (*pattern) {
-    case '\0':
-    case ':':  // Either ':' or '\0' marks the end of the pattern.
-      return *str == '\0';
-    case '?':  // Matches any single character.
-      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
-    case '*':  // Matches any string (possibly empty) of characters.
-      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-          PatternMatchesString(pattern + 1, str);
-    default:  // Non-special character.  Matches itself.
-      return *pattern == *str &&
-          PatternMatchesString(pattern + 1, str + 1);
-  }
-}
-
-bool UnitTestOptions::MatchesFilter(
-    const std::string& name, const char* filter) {
-  const char *cur_pattern = filter;
-  for (;;) {
-    if (PatternMatchesString(cur_pattern, name.c_str())) {
-      return true;
-    }
-
-    // Finds the next pattern in the filter.
-    cur_pattern = strchr(cur_pattern, ':');
-
-    // Returns if no more pattern can be found.
-    if (cur_pattern == nullptr) {
-      return false;
-    }
-
-    // Skips the pattern separater (the ':' character).
-    cur_pattern++;
-  }
-}
-
-// Returns true iff the user-specified filter matches the test suite
-// name and the test name.
-bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
-                                        const std::string& test_name) {
-  const std::string& full_name = test_suite_name + "." + test_name.c_str();
-
-  // Split --gtest_filter at '-', if there is one, to separate into
-  // positive filter and negative filter portions
-  const char* const p = GTEST_FLAG(filter).c_str();
-  const char* const dash = strchr(p, '-');
-  std::string positive;
-  std::string negative;
-  if (dash == nullptr) {
-    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
-    negative = "";
-  } else {
-    positive = std::string(p, dash);   // Everything up to the dash
-    negative = std::string(dash + 1);  // Everything after the dash
-    if (positive.empty()) {
-      // Treat '-test1' as the same as '*-test1'
-      positive = kUniversalFilter;
-    }
-  }
-
-  // A filter is a colon-separated list of patterns.  It matches a
-  // test if any pattern in it matches the test.
-  return (MatchesFilter(full_name, positive.c_str()) &&
-          !MatchesFilter(full_name, negative.c_str()));
-}
-
-#if GTEST_HAS_SEH
-// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
-// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
-// This function is useful as an __except condition.
-int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
-  // Google Test should handle a SEH exception if:
-  //   1. the user wants it to, AND
-  //   2. this is not a breakpoint exception, AND
-  //   3. this is not a C++ exception (VC++ implements them via SEH,
-  //      apparently).
-  //
-  // SEH exception code for C++ exceptions.
-  // (see http://support.microsoft.com/kb/185294 for more information).
-  const DWORD kCxxExceptionCode = 0xe06d7363;
-
-  bool should_handle = true;
-
-  if (!GTEST_FLAG(catch_exceptions))
-    should_handle = false;
-  else if (exception_code == EXCEPTION_BREAKPOINT)
-    should_handle = false;
-  else if (exception_code == kCxxExceptionCode)
-    should_handle = false;
-
-  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
-}
-#endif  // GTEST_HAS_SEH
-
-}  // namespace internal
-
-// The c'tor sets this object as the test part result reporter used by
-// Google Test.  The 'result' parameter specifies where to report the
-// results. Intercepts only failures from the current thread.
-ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    TestPartResultArray* result)
-    : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
-      result_(result) {
-  Init();
-}
-
-// The c'tor sets this object as the test part result reporter used by
-// Google Test.  The 'result' parameter specifies where to report the
-// results.
-ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
-    InterceptMode intercept_mode, TestPartResultArray* result)
-    : intercept_mode_(intercept_mode),
-      result_(result) {
-  Init();
-}
-
-void ScopedFakeTestPartResultReporter::Init() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
-    old_reporter_ = impl->GetGlobalTestPartResultReporter();
-    impl->SetGlobalTestPartResultReporter(this);
-  } else {
-    old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
-    impl->SetTestPartResultReporterForCurrentThread(this);
-  }
-}
-
-// The d'tor restores the test part result reporter used by Google Test
-// before.
-ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
-    impl->SetGlobalTestPartResultReporter(old_reporter_);
-  } else {
-    impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
-  }
-}
-
-// Increments the test part result count and remembers the result.
-// This method is from the TestPartResultReporterInterface interface.
-void ScopedFakeTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  result_->Append(result);
-}
-
-namespace internal {
-
-// Returns the type ID of ::testing::Test.  We should always call this
-// instead of GetTypeId< ::testing::Test>() to get the type ID of
-// testing::Test.  This is to work around a suspected linker bug when
-// using Google Test as a framework on Mac OS X.  The bug causes
-// GetTypeId< ::testing::Test>() to return different values depending
-// on whether the call is from the Google Test framework itself or
-// from user test code.  GetTestTypeId() is guaranteed to always
-// return the same value, as it always calls GetTypeId<>() from the
-// gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
-  return GetTypeId<Test>();
-}
-
-// The value of GetTestTypeId() as seen from within the Google Test
-// library.  This is solely for testing GetTestTypeId().
-extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
-
-// This predicate-formatter checks that 'results' contains a test part
-// failure of the given type and that the failure message contains the
-// given substring.
-static AssertionResult HasOneFailure(const char* /* results_expr */,
-                                     const char* /* type_expr */,
-                                     const char* /* substr_expr */,
-                                     const TestPartResultArray& results,
-                                     TestPartResult::Type type,
-                                     const std::string& substr) {
-  const std::string expected(type == TestPartResult::kFatalFailure ?
-                        "1 fatal failure" :
-                        "1 non-fatal failure");
-  Message msg;
-  if (results.size() != 1) {
-    msg << "Expected: " << expected << "\n"
-        << "  Actual: " << results.size() << " failures";
-    for (int i = 0; i < results.size(); i++) {
-      msg << "\n" << results.GetTestPartResult(i);
-    }
-    return AssertionFailure() << msg;
-  }
-
-  const TestPartResult& r = results.GetTestPartResult(0);
-  if (r.type() != type) {
-    return AssertionFailure() << "Expected: " << expected << "\n"
-                              << "  Actual:\n"
-                              << r;
-  }
-
-  if (strstr(r.message(), substr.c_str()) == nullptr) {
-    return AssertionFailure() << "Expected: " << expected << " containing \""
-                              << substr << "\"\n"
-                              << "  Actual:\n"
-                              << r;
-  }
-
-  return AssertionSuccess();
-}
-
-// The constructor of SingleFailureChecker remembers where to look up
-// test part results, what type of failure we expect, and what
-// substring the failure message should contain.
-SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results,
-                                           TestPartResult::Type type,
-                                           const std::string& substr)
-    : results_(results), type_(type), substr_(substr) {}
-
-// The destructor of SingleFailureChecker verifies that the given
-// TestPartResultArray contains exactly one failure that has the given
-// type and contains the given substring.  If that's not the case, a
-// non-fatal failure will be generated.
-SingleFailureChecker::~SingleFailureChecker() {
-  EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
-}
-
-DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
-
-void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  unit_test_->current_test_result()->AddTestPartResult(result);
-  unit_test_->listeners()->repeater()->OnTestPartResult(result);
-}
-
-DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
-    UnitTestImpl* unit_test) : unit_test_(unit_test) {}
-
-void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
-    const TestPartResult& result) {
-  unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
-}
-
-// Returns the global test part result reporter.
-TestPartResultReporterInterface*
-UnitTestImpl::GetGlobalTestPartResultReporter() {
-  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
-  return global_test_part_result_repoter_;
-}
-
-// Sets the global test part result reporter.
-void UnitTestImpl::SetGlobalTestPartResultReporter(
-    TestPartResultReporterInterface* reporter) {
-  internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
-  global_test_part_result_repoter_ = reporter;
-}
-
-// Returns the test part result reporter for the current thread.
-TestPartResultReporterInterface*
-UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
-  return per_thread_test_part_result_reporter_.get();
-}
-
-// Sets the test part result reporter for the current thread.
-void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
-    TestPartResultReporterInterface* reporter) {
-  per_thread_test_part_result_reporter_.set(reporter);
-}
-
-// Gets the number of successful test suites.
-int UnitTestImpl::successful_test_suite_count() const {
-  return CountIf(test_suites_, TestSuitePassed);
-}
-
-// Gets the number of failed test suites.
-int UnitTestImpl::failed_test_suite_count() const {
-  return CountIf(test_suites_, TestSuiteFailed);
-}
-
-// Gets the number of all test suites.
-int UnitTestImpl::total_test_suite_count() const {
-  return static_cast<int>(test_suites_.size());
-}
-
-// Gets the number of all test suites that contain at least one test
-// that should run.
-int UnitTestImpl::test_suite_to_run_count() const {
-  return CountIf(test_suites_, ShouldRunTestSuite);
-}
-
-// Gets the number of successful tests.
-int UnitTestImpl::successful_test_count() const {
-  return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count);
-}
-
-// Gets the number of skipped tests.
-int UnitTestImpl::skipped_test_count() const {
-  return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count);
-}
-
-// Gets the number of failed tests.
-int UnitTestImpl::failed_test_count() const {
-  return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count);
-}
-
-// Gets the number of disabled tests that will be reported in the XML report.
-int UnitTestImpl::reportable_disabled_test_count() const {
-  return SumOverTestSuiteList(test_suites_,
-                              &TestSuite::reportable_disabled_test_count);
-}
-
-// Gets the number of disabled tests.
-int UnitTestImpl::disabled_test_count() const {
-  return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count);
-}
-
-// Gets the number of tests to be printed in the XML report.
-int UnitTestImpl::reportable_test_count() const {
-  return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count);
-}
-
-// Gets the number of all tests.
-int UnitTestImpl::total_test_count() const {
-  return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count);
-}
-
-// Gets the number of tests that should run.
-int UnitTestImpl::test_to_run_count() const {
-  return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count);
-}
-
-// Returns the current OS stack trace as an std::string.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
-// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
-std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
-  return os_stack_trace_getter()->CurrentStackTrace(
-      static_cast<int>(GTEST_FLAG(stack_trace_depth)),
-      skip_count + 1
-      // Skips the user-specified number of frames plus this function
-      // itself.
-      );  // NOLINT
-}
-
-// Returns the current time in milliseconds.
-TimeInMillis GetTimeInMillis() {
-#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
-  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
-  // http://analogous.blogspot.com/2005/04/epoch.html
-  const TimeInMillis kJavaEpochToWinFileTimeDelta =
-    static_cast<TimeInMillis>(116444736UL) * 100000UL;
-  const DWORD kTenthMicrosInMilliSecond = 10000;
-
-  SYSTEMTIME now_systime;
-  FILETIME now_filetime;
-  ULARGE_INTEGER now_int64;
-  GetSystemTime(&now_systime);
-  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
-    now_int64.LowPart = now_filetime.dwLowDateTime;
-    now_int64.HighPart = now_filetime.dwHighDateTime;
-    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-      kJavaEpochToWinFileTimeDelta;
-    return now_int64.QuadPart;
-  }
-  return 0;
-#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
-  __timeb64 now;
-
-  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
-  // (deprecated function) there.
-  GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
-  _ftime64(&now);
-  GTEST_DISABLE_MSC_DEPRECATED_POP_()
-
-  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
-#elif GTEST_HAS_GETTIMEOFDAY_
-  struct timeval now;
-  gettimeofday(&now, nullptr);
-  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
-#else
-# error "Don't know how to get the current time on your system."
-#endif
-}
-
-// Utilities
-
-// class String.
-
-#if GTEST_OS_WINDOWS_MOBILE
-// Creates a UTF-16 wide string from the given ANSI string, allocating
-// memory using new. The caller is responsible for deleting the return
-// value using delete[]. Returns the wide string, or NULL if the
-// input is NULL.
-LPCWSTR String::AnsiToUtf16(const char* ansi) {
-  if (!ansi) return nullptr;
-  const int length = strlen(ansi);
-  const int unicode_length =
-      MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
-  WCHAR* unicode = new WCHAR[unicode_length + 1];
-  MultiByteToWideChar(CP_ACP, 0, ansi, length,
-                      unicode, unicode_length);
-  unicode[unicode_length] = 0;
-  return unicode;
-}
-
-// Creates an ANSI string from the given wide string, allocating
-// memory using new. The caller is responsible for deleting the return
-// value using delete[]. Returns the ANSI string, or NULL if the
-// input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str)  {
-  if (!utf16_str) return nullptr;
-  const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
-                                              0, nullptr, nullptr);
-  char* ansi = new char[ansi_length + 1];
-  WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr,
-                      nullptr);
-  ansi[ansi_length] = 0;
-  return ansi;
-}
-
-#endif  // GTEST_OS_WINDOWS_MOBILE
-
-// Compares two C strings.  Returns true iff they have the same content.
-//
-// Unlike strcmp(), this function can handle NULL argument(s).  A NULL
-// C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
-  if (lhs == nullptr) return rhs == nullptr;
-
-  if (rhs == nullptr) return false;
-
-  return strcmp(lhs, rhs) == 0;
-}
-
-#if GTEST_HAS_STD_WSTRING
-
-// Converts an array of wide chars to a narrow string using the UTF-8
-// encoding, and streams the result to the given Message object.
-static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
-                                     Message* msg) {
-  for (size_t i = 0; i != length; ) {  // NOLINT
-    if (wstr[i] != L'\0') {
-      *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
-      while (i != length && wstr[i] != L'\0')
-        i++;
-    } else {
-      *msg << '\0';
-      i++;
-    }
-  }
-}
-
-#endif  // GTEST_HAS_STD_WSTRING
-
-void SplitString(const ::std::string& str, char delimiter,
-                 ::std::vector< ::std::string>* dest) {
-  ::std::vector< ::std::string> parsed;
-  ::std::string::size_type pos = 0;
-  while (::testing::internal::AlwaysTrue()) {
-    const ::std::string::size_type colon = str.find(delimiter, pos);
-    if (colon == ::std::string::npos) {
-      parsed.push_back(str.substr(pos));
-      break;
-    } else {
-      parsed.push_back(str.substr(pos, colon - pos));
-      pos = colon + 1;
-    }
-  }
-  dest->swap(parsed);
-}
-
-}  // namespace internal
-
-// Constructs an empty Message.
-// We allocate the stringstream separately because otherwise each use of
-// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
-// stack frame leading to huge stack frames in some cases; gcc does not reuse
-// the stack space.
-Message::Message() : ss_(new ::std::stringstream) {
-  // By default, we want there to be enough precision when printing
-  // a double to a Message.
-  *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
-}
-
-// These two overloads allow streaming a wide C string to a Message
-// using the UTF-8 encoding.
-Message& Message::operator <<(const wchar_t* wide_c_str) {
-  return *this << internal::String::ShowWideCString(wide_c_str);
-}
-Message& Message::operator <<(wchar_t* wide_c_str) {
-  return *this << internal::String::ShowWideCString(wide_c_str);
-}
-
-#if GTEST_HAS_STD_WSTRING
-// Converts the given wide string to a narrow string using the UTF-8
-// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
-  internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
-  return *this;
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-// Gets the text streamed to this object so far as an std::string.
-// Each '\0' character in the buffer is replaced with "\\0".
-std::string Message::GetString() const {
-  return internal::StringStreamToString(ss_.get());
-}
-
-// AssertionResult constructors.
-// Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
-    : success_(other.success_),
-      message_(other.message_.get() != nullptr
-                   ? new ::std::string(*other.message_)
-                   : static_cast< ::std::string*>(nullptr)) {}
-
-// Swaps two AssertionResults.
-void AssertionResult::swap(AssertionResult& other) {
-  using std::swap;
-  swap(success_, other.success_);
-  swap(message_, other.message_);
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
-  AssertionResult negation(!success_);
-  if (message_.get() != nullptr) negation << *message_;
-  return negation;
-}
-
-// Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
-  return AssertionResult(true);
-}
-
-// Makes a failed assertion result.
-AssertionResult AssertionFailure() {
-  return AssertionResult(false);
-}
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
-  return AssertionFailure() << message;
-}
-
-namespace internal {
-
-namespace edit_distance {
-std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
-                                            const std::vector<size_t>& right) {
-  std::vector<std::vector<double> > costs(
-      left.size() + 1, std::vector<double>(right.size() + 1));
-  std::vector<std::vector<EditType> > best_move(
-      left.size() + 1, std::vector<EditType>(right.size() + 1));
-
-  // Populate for empty right.
-  for (size_t l_i = 0; l_i < costs.size(); ++l_i) {
-    costs[l_i][0] = static_cast<double>(l_i);
-    best_move[l_i][0] = kRemove;
-  }
-  // Populate for empty left.
-  for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) {
-    costs[0][r_i] = static_cast<double>(r_i);
-    best_move[0][r_i] = kAdd;
-  }
-
-  for (size_t l_i = 0; l_i < left.size(); ++l_i) {
-    for (size_t r_i = 0; r_i < right.size(); ++r_i) {
-      if (left[l_i] == right[r_i]) {
-        // Found a match. Consume it.
-        costs[l_i + 1][r_i + 1] = costs[l_i][r_i];
-        best_move[l_i + 1][r_i + 1] = kMatch;
-        continue;
-      }
-
-      const double add = costs[l_i + 1][r_i];
-      const double remove = costs[l_i][r_i + 1];
-      const double replace = costs[l_i][r_i];
-      if (add < remove && add < replace) {
-        costs[l_i + 1][r_i + 1] = add + 1;
-        best_move[l_i + 1][r_i + 1] = kAdd;
-      } else if (remove < add && remove < replace) {
-        costs[l_i + 1][r_i + 1] = remove + 1;
-        best_move[l_i + 1][r_i + 1] = kRemove;
-      } else {
-        // We make replace a little more expensive than add/remove to lower
-        // their priority.
-        costs[l_i + 1][r_i + 1] = replace + 1.00001;
-        best_move[l_i + 1][r_i + 1] = kReplace;
-      }
-    }
-  }
-
-  // Reconstruct the best path. We do it in reverse order.
-  std::vector<EditType> best_path;
-  for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) {
-    EditType move = best_move[l_i][r_i];
-    best_path.push_back(move);
-    l_i -= move != kAdd;
-    r_i -= move != kRemove;
-  }
-  std::reverse(best_path.begin(), best_path.end());
-  return best_path;
-}
-
-namespace {
-
-// Helper class to convert string into ids with deduplication.
-class InternalStrings {
- public:
-  size_t GetId(const std::string& str) {
-    IdMap::iterator it = ids_.find(str);
-    if (it != ids_.end()) return it->second;
-    size_t id = ids_.size();
-    return ids_[str] = id;
-  }
-
- private:
-  typedef std::map<std::string, size_t> IdMap;
-  IdMap ids_;
-};
-
-}  // namespace
-
-std::vector<EditType> CalculateOptimalEdits(
-    const std::vector<std::string>& left,
-    const std::vector<std::string>& right) {
-  std::vector<size_t> left_ids, right_ids;
-  {
-    InternalStrings intern_table;
-    for (size_t i = 0; i < left.size(); ++i) {
-      left_ids.push_back(intern_table.GetId(left[i]));
-    }
-    for (size_t i = 0; i < right.size(); ++i) {
-      right_ids.push_back(intern_table.GetId(right[i]));
-    }
-  }
-  return CalculateOptimalEdits(left_ids, right_ids);
-}
-
-namespace {
-
-// Helper class that holds the state for one hunk and prints it out to the
-// stream.
-// It reorders adds/removes when possible to group all removes before all
-// adds. It also adds the hunk header before printint into the stream.
-class Hunk {
- public:
-  Hunk(size_t left_start, size_t right_start)
-      : left_start_(left_start),
-        right_start_(right_start),
-        adds_(),
-        removes_(),
-        common_() {}
-
-  void PushLine(char edit, const char* line) {
-    switch (edit) {
-      case ' ':
-        ++common_;
-        FlushEdits();
-        hunk_.push_back(std::make_pair(' ', line));
-        break;
-      case '-':
-        ++removes_;
-        hunk_removes_.push_back(std::make_pair('-', line));
-        break;
-      case '+':
-        ++adds_;
-        hunk_adds_.push_back(std::make_pair('+', line));
-        break;
-    }
-  }
-
-  void PrintTo(std::ostream* os) {
-    PrintHeader(os);
-    FlushEdits();
-    for (std::list<std::pair<char, const char*> >::const_iterator it =
-             hunk_.begin();
-         it != hunk_.end(); ++it) {
-      *os << it->first << it->second << "\n";
-    }
-  }
-
-  bool has_edits() const { return adds_ || removes_; }
-
- private:
-  void FlushEdits() {
-    hunk_.splice(hunk_.end(), hunk_removes_);
-    hunk_.splice(hunk_.end(), hunk_adds_);
-  }
-
-  // Print a unified diff header for one hunk.
-  // The format is
-  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
-  // where the left/right parts are omitted if unnecessary.
-  void PrintHeader(std::ostream* ss) const {
-    *ss << "@@ ";
-    if (removes_) {
-      *ss << "-" << left_start_ << "," << (removes_ + common_);
-    }
-    if (removes_ && adds_) {
-      *ss << " ";
-    }
-    if (adds_) {
-      *ss << "+" << right_start_ << "," << (adds_ + common_);
-    }
-    *ss << " @@\n";
-  }
-
-  size_t left_start_, right_start_;
-  size_t adds_, removes_, common_;
-  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
-};
-
-}  // namespace
-
-// Create a list of diff hunks in Unified diff format.
-// Each hunk has a header generated by PrintHeader above plus a body with
-// lines prefixed with ' ' for no change, '-' for deletion and '+' for
-// addition.
-// 'context' represents the desired unchanged prefix/suffix around the diff.
-// If two hunks are close enough that their contexts overlap, then they are
-// joined into one hunk.
-std::string CreateUnifiedDiff(const std::vector<std::string>& left,
-                              const std::vector<std::string>& right,
-                              size_t context) {
-  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
-
-  size_t l_i = 0, r_i = 0, edit_i = 0;
-  std::stringstream ss;
-  while (edit_i < edits.size()) {
-    // Find first edit.
-    while (edit_i < edits.size() && edits[edit_i] == kMatch) {
-      ++l_i;
-      ++r_i;
-      ++edit_i;
-    }
-
-    // Find the first line to include in the hunk.
-    const size_t prefix_context = std::min(l_i, context);
-    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
-    for (size_t i = prefix_context; i > 0; --i) {
-      hunk.PushLine(' ', left[l_i - i].c_str());
-    }
-
-    // Iterate the edits until we found enough suffix for the hunk or the input
-    // is over.
-    size_t n_suffix = 0;
-    for (; edit_i < edits.size(); ++edit_i) {
-      if (n_suffix >= context) {
-        // Continue only if the next hunk is very close.
-        auto it = edits.begin() + static_cast<int>(edit_i);
-        while (it != edits.end() && *it == kMatch) ++it;
-        if (it == edits.end() ||
-            static_cast<size_t>(it - edits.begin()) - edit_i >= context) {
-          // There is no next edit or it is too far away.
-          break;
-        }
-      }
-
-      EditType edit = edits[edit_i];
-      // Reset count when a non match is found.
-      n_suffix = edit == kMatch ? n_suffix + 1 : 0;
-
-      if (edit == kMatch || edit == kRemove || edit == kReplace) {
-        hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
-      }
-      if (edit == kAdd || edit == kReplace) {
-        hunk.PushLine('+', right[r_i].c_str());
-      }
-
-      // Advance indices, depending on edit type.
-      l_i += edit != kAdd;
-      r_i += edit != kRemove;
-    }
-
-    if (!hunk.has_edits()) {
-      // We are done. We don't want this hunk.
-      break;
-    }
-
-    hunk.PrintTo(&ss);
-  }
-  return ss.str();
-}
-
-}  // namespace edit_distance
-
-namespace {
-
-// The string representation of the values received in EqFailure() are already
-// escaped. Split them on escaped '\n' boundaries. Leave all other escaped
-// characters the same.
-std::vector<std::string> SplitEscapedString(const std::string& str) {
-  std::vector<std::string> lines;
-  size_t start = 0, end = str.size();
-  if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
-    ++start;
-    --end;
-  }
-  bool escaped = false;
-  for (size_t i = start; i + 1 < end; ++i) {
-    if (escaped) {
-      escaped = false;
-      if (str[i] == 'n') {
-        lines.push_back(str.substr(start, i - start - 1));
-        start = i + 1;
-      }
-    } else {
-      escaped = str[i] == '\\';
-    }
-  }
-  lines.push_back(str.substr(start, end - start));
-  return lines;
-}
-
-}  // namespace
-
-// Constructs and returns the message for an equality assertion
-// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
-//
-// The first four parameters are the expressions used in the assertion
-// and their values, as strings.  For example, for ASSERT_EQ(foo, bar)
-// where foo is 5 and bar is 6, we have:
-//
-//   lhs_expression: "foo"
-//   rhs_expression: "bar"
-//   lhs_value:      "5"
-//   rhs_value:      "6"
-//
-// The ignoring_case parameter is true iff the assertion is a
-// *_STRCASEEQ*.  When it's true, the string "Ignoring case" will
-// be inserted into the message.
-AssertionResult EqFailure(const char* lhs_expression,
-                          const char* rhs_expression,
-                          const std::string& lhs_value,
-                          const std::string& rhs_value,
-                          bool ignoring_case) {
-  Message msg;
-  msg << "Expected equality of these values:";
-  msg << "\n  " << lhs_expression;
-  if (lhs_value != lhs_expression) {
-    msg << "\n    Which is: " << lhs_value;
-  }
-  msg << "\n  " << rhs_expression;
-  if (rhs_value != rhs_expression) {
-    msg << "\n    Which is: " << rhs_value;
-  }
-
-  if (ignoring_case) {
-    msg << "\nIgnoring case";
-  }
-
-  if (!lhs_value.empty() && !rhs_value.empty()) {
-    const std::vector<std::string> lhs_lines =
-        SplitEscapedString(lhs_value);
-    const std::vector<std::string> rhs_lines =
-        SplitEscapedString(rhs_value);
-    if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
-      msg << "\nWith diff:\n"
-          << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
-    }
-  }
-
-  return AssertionFailure() << msg;
-}
-
-// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
-std::string GetBoolAssertionFailureMessage(
-    const AssertionResult& assertion_result,
-    const char* expression_text,
-    const char* actual_predicate_value,
-    const char* expected_predicate_value) {
-  const char* actual_message = assertion_result.message();
-  Message msg;
-  msg << "Value of: " << expression_text
-      << "\n  Actual: " << actual_predicate_value;
-  if (actual_message[0] != '\0')
-    msg << " (" << actual_message << ")";
-  msg << "\nExpected: " << expected_predicate_value;
-  return msg.GetString();
-}
-
-// Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
-                                     const char* expr2,
-                                     const char* abs_error_expr,
-                                     double val1,
-                                     double val2,
-                                     double abs_error) {
-  const double diff = fabs(val1 - val2);
-  if (diff <= abs_error) return AssertionSuccess();
-
-  return AssertionFailure()
-      << "The difference between " << expr1 << " and " << expr2
-      << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
-      << expr1 << " evaluates to " << val1 << ",\n"
-      << expr2 << " evaluates to " << val2 << ", and\n"
-      << abs_error_expr << " evaluates to " << abs_error << ".";
-}
-
-
-// Helper template for implementing FloatLE() and DoubleLE().
-template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
-                                const char* expr2,
-                                RawType val1,
-                                RawType val2) {
-  // Returns success if val1 is less than val2,
-  if (val1 < val2) {
-    return AssertionSuccess();
-  }
-
-  // or if val1 is almost equal to val2.
-  const FloatingPoint<RawType> lhs(val1), rhs(val2);
-  if (lhs.AlmostEquals(rhs)) {
-    return AssertionSuccess();
-  }
-
-  // Note that the above two checks will both fail if either val1 or
-  // val2 is NaN, as the IEEE floating-point standard requires that
-  // any predicate involving a NaN must return false.
-
-  ::std::stringstream val1_ss;
-  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val1;
-
-  ::std::stringstream val2_ss;
-  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val2;
-
-  return AssertionFailure()
-      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-      << StringStreamToString(&val2_ss);
-}
-
-}  // namespace internal
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
-                        float val1, float val2) {
-  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
-}
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                         double val1, double val2) {
-  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
-}
-
-namespace internal {
-
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            BiggestInt lhs,
-                            BiggestInt rhs) {
-  if (lhs == rhs) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
-
-#undef GTEST_IMPL_CMP_HELPER_
-
-// The helper function for {ASSERT|EXPECT}_STREQ.
-AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const char* lhs,
-                               const char* rhs) {
-  if (String::CStringEquals(lhs, rhs)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
-}
-
-// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
-AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
-                                   const char* rhs_expression,
-                                   const char* lhs,
-                                   const char* rhs) {
-  if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   true);
-}
-
-// The helper function for {ASSERT|EXPECT}_STRNE.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const char* s1,
-                               const char* s2) {
-  if (!String::CStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  } else {
-    return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                              << s2_expression << "), actual: \""
-                              << s1 << "\" vs \"" << s2 << "\"";
-  }
-}
-
-// The helper function for {ASSERT|EXPECT}_STRCASENE.
-AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
-                                   const char* s2_expression,
-                                   const char* s1,
-                                   const char* s2) {
-  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  } else {
-    return AssertionFailure()
-        << "Expected: (" << s1_expression << ") != ("
-        << s2_expression << ") (ignoring case), actual: \""
-        << s1 << "\" vs \"" << s2 << "\"";
-  }
-}
-
-}  // namespace internal
-
-namespace {
-
-// Helper functions for implementing IsSubString() and IsNotSubstring().
-
-// This group of overloaded functions return true iff needle is a
-// substring of haystack.  NULL is considered a substring of itself
-// only.
-
-bool IsSubstringPred(const char* needle, const char* haystack) {
-  if (needle == nullptr || haystack == nullptr) return needle == haystack;
-
-  return strstr(haystack, needle) != nullptr;
-}
-
-bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
-  if (needle == nullptr || haystack == nullptr) return needle == haystack;
-
-  return wcsstr(haystack, needle) != nullptr;
-}
-
-// StringType here can be either ::std::string or ::std::wstring.
-template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
-                     const StringType& haystack) {
-  return haystack.find(needle) != StringType::npos;
-}
-
-// This function implements either IsSubstring() or IsNotSubstring(),
-// depending on the value of the expected_to_be_substring parameter.
-// StringType here can be const char*, const wchar_t*, ::std::string,
-// or ::std::wstring.
-template <typename StringType>
-AssertionResult IsSubstringImpl(
-    bool expected_to_be_substring,
-    const char* needle_expr, const char* haystack_expr,
-    const StringType& needle, const StringType& haystack) {
-  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
-    return AssertionSuccess();
-
-  const bool is_wide_string = sizeof(needle[0]) > 1;
-  const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
-  return AssertionFailure()
-      << "Value of: " << needle_expr << "\n"
-      << "  Actual: " << begin_string_quote << needle << "\"\n"
-      << "Expected: " << (expected_to_be_substring ? "" : "not ")
-      << "a substring of " << haystack_expr << "\n"
-      << "Which is: " << begin_string_quote << haystack << "\"";
-}
-
-}  // namespace
-
-// IsSubstring() and IsNotSubstring() check whether needle is a
-// substring of haystack (NULL is considered a substring of itself
-// only), and return an appropriate error message when they fail.
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const char* needle, const char* haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const wchar_t* needle, const wchar_t* haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::string& needle, const ::std::string& haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-
-#if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
-  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
-}
-
-AssertionResult IsNotSubstring(
-    const char* needle_expr, const char* haystack_expr,
-    const ::std::wstring& needle, const ::std::wstring& haystack) {
-  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
-}
-#endif  // GTEST_HAS_STD_WSTRING
-
-namespace internal {
-
-#if GTEST_OS_WINDOWS
-
-namespace {
-
-// Helper function for IsHRESULT{SuccessFailure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
-                                     const char* expected,
-                                     long hr) {  // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
-
-  // Windows CE doesn't support FormatMessage.
-  const char error_text[] = "";
-
-# else
-
-  // Looks up the human-readable system message for the HRESULT code
-  // and since we're not passing any params to FormatMessage, we don't
-  // want inserts expanded.
-  const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
-                       FORMAT_MESSAGE_IGNORE_INSERTS;
-  const DWORD kBufSize = 4096;
-  // Gets the system's human readable message string for this HRESULT.
-  char error_text[kBufSize] = { '\0' };
-  DWORD message_length = ::FormatMessageA(kFlags,
-                                          0,   // no source, we're asking system
-                                          hr,  // the error
-                                          0,   // no line width restrictions
-                                          error_text,  // output buffer
-                                          kBufSize,    // buf size
-                                          nullptr);  // no arguments for inserts
-  // Trims tailing white space (FormatMessage leaves a trailing CR-LF)
-  for (; message_length && IsSpace(error_text[message_length - 1]);
-          --message_length) {
-    error_text[message_length - 1] = '\0';
-  }
-
-# endif  // GTEST_OS_WINDOWS_MOBILE
-
-  const std::string error_hex("0x" + String::FormatHexInt(hr));
-  return ::testing::AssertionFailure()
-      << "Expected: " << expr << " " << expected << ".\n"
-      << "  Actual: " << error_hex << " " << error_text << "\n";
-}
-
-}  // namespace
-
-AssertionResult IsHRESULTSuccess(const char* expr, long hr) {  // NOLINT
-  if (SUCCEEDED(hr)) {
-    return AssertionSuccess();
-  }
-  return HRESULTFailureHelper(expr, "succeeds", hr);
-}
-
-AssertionResult IsHRESULTFailure(const char* expr, long hr) {  // NOLINT
-  if (FAILED(hr)) {
-    return AssertionSuccess();
-  }
-  return HRESULTFailureHelper(expr, "fails", hr);
-}
-
-#endif  // GTEST_OS_WINDOWS
-
-// Utility functions for encoding Unicode text (wide strings) in
-// UTF-8.
-
-// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
-// like this:
-//
-// Code-point length   Encoding
-//   0 -  7 bits       0xxxxxxx
-//   8 - 11 bits       110xxxxx 10xxxxxx
-//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
-//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-
-// The maximum code-point a one-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) <<  7) - 1;
-
-// The maximum code-point a two-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
-
-// The maximum code-point a three-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
-
-// The maximum code-point a four-byte UTF-8 sequence can represent.
-const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
-
-// Chops off the n lowest bits from a bit pattern.  Returns the n
-// lowest bits.  As a side effect, the original bit pattern will be
-// shifted to the right by n bits.
-inline UInt32 ChopLowBits(UInt32* bits, int n) {
-  const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
-  *bits >>= n;
-  return low_bits;
-}
-
-// Converts a Unicode code point to a narrow string in UTF-8 encoding.
-// code_point parameter is of type UInt32 because wchar_t may not be
-// wide enough to contain a code point.
-// If the code_point is not a valid Unicode code point
-// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
-// to "(Invalid Unicode 0xXXXXXXXX)".
-std::string CodePointToUtf8(UInt32 code_point) {
-  if (code_point > kMaxCodePoint4) {
-    return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")";
-  }
-
-  char str[5];  // Big enough for the largest valid code point.
-  if (code_point <= kMaxCodePoint1) {
-    str[1] = '\0';
-    str[0] = static_cast<char>(code_point);                          // 0xxxxxxx
-  } else if (code_point <= kMaxCodePoint2) {
-    str[2] = '\0';
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xC0 | code_point);                   // 110xxxxx
-  } else if (code_point <= kMaxCodePoint3) {
-    str[3] = '\0';
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xE0 | code_point);                   // 1110xxxx
-  } else {  // code_point <= kMaxCodePoint4
-    str[4] = '\0';
-    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
-    str[0] = static_cast<char>(0xF0 | code_point);                   // 11110xxx
-  }
-  return str;
-}
-
-// The following two functions only make sense if the system
-// uses UTF-16 for wide string encoding. All supported systems
-// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16.
-
-// Determines if the arguments constitute UTF-16 surrogate pair
-// and thus should be combined into a single Unicode code point
-// using CreateCodePointFromUtf16SurrogatePair.
-inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
-  return sizeof(wchar_t) == 2 &&
-      (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
-}
-
-// Creates a Unicode code point from UTF16 surrogate pair.
-inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
-                                                    wchar_t second) {
-  const auto first_u = static_cast<UInt32>(first);
-  const auto second_u = static_cast<UInt32>(second);
-  const UInt32 mask = (1 << 10) - 1;
-  return (sizeof(wchar_t) == 2)
-             ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000
-             :
-             // This function should not be called when the condition is
-             // false, but we provide a sensible default in case it is.
-             first_u;
-}
-
-// Converts a wide string to a narrow string in UTF-8 encoding.
-// The wide string is assumed to have the following encoding:
-//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
-//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
-// Parameter str points to a null-terminated wide string.
-// Parameter num_chars may additionally limit the number
-// of wchar_t characters processed. -1 is used when the entire string
-// should be processed.
-// If the string contains code points that are not valid Unicode code points
-// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
-// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
-// and contains invalid UTF-16 surrogate pairs, values in those pairs
-// will be encoded as individual Unicode characters from Basic Normal Plane.
-std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
-  if (num_chars == -1)
-    num_chars = static_cast<int>(wcslen(str));
-
-  ::std::stringstream stream;
-  for (int i = 0; i < num_chars; ++i) {
-    UInt32 unicode_code_point;
-
-    if (str[i] == L'\0') {
-      break;
-    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
-      unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
-                                                                 str[i + 1]);
-      i++;
-    } else {
-      unicode_code_point = static_cast<UInt32>(str[i]);
-    }
-
-    stream << CodePointToUtf8(unicode_code_point);
-  }
-  return StringStreamToString(&stream);
-}
-
-// Converts a wide C string to an std::string using the UTF-8 encoding.
-// NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
-  if (wide_c_str == nullptr) return "(null)";
-
-  return internal::WideStringToUtf8(wide_c_str, -1);
-}
-
-// Compares two wide C strings.  Returns true iff they have the same
-// content.
-//
-// Unlike wcscmp(), this function can handle NULL argument(s).  A NULL
-// C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
-  if (lhs == nullptr) return rhs == nullptr;
-
-  if (rhs == nullptr) return false;
-
-  return wcscmp(lhs, rhs) == 0;
-}
-
-// Helper function for *_STREQ on wide strings.
-AssertionResult CmpHelperSTREQ(const char* lhs_expression,
-                               const char* rhs_expression,
-                               const wchar_t* lhs,
-                               const wchar_t* rhs) {
-  if (String::WideCStringEquals(lhs, rhs)) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   PrintToString(lhs),
-                   PrintToString(rhs),
-                   false);
-}
-
-// Helper function for *_STRNE on wide strings.
-AssertionResult CmpHelperSTRNE(const char* s1_expression,
-                               const char* s2_expression,
-                               const wchar_t* s1,
-                               const wchar_t* s2) {
-  if (!String::WideCStringEquals(s1, s2)) {
-    return AssertionSuccess();
-  }
-
-  return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
-                            << s2_expression << "), actual: "
-                            << PrintToString(s1)
-                            << " vs " << PrintToString(s2);
-}
-
-// Compares two C strings, ignoring case.  Returns true iff they have
-// the same content.
-//
-// Unlike strcasecmp(), this function can handle NULL argument(s).  A
-// NULL C string is considered different to any non-NULL C string,
-// including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
-  if (lhs == nullptr) return rhs == nullptr;
-  if (rhs == nullptr) return false;
-  return posix::StrCaseCmp(lhs, rhs) == 0;
-}
-
-  // Compares two wide C strings, ignoring case.  Returns true iff they
-  // have the same content.
-  //
-  // Unlike wcscasecmp(), this function can handle NULL argument(s).
-  // A NULL C string is considered different to any non-NULL wide C string,
-  // including the empty string.
-  // NB: The implementations on different platforms slightly differ.
-  // On windows, this method uses _wcsicmp which compares according to LC_CTYPE
-  // environment variable. On GNU platform this method uses wcscasecmp
-  // which compares according to LC_CTYPE category of the current locale.
-  // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
-  // current locale.
-bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
-                                              const wchar_t* rhs) {
-  if (lhs == nullptr) return rhs == nullptr;
-
-  if (rhs == nullptr) return false;
-
-#if GTEST_OS_WINDOWS
-  return _wcsicmp(lhs, rhs) == 0;
-#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
-  return wcscasecmp(lhs, rhs) == 0;
-#else
-  // Android, Mac OS X and Cygwin don't define wcscasecmp.
-  // Other unknown OSes may not define it either.
-  wint_t left, right;
-  do {
-    left = towlower(*lhs++);
-    right = towlower(*rhs++);
-  } while (left && left == right);
-  return left == right;
-#endif  // OS selector
-}
-
-// Returns true iff str ends with the given suffix, ignoring case.
-// Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
-    const std::string& str, const std::string& suffix) {
-  const size_t str_len = str.length();
-  const size_t suffix_len = suffix.length();
-  return (str_len >= suffix_len) &&
-         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
-                                      suffix.c_str());
-}
-
-// Formats an int value as "%02d".
-std::string String::FormatIntWidth2(int value) {
-  std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << value;
-  return ss.str();
-}
-
-// Formats an int value as "%X".
-std::string String::FormatHexUInt32(UInt32 value) {
-  std::stringstream ss;
-  ss << std::hex << std::uppercase << value;
-  return ss.str();
-}
-
-// Formats an int value as "%X".
-std::string String::FormatHexInt(int value) {
-  return FormatHexUInt32(static_cast<UInt32>(value));
-}
-
-// Formats a byte as "%02X".
-std::string String::FormatByte(unsigned char value) {
-  std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
-     << static_cast<unsigned int>(value);
-  return ss.str();
-}
-
-// Converts the buffer in a stringstream to an std::string, converting NUL
-// bytes to "\\0" along the way.
-std::string StringStreamToString(::std::stringstream* ss) {
-  const ::std::string& str = ss->str();
-  const char* const start = str.c_str();
-  const char* const end = start + str.length();
-
-  std::string result;
-  result.reserve(static_cast<size_t>(2 * (end - start)));
-  for (const char* ch = start; ch != end; ++ch) {
-    if (*ch == '\0') {
-      result += "\\0";  // Replaces NUL with "\\0";
-    } else {
-      result += *ch;
-    }
-  }
-
-  return result;
-}
-
-// Appends the user-supplied message to the Google-Test-generated message.
-std::string AppendUserMessage(const std::string& gtest_msg,
-                              const Message& user_msg) {
-  // Appends the user message if it's non-empty.
-  const std::string user_msg_string = user_msg.GetString();
-  if (user_msg_string.empty()) {
-    return gtest_msg;
-  }
-
-  return gtest_msg + "\n" + user_msg_string;
-}
-
-}  // namespace internal
-
-// class TestResult
-
-// Creates an empty TestResult.
-TestResult::TestResult()
-    : death_test_count_(0),
-      elapsed_time_(0) {
-}
-
-// D'tor.
-TestResult::~TestResult() {
-}
-
-// Returns the i-th test part result among all the results. i can
-// range from 0 to total_part_count() - 1. If i is not in that range,
-// aborts the program.
-const TestPartResult& TestResult::GetTestPartResult(int i) const {
-  if (i < 0 || i >= total_part_count())
-    internal::posix::Abort();
-  return test_part_results_.at(static_cast<size_t>(i));
-}
-
-// Returns the i-th test property. i can range from 0 to
-// test_property_count() - 1. If i is not in that range, aborts the
-// program.
-const TestProperty& TestResult::GetTestProperty(int i) const {
-  if (i < 0 || i >= test_property_count())
-    internal::posix::Abort();
-  return test_properties_.at(static_cast<size_t>(i));
-}
-
-// Clears the test part results.
-void TestResult::ClearTestPartResults() {
-  test_part_results_.clear();
-}
-
-// Adds a test part result to the list.
-void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
-  test_part_results_.push_back(test_part_result);
-}
-
-// Adds a test property to the list. If a property with the same key as the
-// supplied property is already represented, the value of this test_property
-// replaces the old value for that key.
-void TestResult::RecordProperty(const std::string& xml_element,
-                                const TestProperty& test_property) {
-  if (!ValidateTestProperty(xml_element, test_property)) {
-    return;
-  }
-  internal::MutexLock lock(&test_properites_mutex_);
-  const std::vector<TestProperty>::iterator property_with_matching_key =
-      std::find_if(test_properties_.begin(), test_properties_.end(),
-                   internal::TestPropertyKeyIs(test_property.key()));
-  if (property_with_matching_key == test_properties_.end()) {
-    test_properties_.push_back(test_property);
-    return;
-  }
-  property_with_matching_key->SetValue(test_property.value());
-}
-
-// The list of reserved attributes used in the <testsuites> element of XML
-// output.
-static const char* const kReservedTestSuitesAttributes[] = {
-  "disabled",
-  "errors",
-  "failures",
-  "name",
-  "random_seed",
-  "tests",
-  "time",
-  "timestamp"
-};
-
-// The list of reserved attributes used in the <testsuite> element of XML
-// output.
-static const char* const kReservedTestSuiteAttributes[] = {
-  "disabled",
-  "errors",
-  "failures",
-  "name",
-  "tests",
-  "time"
-};
-
-// The list of reserved attributes used in the <testcase> element of XML output.
-static const char* const kReservedTestCaseAttributes[] = {
-    "classname",   "name", "status", "time",  "type_param",
-    "value_param", "file", "line"};
-
-// Use a slightly different set for allowed output to ensure existing tests can
-// still RecordProperty("result")
-static const char* const kReservedOutputTestCaseAttributes[] = {
-    "classname",   "name", "status", "time",  "type_param",
-    "value_param", "file", "line", "result"};
-
-template <int kSize>
-std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
-  return std::vector<std::string>(array, array + kSize);
-}
-
-static std::vector<std::string> GetReservedAttributesForElement(
-    const std::string& xml_element) {
-  if (xml_element == "testsuites") {
-    return ArrayAsVector(kReservedTestSuitesAttributes);
-  } else if (xml_element == "testsuite") {
-    return ArrayAsVector(kReservedTestSuiteAttributes);
-  } else if (xml_element == "testcase") {
-    return ArrayAsVector(kReservedTestCaseAttributes);
-  } else {
-    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
-  }
-  // This code is unreachable but some compilers may not realizes that.
-  return std::vector<std::string>();
-}
-
-// TODO(jdesprez): Merge the two getReserved attributes once skip is improved
-static std::vector<std::string> GetReservedOutputAttributesForElement(
-    const std::string& xml_element) {
-  if (xml_element == "testsuites") {
-    return ArrayAsVector(kReservedTestSuitesAttributes);
-  } else if (xml_element == "testsuite") {
-    return ArrayAsVector(kReservedTestSuiteAttributes);
-  } else if (xml_element == "testcase") {
-    return ArrayAsVector(kReservedOutputTestCaseAttributes);
-  } else {
-    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
-  }
-  // This code is unreachable but some compilers may not realizes that.
-  return std::vector<std::string>();
-}
-
-static std::string FormatWordList(const std::vector<std::string>& words) {
-  Message word_list;
-  for (size_t i = 0; i < words.size(); ++i) {
-    if (i > 0 && words.size() > 2) {
-      word_list << ", ";
-    }
-    if (i == words.size() - 1) {
-      word_list << "and ";
-    }
-    word_list << "'" << words[i] << "'";
-  }
-  return word_list.GetString();
-}
-
-static bool ValidateTestPropertyName(
-    const std::string& property_name,
-    const std::vector<std::string>& reserved_names) {
-  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
-          reserved_names.end()) {
-    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
-                  << " (" << FormatWordList(reserved_names)
-                  << " are reserved by " << GTEST_NAME_ << ")";
-    return false;
-  }
-  return true;
-}
-
-// Adds a failure if the key is a reserved attribute of the element named
-// xml_element.  Returns true if the property is valid.
-bool TestResult::ValidateTestProperty(const std::string& xml_element,
-                                      const TestProperty& test_property) {
-  return ValidateTestPropertyName(test_property.key(),
-                                  GetReservedAttributesForElement(xml_element));
-}
-
-// Clears the object.
-void TestResult::Clear() {
-  test_part_results_.clear();
-  test_properties_.clear();
-  death_test_count_ = 0;
-  elapsed_time_ = 0;
-}
-
-// Returns true off the test part was skipped.
-static bool TestPartSkipped(const TestPartResult& result) {
-  return result.skipped();
-}
-
-// Returns true iff the test was skipped.
-bool TestResult::Skipped() const {
-  return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0;
-}
-
-// Returns true iff the test failed.
-bool TestResult::Failed() const {
-  for (int i = 0; i < total_part_count(); ++i) {
-    if (GetTestPartResult(i).failed())
-      return true;
-  }
-  return false;
-}
-
-// Returns true iff the test part fatally failed.
-static bool TestPartFatallyFailed(const TestPartResult& result) {
-  return result.fatally_failed();
-}
-
-// Returns true iff the test fatally failed.
-bool TestResult::HasFatalFailure() const {
-  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
-}
-
-// Returns true iff the test part non-fatally failed.
-static bool TestPartNonfatallyFailed(const TestPartResult& result) {
-  return result.nonfatally_failed();
-}
-
-// Returns true iff the test has a non-fatal failure.
-bool TestResult::HasNonfatalFailure() const {
-  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
-}
-
-// Gets the number of all test parts.  This is the sum of the number
-// of successful test parts and the number of failed test parts.
-int TestResult::total_part_count() const {
-  return static_cast<int>(test_part_results_.size());
-}
-
-// Returns the number of the test properties.
-int TestResult::test_property_count() const {
-  return static_cast<int>(test_properties_.size());
-}
-
-// class Test
-
-// Creates a Test object.
-
-// The c'tor saves the states of all flags.
-Test::Test()
-    : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
-}
-
-// The d'tor restores the states of all flags.  The actual work is
-// done by the d'tor of the gtest_flag_saver_ field, and thus not
-// visible here.
-Test::~Test() {
-}
-
-// Sets up the test fixture.
-//
-// A sub-class may override this.
-void Test::SetUp() {
-}
-
-// Tears down the test fixture.
-//
-// A sub-class may override this.
-void Test::TearDown() {
-}
-
-// Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string& key, const std::string& value) {
-  UnitTest::GetInstance()->RecordProperty(key, value);
-}
-
-// Allows user supplied key value pairs to be recorded for later output.
-void Test::RecordProperty(const std::string& key, int value) {
-  Message value_message;
-  value_message << value;
-  RecordProperty(key, value_message.GetString().c_str());
-}
-
-namespace internal {
-
-void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
-                                    const std::string& message) {
-  // This function is a friend of UnitTest and as such has access to
-  // AddTestPartResult.
-  UnitTest::GetInstance()->AddTestPartResult(
-      result_type,
-      nullptr,  // No info about the source file where the exception occurred.
-      -1,       // We have no info on which line caused the exception.
-      message,
-      "");  // No stack trace, either.
-}
-
-}  // namespace internal
-
-// Google Test requires all tests in the same test suite to use the same test
-// fixture class.  This function checks if the current test has the
-// same fixture class as the first test in the current test suite.  If
-// yes, it returns true; otherwise it generates a Google Test failure and
-// returns false.
-bool Test::HasSameFixtureClass() {
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  const TestSuite* const test_suite = impl->current_test_suite();
-
-  // Info about the first test in the current test suite.
-  const TestInfo* const first_test_info = test_suite->test_info_list()[0];
-  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
-  const char* const first_test_name = first_test_info->name();
-
-  // Info about the current test.
-  const TestInfo* const this_test_info = impl->current_test_info();
-  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
-  const char* const this_test_name = this_test_info->name();
-
-  if (this_fixture_id != first_fixture_id) {
-    // Is the first test defined using TEST?
-    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
-    // Is this test defined using TEST?
-    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
-
-    if (first_is_TEST || this_is_TEST) {
-      // Both TEST and TEST_F appear in same test suite, which is incorrect.
-      // Tell the user how to fix this.
-
-      // Gets the name of the TEST and the name of the TEST_F.  Note
-      // that first_is_TEST and this_is_TEST cannot both be true, as
-      // the fixture IDs are different for the two tests.
-      const char* const TEST_name =
-          first_is_TEST ? first_test_name : this_test_name;
-      const char* const TEST_F_name =
-          first_is_TEST ? this_test_name : first_test_name;
-
-      ADD_FAILURE()
-          << "All tests in the same test suite must use the same test fixture\n"
-          << "class, so mixing TEST_F and TEST in the same test suite is\n"
-          << "illegal.  In test suite " << this_test_info->test_suite_name()
-          << ",\n"
-          << "test " << TEST_F_name << " is defined using TEST_F but\n"
-          << "test " << TEST_name << " is defined using TEST.  You probably\n"
-          << "want to change the TEST to TEST_F or move it to another test\n"
-          << "case.";
-    } else {
-      // Two fixture classes with the same name appear in two different
-      // namespaces, which is not allowed. Tell the user how to fix this.
-      ADD_FAILURE()
-          << "All tests in the same test suite must use the same test fixture\n"
-          << "class.  However, in test suite "
-          << this_test_info->test_suite_name() << ",\n"
-          << "you defined test " << first_test_name << " and test "
-          << this_test_name << "\n"
-          << "using two different test fixture classes.  This can happen if\n"
-          << "the two classes are from different namespaces or translation\n"
-          << "units and have the same name.  You should probably rename one\n"
-          << "of the classes to put the tests into different test suites.";
-    }
-    return false;
-  }
-
-  return true;
-}
-
-#if GTEST_HAS_SEH
-
-// Adds an "exception thrown" fatal failure to the current test.  This
-// function returns its result via an output parameter pointer because VC++
-// prohibits creation of objects with destructors on stack in functions
-// using __try (see error C2712).
-static std::string* FormatSehExceptionMessage(DWORD exception_code,
-                                              const char* location) {
-  Message message;
-  message << "SEH exception with code 0x" << std::setbase(16) <<
-    exception_code << std::setbase(10) << " thrown in " << location << ".";
-
-  return new std::string(message.GetString());
-}
-
-#endif  // GTEST_HAS_SEH
-
-namespace internal {
-
-#if GTEST_HAS_EXCEPTIONS
-
-// Adds an "exception thrown" fatal failure to the current test.
-static std::string FormatCxxExceptionMessage(const char* description,
-                                             const char* location) {
-  Message message;
-  if (description != nullptr) {
-    message << "C++ exception with description \"" << description << "\"";
-  } else {
-    message << "Unknown C++ exception";
-  }
-  message << " thrown in " << location << ".";
-
-  return message.GetString();
-}
-
-static std::string PrintTestPartResultToString(
-    const TestPartResult& test_part_result);
-
-GoogleTestFailureException::GoogleTestFailureException(
-    const TestPartResult& failure)
-    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
-
-#endif  // GTEST_HAS_EXCEPTIONS
-
-// We put these helper functions in the internal namespace as IBM's xlC
-// compiler rejects the code if they were declared static.
-
-// Runs the given method and handles SEH exceptions it throws, when
-// SEH is supported; returns the 0-value for type Result in case of an
-// SEH exception.  (Microsoft compilers cannot handle SEH and C++
-// exceptions in the same function.  Therefore, we provide a separate
-// wrapper function for handling SEH exceptions.)
-template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
-#if GTEST_HAS_SEH
-  __try {
-    return (object->*method)();
-  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
-      GetExceptionCode())) {
-    // We create the exception message on the heap because VC++ prohibits
-    // creation of objects with destructors on stack in functions using __try
-    // (see error C2712).
-    std::string* exception_message = FormatSehExceptionMessage(
-        GetExceptionCode(), location);
-    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
-                                             *exception_message);
-    delete exception_message;
-    return static_cast<Result>(0);
-  }
-#else
-  (void)location;
-  return (object->*method)();
-#endif  // GTEST_HAS_SEH
-}
-
-// Runs the given method and catches and reports C++ and/or SEH-style
-// exceptions, if they are supported; returns the 0-value for type
-// Result in case of an SEH exception.
-template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
-    T* object, Result (T::*method)(), const char* location) {
-  // NOTE: The user code can affect the way in which Google Test handles
-  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
-  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
-  // after the exception is caught and either report or re-throw the
-  // exception based on the flag's value:
-  //
-  // try {
-  //   // Perform the test method.
-  // } catch (...) {
-  //   if (GTEST_FLAG(catch_exceptions))
-  //     // Report the exception as failure.
-  //   else
-  //     throw;  // Re-throws the original exception.
-  // }
-  //
-  // However, the purpose of this flag is to allow the program to drop into
-  // the debugger when the exception is thrown. On most platforms, once the
-  // control enters the catch block, the exception origin information is
-  // lost and the debugger will stop the program at the point of the
-  // re-throw in this function -- instead of at the point of the original
-  // throw statement in the code under test.  For this reason, we perform
-  // the check early, sacrificing the ability to affect Google Test's
-  // exception handling in the method where the exception is thrown.
-  if (internal::GetUnitTestImpl()->catch_exceptions()) {
-#if GTEST_HAS_EXCEPTIONS
-    try {
-      return HandleSehExceptionsInMethodIfSupported(object, method, location);
-    } catch (const AssertionException&) {  // NOLINT
-      // This failure was reported already.
-    } catch (const internal::GoogleTestFailureException&) {  // NOLINT
-      // This exception type can only be thrown by a failed Google
-      // Test assertion with the intention of letting another testing
-      // framework catch it.  Therefore we just re-throw it.
-      throw;
-    } catch (const std::exception& e) {  // NOLINT
-      internal::ReportFailureInUnknownLocation(
-          TestPartResult::kFatalFailure,
-          FormatCxxExceptionMessage(e.what(), location));
-    } catch (...) {  // NOLINT
-      internal::ReportFailureInUnknownLocation(
-          TestPartResult::kFatalFailure,
-          FormatCxxExceptionMessage(nullptr, location));
-    }
-    return static_cast<Result>(0);
-#else
-    return HandleSehExceptionsInMethodIfSupported(object, method, location);
-#endif  // GTEST_HAS_EXCEPTIONS
-  } else {
-    return (object->*method)();
-  }
-}
-
-}  // namespace internal
-
-// Runs the test and updates the test result.
-void Test::Run() {
-  if (!HasSameFixtureClass()) return;
-
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
-  // We will run the test only if SetUp() was successful and didn't call
-  // GTEST_SKIP().
-  if (!HasFatalFailure() && !IsSkipped()) {
-    impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(
-        this, &Test::TestBody, "the test body");
-  }
-
-  // However, we want to clean up as much as possible.  Hence we will
-  // always call TearDown(), even if SetUp() or the test body has
-  // failed.
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &Test::TearDown, "TearDown()");
-}
-
-// Returns true iff the current test has a fatal failure.
-bool Test::HasFatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
-}
-
-// Returns true iff the current test has a non-fatal failure.
-bool Test::HasNonfatalFailure() {
-  return internal::GetUnitTestImpl()->current_test_result()->
-      HasNonfatalFailure();
-}
-
-// Returns true iff the current test was skipped.
-bool Test::IsSkipped() {
-  return internal::GetUnitTestImpl()->current_test_result()->Skipped();
-}
-
-// class TestInfo
-
-// Constructs a TestInfo object. It assumes ownership of the test factory
-// object.
-TestInfo::TestInfo(const std::string& a_test_suite_name,
-                   const std::string& a_name, const char* a_type_param,
-                   const char* a_value_param,
-                   internal::CodeLocation a_code_location,
-                   internal::TypeId fixture_class_id,
-                   internal::TestFactoryBase* factory)
-    : test_suite_name_(a_test_suite_name),
-      name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
-      value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
-      location_(a_code_location),
-      fixture_class_id_(fixture_class_id),
-      should_run_(false),
-      is_disabled_(false),
-      matches_filter_(false),
-      factory_(factory),
-      result_() {}
-
-// Destructs a TestInfo object.
-TestInfo::~TestInfo() { delete factory_; }
-
-namespace internal {
-
-// Creates a new TestInfo object and registers it with Google Test;
-// returns the created object.
-//
-// Arguments:
-//
-//   test_suite_name:   name of the test suite
-//   name:             name of the test
-//   type_param:       the name of the test's type parameter, or NULL if
-//                     this is not a typed or a type-parameterized test.
-//   value_param:      text representation of the test's value parameter,
-//                     or NULL if this is not a value-parameterized test.
-//   code_location:    code location where the test is defined
-//   fixture_class_id: ID of the test fixture class
-//   set_up_tc:        pointer to the function that sets up the test suite
-//   tear_down_tc:     pointer to the function that tears down the test suite
-//   factory:          pointer to the factory that creates a test object.
-//                     The newly created TestInfo instance will assume
-//                     ownership of the factory object.
-TestInfo* MakeAndRegisterTestInfo(
-    const char* test_suite_name, const char* name, const char* type_param,
-    const char* value_param, CodeLocation code_location,
-    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
-    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase* factory) {
-  TestInfo* const test_info =
-      new TestInfo(test_suite_name, name, type_param, value_param,
-                   code_location, fixture_class_id, factory);
-  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
-  return test_info;
-}
-
-void ReportInvalidTestSuiteType(const char* test_suite_name,
-                                CodeLocation code_location) {
-  Message errors;
-  errors
-      << "Attempted redefinition of test suite " << test_suite_name << ".\n"
-      << "All tests in the same test suite must use the same test fixture\n"
-      << "class.  However, in test suite " << test_suite_name << ", you tried\n"
-      << "to define a test using a fixture class different from the one\n"
-      << "used earlier. This can happen if the two fixture classes are\n"
-      << "from different namespaces and have the same name. You should\n"
-      << "probably rename one of the classes to put the tests into different\n"
-      << "test suites.";
-
-  GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(),
-                                          code_location.line)
-                    << " " << errors.GetString();
-}
-}  // namespace internal
-
-namespace {
-
-// A predicate that checks the test name of a TestInfo against a known
-// value.
-//
-// This is used for implementation of the TestSuite class only.  We put
-// it in the anonymous namespace to prevent polluting the outer
-// namespace.
-//
-// TestNameIs is copyable.
-class TestNameIs {
- public:
-  // Constructor.
-  //
-  // TestNameIs has NO default constructor.
-  explicit TestNameIs(const char* name)
-      : name_(name) {}
-
-  // Returns true iff the test name of test_info matches name_.
-  bool operator()(const TestInfo * test_info) const {
-    return test_info && test_info->name() == name_;
-  }
-
- private:
-  std::string name_;
-};
-
-}  // namespace
-
-namespace internal {
-
-// This method expands all parameterized tests registered with macros TEST_P
-// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those.
-// This will be done just once during the program runtime.
-void UnitTestImpl::RegisterParameterizedTests() {
-  if (!parameterized_tests_registered_) {
-    parameterized_test_registry_.RegisterTests();
-    parameterized_tests_registered_ = true;
-  }
-}
-
-}  // namespace internal
-
-// Creates the test object, runs it, records its result, and then
-// deletes it.
-void TestInfo::Run() {
-  if (!should_run_) return;
-
-  // Tells UnitTest where to store test result.
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->set_current_test_info(this);
-
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
-  // Notifies the unit test event listeners that a test is about to start.
-  repeater->OnTestStart(*this);
-
-  const TimeInMillis start = internal::GetTimeInMillis();
-
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-
-  // Creates the test object.
-  Test* const test = internal::HandleExceptionsInMethodIfSupported(
-      factory_, &internal::TestFactoryBase::CreateTest,
-      "the test fixture's constructor");
-
-  // Runs the test if the constructor didn't generate a fatal failure or invoke
-  // GTEST_SKIP().
-  // Note that the object will not be null
-  if (!Test::HasFatalFailure() && !Test::IsSkipped()) {
-    // This doesn't throw as all user code that can throw are wrapped into
-    // exception handling code.
-    test->Run();
-  }
-
-  if (test != nullptr) {
-    // Deletes the test object.
-    impl->os_stack_trace_getter()->UponLeavingGTest();
-    internal::HandleExceptionsInMethodIfSupported(
-        test, &Test::DeleteSelf_, "the test fixture's destructor");
-  }
-
-  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
-
-  // Notifies the unit test event listener that a test has just finished.
-  repeater->OnTestEnd(*this);
-
-  // Tells UnitTest to stop associating assertion results to this
-  // test.
-  impl->set_current_test_info(nullptr);
-}
-
-// class TestSuite
-
-// Gets the number of successful tests in this test suite.
-int TestSuite::successful_test_count() const {
-  return CountIf(test_info_list_, TestPassed);
-}
-
-// Gets the number of successful tests in this test suite.
-int TestSuite::skipped_test_count() const {
-  return CountIf(test_info_list_, TestSkipped);
-}
-
-// Gets the number of failed tests in this test suite.
-int TestSuite::failed_test_count() const {
-  return CountIf(test_info_list_, TestFailed);
-}
-
-// Gets the number of disabled tests that will be reported in the XML report.
-int TestSuite::reportable_disabled_test_count() const {
-  return CountIf(test_info_list_, TestReportableDisabled);
-}
-
-// Gets the number of disabled tests in this test suite.
-int TestSuite::disabled_test_count() const {
-  return CountIf(test_info_list_, TestDisabled);
-}
-
-// Gets the number of tests to be printed in the XML report.
-int TestSuite::reportable_test_count() const {
-  return CountIf(test_info_list_, TestReportable);
-}
-
-// Get the number of tests in this test suite that should run.
-int TestSuite::test_to_run_count() const {
-  return CountIf(test_info_list_, ShouldRunTest);
-}
-
-// Gets the number of all tests.
-int TestSuite::total_test_count() const {
-  return static_cast<int>(test_info_list_.size());
-}
-
-// Creates a TestSuite with the given name.
-//
-// Arguments:
-//
-//   name:         name of the test suite
-//   a_type_param: the name of the test suite's type parameter, or NULL if
-//                 this is not a typed or a type-parameterized test suite.
-//   set_up_tc:    pointer to the function that sets up the test suite
-//   tear_down_tc: pointer to the function that tears down the test suite
-TestSuite::TestSuite(const char* a_name, const char* a_type_param,
-                     internal::SetUpTestSuiteFunc set_up_tc,
-                     internal::TearDownTestSuiteFunc tear_down_tc)
-    : name_(a_name),
-      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
-      set_up_tc_(set_up_tc),
-      tear_down_tc_(tear_down_tc),
-      should_run_(false),
-      elapsed_time_(0) {}
-
-// Destructor of TestSuite.
-TestSuite::~TestSuite() {
-  // Deletes every Test in the collection.
-  ForEach(test_info_list_, internal::Delete<TestInfo>);
-}
-
-// Returns the i-th test among all the tests. i can range from 0 to
-// total_test_count() - 1. If i is not in that range, returns NULL.
-const TestInfo* TestSuite::GetTestInfo(int i) const {
-  const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
-}
-
-// Returns the i-th test among all the tests. i can range from 0 to
-// total_test_count() - 1. If i is not in that range, returns NULL.
-TestInfo* TestSuite::GetMutableTestInfo(int i) {
-  const int index = GetElementOr(test_indices_, i, -1);
-  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
-}
-
-// Adds a test to this test suite.  Will delete the test upon
-// destruction of the TestSuite object.
-void TestSuite::AddTestInfo(TestInfo* test_info) {
-  test_info_list_.push_back(test_info);
-  test_indices_.push_back(static_cast<int>(test_indices_.size()));
-}
-
-// Runs every test in this TestSuite.
-void TestSuite::Run() {
-  if (!should_run_) return;
-
-  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
-  impl->set_current_test_suite(this);
-
-  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
-  // Call both legacy and the new API
-  repeater->OnTestSuiteStart(*this);
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
-  repeater->OnTestCaseStart(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
-
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
-
-  const internal::TimeInMillis start = internal::GetTimeInMillis();
-  for (int i = 0; i < total_test_count(); i++) {
-    GetMutableTestInfo(i)->Run();
-  }
-  elapsed_time_ = internal::GetTimeInMillis() - start;
-
-  impl->os_stack_trace_getter()->UponLeavingGTest();
-  internal::HandleExceptionsInMethodIfSupported(
-      this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()");
-
-  // Call both legacy and the new API
-  repeater->OnTestSuiteEnd(*this);
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
-  repeater->OnTestCaseEnd(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
-
-  impl->set_current_test_suite(nullptr);
-}
-
-// Clears the results of all tests in this test suite.
-void TestSuite::ClearResult() {
-  ad_hoc_test_result_.Clear();
-  ForEach(test_info_list_, TestInfo::ClearTestResult);
-}
-
-// Shuffles the tests in this test suite.
-void TestSuite::ShuffleTests(internal::Random* random) {
-  Shuffle(random, &test_indices_);
-}
-
-// Restores the test order to before the first shuffle.
-void TestSuite::UnshuffleTests() {
-  for (size_t i = 0; i < test_indices_.size(); i++) {
-    test_indices_[i] = static_cast<int>(i);
-  }
-}
-
-// Formats a countable noun.  Depending on its quantity, either the
-// singular form or the plural form is used. e.g.
-//
-// FormatCountableNoun(1, "formula", "formuli") returns "1 formula".
-// FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count,
-                                       const char * singular_form,
-                                       const char * plural_form) {
-  return internal::StreamableToString(count) + " " +
-      (count == 1 ? singular_form : plural_form);
-}
-
-// Formats the count of tests.
-static std::string FormatTestCount(int test_count) {
-  return FormatCountableNoun(test_count, "test", "tests");
-}
-
-// Formats the count of test suites.
-static std::string FormatTestSuiteCount(int test_suite_count) {
-  return FormatCountableNoun(test_suite_count, "test suite", "test suites");
-}
-
-// Converts a TestPartResult::Type enum to human-friendly string
-// representation.  Both kNonFatalFailure and kFatalFailure are translated
-// to "Failure", as the user usually doesn't care about the difference
-// between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
-  switch (type) {
-    case TestPartResult::kSkip:
-      return "Skipped";
-    case TestPartResult::kSuccess:
-      return "Success";
-
-    case TestPartResult::kNonFatalFailure:
-    case TestPartResult::kFatalFailure:
-#ifdef _MSC_VER
-      return "error: ";
-#else
-      return "Failure\n";
-#endif
-    default:
-      return "Unknown result type";
-  }
-}
-
-namespace internal {
-
-// Prints a TestPartResult to an std::string.
-static std::string PrintTestPartResultToString(
-    const TestPartResult& test_part_result) {
-  return (Message()
-          << internal::FormatFileLocation(test_part_result.file_name(),
-                                          test_part_result.line_number())
-          << " " << TestPartResultTypeToString(test_part_result.type())
-          << test_part_result.message()).GetString();
-}
-
-// Prints a TestPartResult.
-static void PrintTestPartResult(const TestPartResult& test_part_result) {
-  const std::string& result =
-      PrintTestPartResultToString(test_part_result);
-  printf("%s\n", result.c_str());
-  fflush(stdout);
-  // If the test program runs in Visual Studio or a debugger, the
-  // following statements add the test part result message to the Output
-  // window such that the user can double-click on it to jump to the
-  // corresponding source code location; otherwise they do nothing.
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  // We don't call OutputDebugString*() on Windows Mobile, as printing
-  // to stdout is done by OutputDebugString() there already - we don't
-  // want the same message printed twice.
-  ::OutputDebugStringA(result.c_str());
-  ::OutputDebugStringA("\n");
-#endif
-}
-
-// class PrettyUnitTestResultPrinter
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
-    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
-
-// Returns the character attribute for the given color.
-static WORD GetColorAttribute(GTestColor color) {
-  switch (color) {
-    case COLOR_RED:    return FOREGROUND_RED;
-    case COLOR_GREEN:  return FOREGROUND_GREEN;
-    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
-    default:           return 0;
-  }
-}
-
-static int GetBitOffset(WORD color_mask) {
-  if (color_mask == 0) return 0;
-
-  int bitOffset = 0;
-  while ((color_mask & 1) == 0) {
-    color_mask >>= 1;
-    ++bitOffset;
-  }
-  return bitOffset;
-}
-
-static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
-  // Let's reuse the BG
-  static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
-                                      BACKGROUND_RED | BACKGROUND_INTENSITY;
-  static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
-                                      FOREGROUND_RED | FOREGROUND_INTENSITY;
-  const WORD existing_bg = old_color_attrs & background_mask;
-
-  WORD new_color =
-      GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY;
-  static const int bg_bitOffset = GetBitOffset(background_mask);
-  static const int fg_bitOffset = GetBitOffset(foreground_mask);
-
-  if (((new_color & background_mask) >> bg_bitOffset) ==
-      ((new_color & foreground_mask) >> fg_bitOffset)) {
-    new_color ^= FOREGROUND_INTENSITY;  // invert intensity
-  }
-  return new_color;
-}
-
-#else
-
-// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
-// an invalid input.
-static const char* GetAnsiColorCode(GTestColor color) {
-  switch (color) {
-    case COLOR_RED:     return "1";
-    case COLOR_GREEN:   return "2";
-    case COLOR_YELLOW:  return "3";
-    default:
-      return nullptr;
-  }
-}
-
-#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-
-// Returns true iff Google Test should use colors in the output.
-bool ShouldUseColor(bool stdout_is_tty) {
-  const char* const gtest_color = GTEST_FLAG(color).c_str();
-
-  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
-    // On Windows the TERM variable is usually not set, but the
-    // console there does support colors.
-    return stdout_is_tty;
-#else
-    // On non-Windows platforms, we rely on the TERM variable.
-    const char* const term = posix::GetEnv("TERM");
-    const bool term_supports_color =
-        String::CStringEquals(term, "xterm") ||
-        String::CStringEquals(term, "xterm-color") ||
-        String::CStringEquals(term, "xterm-256color") ||
-        String::CStringEquals(term, "screen") ||
-        String::CStringEquals(term, "screen-256color") ||
-        String::CStringEquals(term, "tmux") ||
-        String::CStringEquals(term, "tmux-256color") ||
-        String::CStringEquals(term, "rxvt-unicode") ||
-        String::CStringEquals(term, "rxvt-unicode-256color") ||
-        String::CStringEquals(term, "linux") ||
-        String::CStringEquals(term, "cygwin");
-    return stdout_is_tty && term_supports_color;
-#endif  // GTEST_OS_WINDOWS
-  }
-
-  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
-      String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
-      String::CStringEquals(gtest_color, "1");
-  // We take "yes", "true", "t", and "1" as meaning "yes".  If the
-  // value is neither one of these nor "auto", we treat it as "no" to
-  // be conservative.
-}
-
-// Helpers for printing colored strings to stdout. Note that on Windows, we
-// cannot simply emit special characters and have the terminal change colors.
-// This routine must actually emit the characters rather than return a string
-// that would be colored when printed, as can be done on Linux.
-void ColoredPrintf(GTestColor color, const char* fmt, ...) {
-  va_list args;
-  va_start(args, fmt);
-
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
-    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
-  const bool use_color = AlwaysFalse();
-#else
-  static const bool in_color_mode =
-      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
-  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
-#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
-
-  if (!use_color) {
-    vprintf(fmt, args);
-    va_end(args);
-    return;
-  }
-
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
-    !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
-  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
-
-  // Gets the current text color.
-  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
-  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
-  const WORD old_color_attrs = buffer_info.wAttributes;
-  const WORD new_color = GetNewColor(color, old_color_attrs);
-
-  // We need to flush the stream buffers into the console before each
-  // SetConsoleTextAttribute call lest it affect the text that is already
-  // printed but has not yet reached the console.
-  fflush(stdout);
-  SetConsoleTextAttribute(stdout_handle, new_color);
-
-  vprintf(fmt, args);
-
-  fflush(stdout);
-  // Restores the text color.
-  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
-#else
-  printf("\033[0;3%sm", GetAnsiColorCode(color));
-  vprintf(fmt, args);
-  printf("\033[m");  // Resets the terminal to default.
-#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
-  va_end(args);
-}
-
-// Text printed in Google Test's text output and --gtest_list_tests
-// output to label the type parameter and value parameter for a test.
-static const char kTypeParamLabel[] = "TypeParam";
-static const char kValueParamLabel[] = "GetParam()";
-
-static void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
-  const char* const type_param = test_info.type_param();
-  const char* const value_param = test_info.value_param();
-
-  if (type_param != nullptr || value_param != nullptr) {
-    printf(", where ");
-    if (type_param != nullptr) {
-      printf("%s = %s", kTypeParamLabel, type_param);
-      if (value_param != nullptr) printf(" and ");
-    }
-    if (value_param != nullptr) {
-      printf("%s = %s", kValueParamLabel, value_param);
-    }
-  }
-}
-
-// This class implements the TestEventListener interface.
-//
-// Class PrettyUnitTestResultPrinter is copyable.
-class PrettyUnitTestResultPrinter : public TestEventListener {
- public:
-  PrettyUnitTestResultPrinter() {}
-  static void PrintTestName(const char* test_suite, const char* test) {
-    printf("%s.%s", test_suite, test);
-  }
-
-  // The following methods override what's in the TestEventListener class.
-  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
-  void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
-  void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
-  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
-  void OnTestCaseStart(const TestSuite& test_suite) override;
-  void OnTestStart(const TestInfo& test_info) override;
-  void OnTestPartResult(const TestPartResult& result) override;
-  void OnTestEnd(const TestInfo& test_info) override;
-  void OnTestCaseEnd(const TestSuite& test_suite) override;
-  void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
-  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
-  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
-  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
-
- private:
-  static void PrintFailedTests(const UnitTest& unit_test);
-  static void PrintSkippedTests(const UnitTest& unit_test);
-};
-
-  // Fired before each iteration of tests starts.
-void PrettyUnitTestResultPrinter::OnTestIterationStart(
-    const UnitTest& unit_test, int iteration) {
-  if (GTEST_FLAG(repeat) != 1)
-    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
-
-  const char* const filter = GTEST_FLAG(filter).c_str();
-
-  // Prints the filter if it's not *.  This reminds the user that some
-  // tests may be skipped.
-  if (!String::CStringEquals(filter, kUniversalFilter)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
-  }
-
-  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
-    const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: This is test shard %d of %s.\n",
-                  static_cast<int>(shard_index) + 1,
-                  internal::posix::GetEnv(kTestTotalShards));
-  }
-
-  if (GTEST_FLAG(shuffle)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: Randomizing tests' orders with a seed of %d .\n",
-                  unit_test.random_seed());
-  }
-
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
-  printf("Running %s from %s.\n",
-         FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
-  printf("Global test environment set-up.\n");
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestSuite& test_suite) {
-  const std::string counts =
-      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
-  printf("%s from %s", counts.c_str(), test_suite.name());
-  if (test_suite.type_param() == nullptr) {
-    printf("\n");
-  } else {
-    printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param());
-  }
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
-  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
-  PrintTestName(test_info.test_suite_name(), test_info.name());
-  printf("\n");
-  fflush(stdout);
-}
-
-// Called after an assertion failure.
-void PrettyUnitTestResultPrinter::OnTestPartResult(
-    const TestPartResult& result) {
-  switch (result.type()) {
-    // If the test part succeeded, or was skipped,
-    // we don't need to do anything.
-    case TestPartResult::kSkip:
-    case TestPartResult::kSuccess:
-      return;
-    default:
-      // Print failure message from the assertion
-      // (e.g. expected this and got that).
-      PrintTestPartResult(result);
-      fflush(stdout);
-  }
-}
-
-void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
-  if (test_info.result()->Passed()) {
-    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
-  } else if (test_info.result()->Skipped()) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
-  } else {
-    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
-  }
-  PrintTestName(test_info.test_suite_name(), test_info.name());
-  if (test_info.result()->Failed())
-    PrintFullTestCommentIfPresent(test_info);
-
-  if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms)\n", internal::StreamableToString(
-           test_info.result()->elapsed_time()).c_str());
-  } else {
-    printf("\n");
-  }
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestSuite& test_suite) {
-  if (!GTEST_FLAG(print_time)) return;
-
-  const std::string counts =
-      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
-  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
-         internal::StreamableToString(test_suite.elapsed_time()).c_str());
-  fflush(stdout);
-}
-
-void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
-    const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
-  printf("Global test environment tear-down\n");
-  fflush(stdout);
-}
-
-// Internal helper for printing the list of failed tests.
-void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
-  const int failed_test_count = unit_test.failed_test_count();
-  if (failed_test_count == 0) {
-    return;
-  }
-
-  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
-    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
-    if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) {
-      continue;
-    }
-    for (int j = 0; j < test_suite.total_test_count(); ++j) {
-      const TestInfo& test_info = *test_suite.GetTestInfo(j);
-      if (!test_info.should_run() || !test_info.result()->Failed()) {
-        continue;
-      }
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
-      printf("%s.%s", test_suite.name(), test_info.name());
-      PrintFullTestCommentIfPresent(test_info);
-      printf("\n");
-    }
-  }
-}
-
-// Internal helper for printing the list of skipped tests.
-void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest& unit_test) {
-  const int skipped_test_count = unit_test.skipped_test_count();
-  if (skipped_test_count == 0) {
-    return;
-  }
-
-  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
-    const TestSuite& test_suite = *unit_test.GetTestSuite(i);
-    if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) {
-      continue;
-    }
-    for (int j = 0; j < test_suite.total_test_count(); ++j) {
-      const TestInfo& test_info = *test_suite.GetTestInfo(j);
-      if (!test_info.should_run() || !test_info.result()->Skipped()) {
-        continue;
-      }
-      ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
-      printf("%s.%s", test_suite.name(), test_info.name());
-      printf("\n");
-    }
-  }
-}
-
-void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                     int /*iteration*/) {
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
-  printf("%s from %s ran.",
-         FormatTestCount(unit_test.test_to_run_count()).c_str(),
-         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
-  if (GTEST_FLAG(print_time)) {
-    printf(" (%s ms total)",
-           internal::StreamableToString(unit_test.elapsed_time()).c_str());
-  }
-  printf("\n");
-  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
-  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
-
-  const int skipped_test_count = unit_test.skipped_test_count();
-  if (skipped_test_count > 0) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
-    printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
-    PrintSkippedTests(unit_test);
-  }
-
-  int num_failures = unit_test.failed_test_count();
-  if (!unit_test.Passed()) {
-    const int failed_test_count = unit_test.failed_test_count();
-    ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
-    printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
-    PrintFailedTests(unit_test);
-    printf("\n%2d FAILED %s\n", num_failures,
-                        num_failures == 1 ? "TEST" : "TESTS");
-  }
-
-  int num_disabled = unit_test.reportable_disabled_test_count();
-  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
-    if (!num_failures) {
-      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
-    }
-    ColoredPrintf(COLOR_YELLOW,
-                  "  YOU HAVE %d DISABLED %s\n\n",
-                  num_disabled,
-                  num_disabled == 1 ? "TEST" : "TESTS");
-  }
-  // Ensure that Google Test output is printed before, e.g., heapchecker output.
-  fflush(stdout);
-}
-
-// End PrettyUnitTestResultPrinter
-
-// class TestEventRepeater
-//
-// This class forwards events to other event listeners.
-class TestEventRepeater : public TestEventListener {
- public:
-  TestEventRepeater() : forwarding_enabled_(true) {}
-  ~TestEventRepeater() override;
-  void Append(TestEventListener *listener);
-  TestEventListener* Release(TestEventListener* listener);
-
-  // Controls whether events will be forwarded to listeners_. Set to false
-  // in death test child processes.
-  bool forwarding_enabled() const { return forwarding_enabled_; }
-  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
-
-  void OnTestProgramStart(const UnitTest& unit_test) override;
-  void OnTestIterationStart(const UnitTest& unit_test, int iteration) override;
-  void OnEnvironmentsSetUpStart(const UnitTest& unit_test) override;
-  void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) override;
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
-  void OnTestCaseStart(const TestSuite& parameter) override;
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
-  void OnTestSuiteStart(const TestSuite& parameter) override;
-  void OnTestStart(const TestInfo& test_info) override;
-  void OnTestPartResult(const TestPartResult& result) override;
-  void OnTestEnd(const TestInfo& test_info) override;
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
-  void OnTestCaseEnd(const TestSuite& parameter) override;
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
-  void OnTestSuiteEnd(const TestSuite& parameter) override;
-  void OnEnvironmentsTearDownStart(const UnitTest& unit_test) override;
-  void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) override;
-  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
-  void OnTestProgramEnd(const UnitTest& unit_test) override;
-
- private:
-  // Controls whether events will be forwarded to listeners_. Set to false
-  // in death test child processes.
-  bool forwarding_enabled_;
-  // The list of listeners that receive events.
-  std::vector<TestEventListener*> listeners_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
-};
-
-TestEventRepeater::~TestEventRepeater() {
-  ForEach(listeners_, Delete<TestEventListener>);
-}
-
-void TestEventRepeater::Append(TestEventListener *listener) {
-  listeners_.push_back(listener);
-}
-
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
-  for (size_t i = 0; i < listeners_.size(); ++i) {
-    if (listeners_[i] == listener) {
-      listeners_.erase(listeners_.begin() + static_cast<int>(i));
-      return listener;
-    }
-  }
-
-  return nullptr;
-}
-
-// Since most methods are very similar, use macros to reduce boilerplate.
-// This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
-  if (forwarding_enabled_) { \
-    for (size_t i = 0; i < listeners_.size(); i++) { \
-      listeners_[i]->Name(parameter); \
-    } \
-  } \
-}
-// This defines a member that forwards the call to all listeners in reverse
-// order.
-#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type)      \
-  void TestEventRepeater::Name(const Type& parameter) { \
-    if (forwarding_enabled_) {                          \
-      for (size_t i = listeners_.size(); i != 0; i--) { \
-        listeners_[i - 1]->Name(parameter);             \
-      }                                                 \
-    }                                                   \
-  }
-
-GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
-GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
-GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
-GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
-GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite)
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite)
-GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
-
-#undef GTEST_REPEATER_METHOD_
-#undef GTEST_REVERSE_REPEATER_METHOD_
-
-void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
-                                             int iteration) {
-  if (forwarding_enabled_) {
-    for (size_t i = 0; i < listeners_.size(); i++) {
-      listeners_[i]->OnTestIterationStart(unit_test, iteration);
-    }
-  }
-}
-
-void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
-                                           int iteration) {
-  if (forwarding_enabled_) {
-    for (size_t i = listeners_.size(); i > 0; i--) {
-      listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration);
-    }
-  }
-}
-
-// End TestEventRepeater
-
-// This class generates an XML output file.
-class XmlUnitTestResultPrinter : public EmptyTestEventListener {
- public:
-  explicit XmlUnitTestResultPrinter(const char* output_file);
-
-  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
-  void ListTestsMatchingFilter(const std::vector<TestSuite*>& test_suites);
-
-  // Prints an XML summary of all unit tests.
-  static void PrintXmlTestsList(std::ostream* stream,
-                                const std::vector<TestSuite*>& test_suites);
-
- private:
-  // Is c a whitespace character that is normalized to a space character
-  // when it appears in an XML attribute value?
-  static bool IsNormalizableWhitespace(char c) {
-    return c == 0x9 || c == 0xA || c == 0xD;
-  }
-
-  // May c appear in a well-formed XML document?
-  static bool IsValidXmlCharacter(char c) {
-    return IsNormalizableWhitespace(c) || c >= 0x20;
-  }
-
-  // Returns an XML-escaped copy of the input string str.  If
-  // is_attribute is true, the text is meant to appear as an attribute
-  // value, and normalizable whitespace is preserved by replacing it
-  // with character references.
-  static std::string EscapeXml(const std::string& str, bool is_attribute);
-
-  // Returns the given string with all characters invalid in XML removed.
-  static std::string RemoveInvalidXmlCharacters(const std::string& str);
-
-  // Convenience wrapper around EscapeXml when str is an attribute value.
-  static std::string EscapeXmlAttribute(const std::string& str) {
-    return EscapeXml(str, true);
-  }
-
-  // Convenience wrapper around EscapeXml when str is not an attribute value.
-  static std::string EscapeXmlText(const char* str) {
-    return EscapeXml(str, false);
-  }
-
-  // Verifies that the given attribute belongs to the given element and
-  // streams the attribute as XML.
-  static void OutputXmlAttribute(std::ostream* stream,
-                                 const std::string& element_name,
-                                 const std::string& name,
-                                 const std::string& value);
-
-  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-  static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
-
-  // Streams an XML representation of a TestInfo object.
-  static void OutputXmlTestInfo(::std::ostream* stream,
-                                const char* test_suite_name,
-                                const TestInfo& test_info);
-
-  // Prints an XML representation of a TestSuite object
-  static void PrintXmlTestSuite(::std::ostream* stream,
-                                const TestSuite& test_suite);
-
-  // Prints an XML summary of unit_test to output stream out.
-  static void PrintXmlUnitTest(::std::ostream* stream,
-                               const UnitTest& unit_test);
-
-  // Produces a string representing the test properties in a result as space
-  // delimited XML attributes based on the property key="value" pairs.
-  // When the std::string is not empty, it includes a space at the beginning,
-  // to delimit this attribute from prior attributes.
-  static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
-
-  // Streams an XML representation of the test properties of a TestResult
-  // object.
-  static void OutputXmlTestProperties(std::ostream* stream,
-                                      const TestResult& result);
-
-  // The output file.
-  const std::string output_file_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
-};
-
-// Creates a new XmlUnitTestResultPrinter.
-XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
-    : output_file_(output_file) {
-  if (output_file_.empty()) {
-    GTEST_LOG_(FATAL) << "XML output file may not be null";
-  }
-}
-
-// Called after the unit test ends.
-void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                  int /*iteration*/) {
-  FILE* xmlout = OpenFileForWriting(output_file_);
-  std::stringstream stream;
-  PrintXmlUnitTest(&stream, unit_test);
-  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
-  fclose(xmlout);
-}
-
-void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
-    const std::vector<TestSuite*>& test_suites) {
-  FILE* xmlout = OpenFileForWriting(output_file_);
-  std::stringstream stream;
-  PrintXmlTestsList(&stream, test_suites);
-  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
-  fclose(xmlout);
-}
-
-// Returns an XML-escaped copy of the input string str.  If is_attribute
-// is true, the text is meant to appear as an attribute value, and
-// normalizable whitespace is preserved by replacing it with character
-// references.
-//
-// Invalid XML characters in str, if any, are stripped from the output.
-// It is expected that most, if not all, of the text processed by this
-// module will consist of ordinary English text.
-// If this module is ever modified to produce version 1.1 XML output,
-// most invalid characters can be retained using character references.
-std::string XmlUnitTestResultPrinter::EscapeXml(
-    const std::string& str, bool is_attribute) {
-  Message m;
-
-  for (size_t i = 0; i < str.size(); ++i) {
-    const char ch = str[i];
-    switch (ch) {
-      case '<':
-        m << "&lt;";
-        break;
-      case '>':
-        m << "&gt;";
-        break;
-      case '&':
-        m << "&amp;";
-        break;
-      case '\'':
-        if (is_attribute)
-          m << "&apos;";
-        else
-          m << '\'';
-        break;
-      case '"':
-        if (is_attribute)
-          m << "&quot;";
-        else
-          m << '"';
-        break;
-      default:
-        if (IsValidXmlCharacter(ch)) {
-          if (is_attribute && IsNormalizableWhitespace(ch))
-            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
-              << ";";
-          else
-            m << ch;
-        }
-        break;
-    }
-  }
-
-  return m.GetString();
-}
-
-// Returns the given string with all characters invalid in XML removed.
-// Currently invalid characters are dropped from the string. An
-// alternative is to replace them with certain characters such as . or ?.
-std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
-    const std::string& str) {
-  std::string output;
-  output.reserve(str.size());
-  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
-    if (IsValidXmlCharacter(*it))
-      output.push_back(*it);
-
-  return output;
-}
-
-// The following routines generate an XML representation of a UnitTest
-// object.
-// GOOGLETEST_CM0009 DO NOT DELETE
-//
-// This is how Google Test concepts map to the DTD:
-//
-// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
-//   <testsuite name="testcase-name">  <-- corresponds to a TestSuite object
-//     <testcase name="test-name">     <-- corresponds to a TestInfo object
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//       <failure message="...">...</failure>
-//                                     <-- individual assertion failures
-//     </testcase>
-//   </testsuite>
-// </testsuites>
-
-// Formats the given time in milliseconds as seconds.
-std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
-  ::std::stringstream ss;
-  ss << (static_cast<double>(ms) * 1e-3);
-  return ss.str();
-}
-
-static bool PortableLocaltime(time_t seconds, struct tm* out) {
-#if defined(_MSC_VER)
-  return localtime_s(out, &seconds) == 0;
-#elif defined(__MINGW32__) || defined(__MINGW64__)
-  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
-  // Windows' localtime(), which has a thread-local tm buffer.
-  struct tm* tm_ptr = localtime(&seconds);  // NOLINT
-  if (tm_ptr == nullptr) return false;
-  *out = *tm_ptr;
-  return true;
-#else
-  return localtime_r(&seconds, out) != nullptr;
-#endif
-}
-
-// Converts the given epoch time in milliseconds to a date string in the ISO
-// 8601 format, without the timezone information.
-std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
-  struct tm time_struct;
-  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
-    return "";
-  // YYYY-MM-DDThh:mm:ss
-  return StreamableToString(time_struct.tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec);
-}
-
-// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
-void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
-                                                     const char* data) {
-  const char* segment = data;
-  *stream << "<![CDATA[";
-  for (;;) {
-    const char* const next_segment = strstr(segment, "]]>");
-    if (next_segment != nullptr) {
-      stream->write(
-          segment, static_cast<std::streamsize>(next_segment - segment));
-      *stream << "]]>]]&gt;<![CDATA[";
-      segment = next_segment + strlen("]]>");
-    } else {
-      *stream << segment;
-      break;
-    }
-  }
-  *stream << "]]>";
-}
-
-void XmlUnitTestResultPrinter::OutputXmlAttribute(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value) {
-  const std::vector<std::string>& allowed_names =
-      GetReservedOutputAttributesForElement(element_name);
-
-  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
-      << "Attribute " << name << " is not allowed for element <" << element_name
-      << ">.";
-
-  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
-}
-
-// Prints an XML representation of a TestInfo object.
-void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
-                                                 const char* test_suite_name,
-                                                 const TestInfo& test_info) {
-  const TestResult& result = *test_info.result();
-  const std::string kTestsuite = "testcase";
-
-  if (test_info.is_in_another_shard()) {
-    return;
-  }
-
-  *stream << "    <testcase";
-  OutputXmlAttribute(stream, kTestsuite, "name", test_info.name());
-
-  if (test_info.value_param() != nullptr) {
-    OutputXmlAttribute(stream, kTestsuite, "value_param",
-                       test_info.value_param());
-  }
-  if (test_info.type_param() != nullptr) {
-    OutputXmlAttribute(stream, kTestsuite, "type_param",
-                       test_info.type_param());
-  }
-  if (GTEST_FLAG(list_tests)) {
-    OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
-    OutputXmlAttribute(stream, kTestsuite, "line",
-                       StreamableToString(test_info.line()));
-    *stream << " />\n";
-    return;
-  }
-
-  OutputXmlAttribute(stream, kTestsuite, "status",
-                     test_info.should_run() ? "run" : "notrun");
-  OutputXmlAttribute(stream, kTestsuite, "result",
-                     test_info.should_run()
-                         ? (result.Skipped() ? "skipped" : "completed")
-                         : "suppressed");
-  OutputXmlAttribute(stream, kTestsuite, "time",
-                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
-  OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
-
-  int failures = 0;
-  for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult& part = result.GetTestPartResult(i);
-    if (part.failed()) {
-      if (++failures == 1) {
-        *stream << ">\n";
-      }
-      const std::string location =
-          internal::FormatCompilerIndependentFileLocation(part.file_name(),
-                                                          part.line_number());
-      const std::string summary = location + "\n" + part.summary();
-      *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str())
-              << "\" type=\"\">";
-      const std::string detail = location + "\n" + part.message();
-      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
-      *stream << "</failure>\n";
-    }
-  }
-
-  if (failures == 0 && result.test_property_count() == 0) {
-    *stream << " />\n";
-  } else {
-    if (failures == 0) {
-      *stream << ">\n";
-    }
-    OutputXmlTestProperties(stream, result);
-    *stream << "    </testcase>\n";
-  }
-}
-
-// Prints an XML representation of a TestSuite object
-void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
-                                                 const TestSuite& test_suite) {
-  const std::string kTestsuite = "testsuite";
-  *stream << "  <" << kTestsuite;
-  OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
-  OutputXmlAttribute(stream, kTestsuite, "tests",
-                     StreamableToString(test_suite.reportable_test_count()));
-  if (!GTEST_FLAG(list_tests)) {
-    OutputXmlAttribute(stream, kTestsuite, "failures",
-                       StreamableToString(test_suite.failed_test_count()));
-    OutputXmlAttribute(
-        stream, kTestsuite, "disabled",
-        StreamableToString(test_suite.reportable_disabled_test_count()));
-    OutputXmlAttribute(stream, kTestsuite, "errors", "0");
-    OutputXmlAttribute(stream, kTestsuite, "time",
-                       FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
-    *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result());
-  }
-  *stream << ">\n";
-  for (int i = 0; i < test_suite.total_test_count(); ++i) {
-    if (test_suite.GetTestInfo(i)->is_reportable())
-      OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
-  }
-  *stream << "  </" << kTestsuite << ">\n";
-}
-
-// Prints an XML summary of unit_test to output stream out.
-void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
-                                                const UnitTest& unit_test) {
-  const std::string kTestsuites = "testsuites";
-
-  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-  *stream << "<" << kTestsuites;
-
-  OutputXmlAttribute(stream, kTestsuites, "tests",
-                     StreamableToString(unit_test.reportable_test_count()));
-  OutputXmlAttribute(stream, kTestsuites, "failures",
-                     StreamableToString(unit_test.failed_test_count()));
-  OutputXmlAttribute(
-      stream, kTestsuites, "disabled",
-      StreamableToString(unit_test.reportable_disabled_test_count()));
-  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
-  OutputXmlAttribute(
-      stream, kTestsuites, "timestamp",
-      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
-  OutputXmlAttribute(stream, kTestsuites, "time",
-                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
-
-  if (GTEST_FLAG(shuffle)) {
-    OutputXmlAttribute(stream, kTestsuites, "random_seed",
-                       StreamableToString(unit_test.random_seed()));
-  }
-  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
-
-  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
-  *stream << ">\n";
-
-  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
-    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
-      PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
-  }
-  *stream << "</" << kTestsuites << ">\n";
-}
-
-void XmlUnitTestResultPrinter::PrintXmlTestsList(
-    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
-  const std::string kTestsuites = "testsuites";
-
-  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
-  *stream << "<" << kTestsuites;
-
-  int total_tests = 0;
-  for (auto test_suite : test_suites) {
-    total_tests += test_suite->total_test_count();
-  }
-  OutputXmlAttribute(stream, kTestsuites, "tests",
-                     StreamableToString(total_tests));
-  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
-  *stream << ">\n";
-
-  for (auto test_suite : test_suites) {
-    PrintXmlTestSuite(stream, *test_suite);
-  }
-  *stream << "</" << kTestsuites << ">\n";
-}
-
-// Produces a string representing the test properties in a result as space
-// delimited XML attributes based on the property key="value" pairs.
-std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
-    const TestResult& result) {
-  Message attributes;
-  for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty& property = result.GetTestProperty(i);
-    attributes << " " << property.key() << "="
-        << "\"" << EscapeXmlAttribute(property.value()) << "\"";
-  }
-  return attributes.GetString();
-}
-
-void XmlUnitTestResultPrinter::OutputXmlTestProperties(
-    std::ostream* stream, const TestResult& result) {
-  const std::string kProperties = "properties";
-  const std::string kProperty = "property";
-
-  if (result.test_property_count() <= 0) {
-    return;
-  }
-
-  *stream << "<" << kProperties << ">\n";
-  for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty& property = result.GetTestProperty(i);
-    *stream << "<" << kProperty;
-    *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
-    *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
-    *stream << "/>\n";
-  }
-  *stream << "</" << kProperties << ">\n";
-}
-
-// End XmlUnitTestResultPrinter
-
-// This class generates an JSON output file.
-class JsonUnitTestResultPrinter : public EmptyTestEventListener {
- public:
-  explicit JsonUnitTestResultPrinter(const char* output_file);
-
-  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
-
-  // Prints an JSON summary of all unit tests.
-  static void PrintJsonTestList(::std::ostream* stream,
-                                const std::vector<TestSuite*>& test_suites);
-
- private:
-  // Returns an JSON-escaped copy of the input string str.
-  static std::string EscapeJson(const std::string& str);
-
-  //// Verifies that the given attribute belongs to the given element and
-  //// streams the attribute as JSON.
-  static void OutputJsonKey(std::ostream* stream,
-                            const std::string& element_name,
-                            const std::string& name,
-                            const std::string& value,
-                            const std::string& indent,
-                            bool comma = true);
-  static void OutputJsonKey(std::ostream* stream,
-                            const std::string& element_name,
-                            const std::string& name,
-                            int value,
-                            const std::string& indent,
-                            bool comma = true);
-
-  // Streams a JSON representation of a TestInfo object.
-  static void OutputJsonTestInfo(::std::ostream* stream,
-                                 const char* test_suite_name,
-                                 const TestInfo& test_info);
-
-  // Prints a JSON representation of a TestSuite object
-  static void PrintJsonTestSuite(::std::ostream* stream,
-                                 const TestSuite& test_suite);
-
-  // Prints a JSON summary of unit_test to output stream out.
-  static void PrintJsonUnitTest(::std::ostream* stream,
-                                const UnitTest& unit_test);
-
-  // Produces a string representing the test properties in a result as
-  // a JSON dictionary.
-  static std::string TestPropertiesAsJson(const TestResult& result,
-                                          const std::string& indent);
-
-  // The output file.
-  const std::string output_file_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
-};
-
-// Creates a new JsonUnitTestResultPrinter.
-JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
-    : output_file_(output_file) {
-  if (output_file_.empty()) {
-    GTEST_LOG_(FATAL) << "JSON output file may not be null";
-  }
-}
-
-void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
-                                                  int /*iteration*/) {
-  FILE* jsonout = OpenFileForWriting(output_file_);
-  std::stringstream stream;
-  PrintJsonUnitTest(&stream, unit_test);
-  fprintf(jsonout, "%s", StringStreamToString(&stream).c_str());
-  fclose(jsonout);
-}
-
-// Returns an JSON-escaped copy of the input string str.
-std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) {
-  Message m;
-
-  for (size_t i = 0; i < str.size(); ++i) {
-    const char ch = str[i];
-    switch (ch) {
-      case '\\':
-      case '"':
-      case '/':
-        m << '\\' << ch;
-        break;
-      case '\b':
-        m << "\\b";
-        break;
-      case '\t':
-        m << "\\t";
-        break;
-      case '\n':
-        m << "\\n";
-        break;
-      case '\f':
-        m << "\\f";
-        break;
-      case '\r':
-        m << "\\r";
-        break;
-      default:
-        if (ch < ' ') {
-          m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
-        } else {
-          m << ch;
-        }
-        break;
-    }
-  }
-
-  return m.GetString();
-}
-
-// The following routines generate an JSON representation of a UnitTest
-// object.
-
-// Formats the given time in milliseconds as seconds.
-static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) {
-  ::std::stringstream ss;
-  ss << (static_cast<double>(ms) * 1e-3) << "s";
-  return ss.str();
-}
-
-// Converts the given epoch time in milliseconds to a date string in the
-// RFC3339 format, without the timezone information.
-static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
-  struct tm time_struct;
-  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
-    return "";
-  // YYYY-MM-DDThh:mm:ss
-  return StreamableToString(time_struct.tm_year + 1900) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
-      String::FormatIntWidth2(time_struct.tm_mday) + "T" +
-      String::FormatIntWidth2(time_struct.tm_hour) + ":" +
-      String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec) + "Z";
-}
-
-static inline std::string Indent(size_t width) {
-  return std::string(width, ' ');
-}
-
-void JsonUnitTestResultPrinter::OutputJsonKey(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    const std::string& value,
-    const std::string& indent,
-    bool comma) {
-  const std::vector<std::string>& allowed_names =
-      GetReservedOutputAttributesForElement(element_name);
-
-  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
-      << "Key \"" << name << "\" is not allowed for value \"" << element_name
-      << "\".";
-
-  *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
-  if (comma)
-    *stream << ",\n";
-}
-
-void JsonUnitTestResultPrinter::OutputJsonKey(
-    std::ostream* stream,
-    const std::string& element_name,
-    const std::string& name,
-    int value,
-    const std::string& indent,
-    bool comma) {
-  const std::vector<std::string>& allowed_names =
-      GetReservedOutputAttributesForElement(element_name);
-
-  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
-                   allowed_names.end())
-      << "Key \"" << name << "\" is not allowed for value \"" << element_name
-      << "\".";
-
-  *stream << indent << "\"" << name << "\": " << StreamableToString(value);
-  if (comma)
-    *stream << ",\n";
-}
-
-// Prints a JSON representation of a TestInfo object.
-void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
-                                                   const char* test_suite_name,
-                                                   const TestInfo& test_info) {
-  const TestResult& result = *test_info.result();
-  const std::string kTestsuite = "testcase";
-  const std::string kIndent = Indent(10);
-
-  *stream << Indent(8) << "{\n";
-  OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent);
-
-  if (test_info.value_param() != nullptr) {
-    OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(),
-                  kIndent);
-  }
-  if (test_info.type_param() != nullptr) {
-    OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
-                  kIndent);
-  }
-  if (GTEST_FLAG(list_tests)) {
-    OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
-    OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
-    *stream << "\n" << Indent(8) << "}";
-    return;
-  }
-
-  OutputJsonKey(stream, kTestsuite, "status",
-                test_info.should_run() ? "RUN" : "NOTRUN", kIndent);
-  OutputJsonKey(stream, kTestsuite, "result",
-                test_info.should_run()
-                    ? (result.Skipped() ? "SKIPPED" : "COMPLETED")
-                    : "SUPPRESSED",
-                kIndent);
-  OutputJsonKey(stream, kTestsuite, "time",
-                FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent);
-  OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent,
-                false);
-  *stream << TestPropertiesAsJson(result, kIndent);
-
-  int failures = 0;
-  for (int i = 0; i < result.total_part_count(); ++i) {
-    const TestPartResult& part = result.GetTestPartResult(i);
-    if (part.failed()) {
-      *stream << ",\n";
-      if (++failures == 1) {
-        *stream << kIndent << "\"" << "failures" << "\": [\n";
-      }
-      const std::string location =
-          internal::FormatCompilerIndependentFileLocation(part.file_name(),
-                                                          part.line_number());
-      const std::string message = EscapeJson(location + "\n" + part.message());
-      *stream << kIndent << "  {\n"
-              << kIndent << "    \"failure\": \"" << message << "\",\n"
-              << kIndent << "    \"type\": \"\"\n"
-              << kIndent << "  }";
-    }
-  }
-
-  if (failures > 0)
-    *stream << "\n" << kIndent << "]";
-  *stream << "\n" << Indent(8) << "}";
-}
-
-// Prints an JSON representation of a TestSuite object
-void JsonUnitTestResultPrinter::PrintJsonTestSuite(
-    std::ostream* stream, const TestSuite& test_suite) {
-  const std::string kTestsuite = "testsuite";
-  const std::string kIndent = Indent(6);
-
-  *stream << Indent(4) << "{\n";
-  OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
-  OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
-                kIndent);
-  if (!GTEST_FLAG(list_tests)) {
-    OutputJsonKey(stream, kTestsuite, "failures",
-                  test_suite.failed_test_count(), kIndent);
-    OutputJsonKey(stream, kTestsuite, "disabled",
-                  test_suite.reportable_disabled_test_count(), kIndent);
-    OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent);
-    OutputJsonKey(stream, kTestsuite, "time",
-                  FormatTimeInMillisAsDuration(test_suite.elapsed_time()),
-                  kIndent, false);
-    *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent)
-            << ",\n";
-  }
-
-  *stream << kIndent << "\"" << kTestsuite << "\": [\n";
-
-  bool comma = false;
-  for (int i = 0; i < test_suite.total_test_count(); ++i) {
-    if (test_suite.GetTestInfo(i)->is_reportable()) {
-      if (comma) {
-        *stream << ",\n";
-      } else {
-        comma = true;
-      }
-      OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
-    }
-  }
-  *stream << "\n" << kIndent << "]\n" << Indent(4) << "}";
-}
-
-// Prints a JSON summary of unit_test to output stream out.
-void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
-                                                  const UnitTest& unit_test) {
-  const std::string kTestsuites = "testsuites";
-  const std::string kIndent = Indent(2);
-  *stream << "{\n";
-
-  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
-                kIndent);
-  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
-                kIndent);
-  OutputJsonKey(stream, kTestsuites, "disabled",
-                unit_test.reportable_disabled_test_count(), kIndent);
-  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
-  if (GTEST_FLAG(shuffle)) {
-    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
-                  kIndent);
-  }
-  OutputJsonKey(stream, kTestsuites, "timestamp",
-                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
-                kIndent);
-  OutputJsonKey(stream, kTestsuites, "time",
-                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
-                false);
-
-  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
-          << ",\n";
-
-  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
-  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
-
-  bool comma = false;
-  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
-    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) {
-      if (comma) {
-        *stream << ",\n";
-      } else {
-        comma = true;
-      }
-      PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i));
-    }
-  }
-
-  *stream << "\n" << kIndent << "]\n" << "}\n";
-}
-
-void JsonUnitTestResultPrinter::PrintJsonTestList(
-    std::ostream* stream, const std::vector<TestSuite*>& test_suites) {
-  const std::string kTestsuites = "testsuites";
-  const std::string kIndent = Indent(2);
-  *stream << "{\n";
-  int total_tests = 0;
-  for (auto test_suite : test_suites) {
-    total_tests += test_suite->total_test_count();
-  }
-  OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent);
-
-  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
-  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
-
-  for (size_t i = 0; i < test_suites.size(); ++i) {
-    if (i != 0) {
-      *stream << ",\n";
-    }
-    PrintJsonTestSuite(stream, *test_suites[i]);
-  }
-
-  *stream << "\n"
-          << kIndent << "]\n"
-          << "}\n";
-}
-// Produces a string representing the test properties in a result as
-// a JSON dictionary.
-std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
-    const TestResult& result, const std::string& indent) {
-  Message attributes;
-  for (int i = 0; i < result.test_property_count(); ++i) {
-    const TestProperty& property = result.GetTestProperty(i);
-    attributes << ",\n" << indent << "\"" << property.key() << "\": "
-               << "\"" << EscapeJson(property.value()) << "\"";
-  }
-  return attributes.GetString();
-}
-
-// End JsonUnitTestResultPrinter
-
-#if GTEST_CAN_STREAM_RESULTS_
-
-// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
-// replaces them by "%xx" where xx is their hexadecimal value. For
-// example, replaces "=" with "%3D".  This algorithm is O(strlen(str))
-// in both time and space -- important as the input str may contain an
-// arbitrarily long test failure message and stack trace.
-std::string StreamingListener::UrlEncode(const char* str) {
-  std::string result;
-  result.reserve(strlen(str) + 1);
-  for (char ch = *str; ch != '\0'; ch = *++str) {
-    switch (ch) {
-      case '%':
-      case '=':
-      case '&':
-      case '\n':
-        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
-        break;
-      default:
-        result.push_back(ch);
-        break;
-    }
-  }
-  return result;
-}
-
-void StreamingListener::SocketWriter::MakeConnection() {
-  GTEST_CHECK_(sockfd_ == -1)
-      << "MakeConnection() can't be called when there is already a connection.";
-
-  addrinfo hints;
-  memset(&hints, 0, sizeof(hints));
-  hints.ai_family = AF_UNSPEC;    // To allow both IPv4 and IPv6 addresses.
-  hints.ai_socktype = SOCK_STREAM;
-  addrinfo* servinfo = nullptr;
-
-  // Use the getaddrinfo() to get a linked list of IP addresses for
-  // the given host name.
-  const int error_num = getaddrinfo(
-      host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
-  if (error_num != 0) {
-    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
-                        << gai_strerror(error_num);
-  }
-
-  // Loop through all the results and connect to the first we can.
-  for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
-       cur_addr = cur_addr->ai_next) {
-    sockfd_ = socket(
-        cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
-    if (sockfd_ != -1) {
-      // Connect the client socket to the server socket.
-      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
-        close(sockfd_);
-        sockfd_ = -1;
-      }
-    }
-  }
-
-  freeaddrinfo(servinfo);  // all done with this structure
-
-  if (sockfd_ == -1) {
-    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
-                        << host_name_ << ":" << port_num_;
-  }
-}
-
-// End of class Streaming Listener
-#endif  // GTEST_CAN_STREAM_RESULTS__
-
-// class OsStackTraceGetter
-
-const char* const OsStackTraceGetterInterface::kElidedFramesMarker =
-    "... " GTEST_NAME_ " internal frames ...";
-
-std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-#if GTEST_HAS_ABSL
-  std::string result;
-
-  if (max_depth <= 0) {
-    return result;
-  }
-
-  max_depth = std::min(max_depth, kMaxStackTraceDepth);
-
-  std::vector<void*> raw_stack(max_depth);
-  // Skips the frames requested by the caller, plus this function.
-  const int raw_stack_size =
-      absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
-
-  void* caller_frame = nullptr;
-  {
-    MutexLock lock(&mutex_);
-    caller_frame = caller_frame_;
-  }
-
-  for (int i = 0; i < raw_stack_size; ++i) {
-    if (raw_stack[i] == caller_frame &&
-        !GTEST_FLAG(show_internal_stack_frames)) {
-      // Add a marker to the trace and stop adding frames.
-      absl::StrAppend(&result, kElidedFramesMarker, "\n");
-      break;
-    }
-
-    char tmp[1024];
-    const char* symbol = "(unknown)";
-    if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
-      symbol = tmp;
-    }
-
-    char line[1024];
-    snprintf(line, sizeof(line), "  %p: %s\n", raw_stack[i], symbol);
-    result += line;
-  }
-
-  return result;
-
-#else  // !GTEST_HAS_ABSL
-  static_cast<void>(max_depth);
-  static_cast<void>(skip_count);
-  return "";
-#endif  // GTEST_HAS_ABSL
-}
-
-void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
-#if GTEST_HAS_ABSL
-  void* caller_frame = nullptr;
-  if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
-    caller_frame = nullptr;
-  }
-
-  MutexLock lock(&mutex_);
-  caller_frame_ = caller_frame;
-#endif  // GTEST_HAS_ABSL
-}
-
-// A helper class that creates the premature-exit file in its
-// constructor and deletes the file in its destructor.
-class ScopedPrematureExitFile {
- public:
-  explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
-      : premature_exit_filepath_(premature_exit_filepath ?
-                                 premature_exit_filepath : "") {
-    // If a path to the premature-exit file is specified...
-    if (!premature_exit_filepath_.empty()) {
-      // create the file with a single "0" character in it.  I/O
-      // errors are ignored as there's nothing better we can do and we
-      // don't want to fail the test because of this.
-      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
-      fwrite("0", 1, 1, pfile);
-      fclose(pfile);
-    }
-  }
-
-  ~ScopedPrematureExitFile() {
-    if (!premature_exit_filepath_.empty()) {
-      int retval = remove(premature_exit_filepath_.c_str());
-      if (retval) {
-        GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \""
-                          << premature_exit_filepath_ << "\" with error "
-                          << retval;
-      }
-    }
-  }
-
- private:
-  const std::string premature_exit_filepath_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
-};
-
-}  // namespace internal
-
-// class TestEventListeners
-
-TestEventListeners::TestEventListeners()
-    : repeater_(new internal::TestEventRepeater()),
-      default_result_printer_(nullptr),
-      default_xml_generator_(nullptr) {}
-
-TestEventListeners::~TestEventListeners() { delete repeater_; }
-
-// Returns the standard listener responsible for the default console
-// output.  Can be removed from the listeners list to shut down default
-// console output.  Note that removing this object from the listener list
-// with Release transfers its ownership to the user.
-void TestEventListeners::Append(TestEventListener* listener) {
-  repeater_->Append(listener);
-}
-
-// Removes the given event listener from the list and returns it.  It then
-// becomes the caller's responsibility to delete the listener. Returns
-// NULL if the listener is not found in the list.
-TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
-  if (listener == default_result_printer_)
-    default_result_printer_ = nullptr;
-  else if (listener == default_xml_generator_)
-    default_xml_generator_ = nullptr;
-  return repeater_->Release(listener);
-}
-
-// Returns repeater that broadcasts the TestEventListener events to all
-// subscribers.
-TestEventListener* TestEventListeners::repeater() { return repeater_; }
-
-// Sets the default_result_printer attribute to the provided listener.
-// The listener is also added to the listener list and previous
-// default_result_printer is removed from it and deleted. The listener can
-// also be NULL in which case it will not be added to the list. Does
-// nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
-  if (default_result_printer_ != listener) {
-    // It is an error to pass this method a listener that is already in the
-    // list.
-    delete Release(default_result_printer_);
-    default_result_printer_ = listener;
-    if (listener != nullptr) Append(listener);
-  }
-}
-
-// Sets the default_xml_generator attribute to the provided listener.  The
-// listener is also added to the listener list and previous
-// default_xml_generator is removed from it and deleted. The listener can
-// also be NULL in which case it will not be added to the list. Does
-// nothing if the previous and the current listener objects are the same.
-void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
-  if (default_xml_generator_ != listener) {
-    // It is an error to pass this method a listener that is already in the
-    // list.
-    delete Release(default_xml_generator_);
-    default_xml_generator_ = listener;
-    if (listener != nullptr) Append(listener);
-  }
-}
-
-// Controls whether events will be forwarded by the repeater to the
-// listeners in the list.
-bool TestEventListeners::EventForwardingEnabled() const {
-  return repeater_->forwarding_enabled();
-}
-
-void TestEventListeners::SuppressEventForwarding() {
-  repeater_->set_forwarding_enabled(false);
-}
-
-// class UnitTest
-
-// Gets the singleton UnitTest object.  The first time this method is
-// called, a UnitTest object is constructed and returned.  Consecutive
-// calls will return the same object.
-//
-// We don't protect this under mutex_ as a user is not supposed to
-// call this before main() starts, from which point on the return
-// value will never change.
-UnitTest* UnitTest::GetInstance() {
-  // CodeGear C++Builder insists on a public destructor for the
-  // default implementation.  Use this implementation to keep good OO
-  // design with private destructor.
-
-#if defined(__BORLANDC__)
-  static UnitTest* const instance = new UnitTest;
-  return instance;
-#else
-  static UnitTest instance;
-  return &instance;
-#endif  // defined(__BORLANDC__)
-}
-
-// Gets the number of successful test suites.
-int UnitTest::successful_test_suite_count() const {
-  return impl()->successful_test_suite_count();
-}
-
-// Gets the number of failed test suites.
-int UnitTest::failed_test_suite_count() const {
-  return impl()->failed_test_suite_count();
-}
-
-// Gets the number of all test suites.
-int UnitTest::total_test_suite_count() const {
-  return impl()->total_test_suite_count();
-}
-
-// Gets the number of all test suites that contain at least one test
-// that should run.
-int UnitTest::test_suite_to_run_count() const {
-  return impl()->test_suite_to_run_count();
-}
-
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-int UnitTest::successful_test_case_count() const {
-  return impl()->successful_test_suite_count();
-}
-int UnitTest::failed_test_case_count() const {
-  return impl()->failed_test_suite_count();
-}
-int UnitTest::total_test_case_count() const {
-  return impl()->total_test_suite_count();
-}
-int UnitTest::test_case_to_run_count() const {
-  return impl()->test_suite_to_run_count();
-}
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-// Gets the number of successful tests.
-int UnitTest::successful_test_count() const {
-  return impl()->successful_test_count();
-}
-
-// Gets the number of skipped tests.
-int UnitTest::skipped_test_count() const {
-  return impl()->skipped_test_count();
-}
-
-// Gets the number of failed tests.
-int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
-
-// Gets the number of disabled tests that will be reported in the XML report.
-int UnitTest::reportable_disabled_test_count() const {
-  return impl()->reportable_disabled_test_count();
-}
-
-// Gets the number of disabled tests.
-int UnitTest::disabled_test_count() const {
-  return impl()->disabled_test_count();
-}
-
-// Gets the number of tests to be printed in the XML report.
-int UnitTest::reportable_test_count() const {
-  return impl()->reportable_test_count();
-}
-
-// Gets the number of all tests.
-int UnitTest::total_test_count() const { return impl()->total_test_count(); }
-
-// Gets the number of tests that should run.
-int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
-
-// Gets the time of the test program start, in ms from the start of the
-// UNIX epoch.
-internal::TimeInMillis UnitTest::start_timestamp() const {
-    return impl()->start_timestamp();
-}
-
-// Gets the elapsed time, in milliseconds.
-internal::TimeInMillis UnitTest::elapsed_time() const {
-  return impl()->elapsed_time();
-}
-
-// Returns true iff the unit test passed (i.e. all test suites passed).
-bool UnitTest::Passed() const { return impl()->Passed(); }
-
-// Returns true iff the unit test failed (i.e. some test suite failed
-// or something outside of all tests failed).
-bool UnitTest::Failed() const { return impl()->Failed(); }
-
-// Gets the i-th test suite among all the test suites. i can range from 0 to
-// total_test_suite_count() - 1. If i is not in that range, returns NULL.
-const TestSuite* UnitTest::GetTestSuite(int i) const {
-  return impl()->GetTestSuite(i);
-}
-
-//  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-const TestCase* UnitTest::GetTestCase(int i) const {
-  return impl()->GetTestCase(i);
-}
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-
-// Returns the TestResult containing information on test failures and
-// properties logged outside of individual test suites.
-const TestResult& UnitTest::ad_hoc_test_result() const {
-  return *impl()->ad_hoc_test_result();
-}
-
-// Gets the i-th test suite among all the test suites. i can range from 0 to
-// total_test_suite_count() - 1. If i is not in that range, returns NULL.
-TestSuite* UnitTest::GetMutableTestSuite(int i) {
-  return impl()->GetMutableSuiteCase(i);
-}
-
-// Returns the list of event listeners that can be used to track events
-// inside Google Test.
-TestEventListeners& UnitTest::listeners() {
-  return *impl()->listeners();
-}
-
-// Registers and returns a global test environment.  When a test
-// program is run, all global test environments will be set-up in the
-// order they were registered.  After all tests in the program have
-// finished, all global test environments will be torn-down in the
-// *reverse* order they were registered.
-//
-// The UnitTest object takes ownership of the given environment.
-//
-// We don't protect this under mutex_, as we only support calling it
-// from the main thread.
-Environment* UnitTest::AddEnvironment(Environment* env) {
-  if (env == nullptr) {
-    return nullptr;
-  }
-
-  impl_->environments().push_back(env);
-  return env;
-}
-
-// Adds a TestPartResult to the current TestResult object.  All Google Test
-// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
-// this to report their results.  The user code should use the
-// assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(
-    TestPartResult::Type result_type,
-    const char* file_name,
-    int line_number,
-    const std::string& message,
-    const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
-  Message msg;
-  msg << message;
-
-  internal::MutexLock lock(&mutex_);
-  if (impl_->gtest_trace_stack().size() > 0) {
-    msg << "\n" << GTEST_NAME_ << " trace:";
-
-    for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
-      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
-      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
-          << " " << trace.message;
-    }
-  }
-
-  if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) {
-    msg << internal::kStackTraceMarker << os_stack_trace;
-  }
-
-  const TestPartResult result = TestPartResult(
-      result_type, file_name, line_number, msg.GetString().c_str());
-  impl_->GetTestPartResultReporterForCurrentThread()->
-      ReportTestPartResult(result);
-
-  if (result_type != TestPartResult::kSuccess &&
-      result_type != TestPartResult::kSkip) {
-    // gtest_break_on_failure takes precedence over
-    // gtest_throw_on_failure.  This allows a user to set the latter
-    // in the code (perhaps in order to use Google Test assertions
-    // with another testing framework) and specify the former on the
-    // command line for debugging.
-    if (GTEST_FLAG(break_on_failure)) {
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-      // Using DebugBreak on Windows allows gtest to still break into a debugger
-      // when a failure happens and both the --gtest_break_on_failure and
-      // the --gtest_catch_exceptions flags are specified.
-      DebugBreak();
-#elif (!defined(__native_client__)) &&            \
-    ((defined(__clang__) || defined(__GNUC__)) && \
-     (defined(__x86_64__) || defined(__i386__)))
-      // with clang/gcc we can achieve the same effect on x86 by invoking int3
-      asm("int3");
-#else
-      // Dereference nullptr through a volatile pointer to prevent the compiler
-      // from removing. We use this rather than abort() or __builtin_trap() for
-      // portability: some debuggers don't correctly trap abort().
-      *static_cast<volatile int*>(nullptr) = 1;
-#endif  // GTEST_OS_WINDOWS
-    } else if (GTEST_FLAG(throw_on_failure)) {
-#if GTEST_HAS_EXCEPTIONS
-      throw internal::GoogleTestFailureException(result);
-#else
-      // We cannot call abort() as it generates a pop-up in debug mode
-      // that cannot be suppressed in VC 7.1 or below.
-      exit(1);
-#endif
-    }
-  }
-}
-
-// Adds a TestProperty to the current TestResult object when invoked from
-// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
-// from SetUpTestSuite or TearDownTestSuite, or to the global property set
-// when invoked elsewhere.  If the result already contains a property with
-// the same key, the value will be updated.
-void UnitTest::RecordProperty(const std::string& key,
-                              const std::string& value) {
-  impl_->RecordProperty(TestProperty(key, value));
-}
-
-// Runs all tests in this UnitTest object and prints the result.
-// Returns 0 if successful, or 1 otherwise.
-//
-// We don't protect this under mutex_, as we only support calling it
-// from the main thread.
-int UnitTest::Run() {
-  const bool in_death_test_child_process =
-      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
-
-  // Google Test implements this protocol for catching that a test
-  // program exits before returning control to Google Test:
-  //
-  //   1. Upon start, Google Test creates a file whose absolute path
-  //      is specified by the environment variable
-  //      TEST_PREMATURE_EXIT_FILE.
-  //   2. When Google Test has finished its work, it deletes the file.
-  //
-  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
-  // running a Google-Test-based test program and check the existence
-  // of the file at the end of the test execution to see if it has
-  // exited prematurely.
-
-  // If we are in the child process of a death test, don't
-  // create/delete the premature exit file, as doing so is unnecessary
-  // and will confuse the parent process.  Otherwise, create/delete
-  // the file upon entering/leaving this function.  If the program
-  // somehow exits before this function has a chance to return, the
-  // premature-exit file will be left undeleted, causing a test runner
-  // that understands the premature-exit-file protocol to report the
-  // test as having failed.
-  const internal::ScopedPrematureExitFile premature_exit_file(
-      in_death_test_child_process
-          ? nullptr
-          : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
-
-  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
-  // used for the duration of the program.
-  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
-
-#if GTEST_OS_WINDOWS
-  // Either the user wants Google Test to catch exceptions thrown by the
-  // tests or this is executing in the context of death test child
-  // process. In either case the user does not want to see pop-up dialogs
-  // about crashes - they are expected.
-  if (impl()->catch_exceptions() || in_death_test_child_process) {
-# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-    // SetErrorMode doesn't exist on CE.
-    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
-                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif  // !GTEST_OS_WINDOWS_MOBILE
-
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
-    // Death test children can be terminated with _abort().  On Windows,
-    // _abort() can show a dialog with a warning message.  This forces the
-    // abort message to go to stderr instead.
-    _set_error_mode(_OUT_TO_STDERR);
-# endif
-
-# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
-    // In the debug version, Visual Studio pops up a separate dialog
-    // offering a choice to debug the aborted program. We need to suppress
-    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
-    // executed. Google Test will notify the user of any unexpected
-    // failure via stderr.
-    if (!GTEST_FLAG(break_on_failure))
-      _set_abort_behavior(
-          0x0,                                    // Clear the following flags:
-          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
-# endif
-  }
-#endif  // GTEST_OS_WINDOWS
-
-  return internal::HandleExceptionsInMethodIfSupported(
-      impl(),
-      &internal::UnitTestImpl::RunAllTests,
-      "auxiliary test code (environments or event listeners)") ? 0 : 1;
-}
-
-// Returns the working directory when the first TEST() or TEST_F() was
-// executed.
-const char* UnitTest::original_working_dir() const {
-  return impl_->original_working_dir_.c_str();
-}
-
-// Returns the TestSuite object for the test that's currently running,
-// or NULL if no test is running.
-const TestSuite* UnitTest::current_test_suite() const
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  return impl_->current_test_suite();
-}
-
-// Legacy API is still available but deprecated
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-const TestCase* UnitTest::current_test_case() const
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  return impl_->current_test_suite();
-}
-#endif
-
-// Returns the TestInfo object for the test that's currently running,
-// or NULL if no test is running.
-const TestInfo* UnitTest::current_test_info() const
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  return impl_->current_test_info();
-}
-
-// Returns the random seed used at the start of the current test run.
-int UnitTest::random_seed() const { return impl_->random_seed(); }
-
-// Returns ParameterizedTestSuiteRegistry object used to keep track of
-// value-parameterized tests and instantiate and register them.
-internal::ParameterizedTestSuiteRegistry&
-UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
-  return impl_->parameterized_test_registry();
-}
-
-// Creates an empty UnitTest.
-UnitTest::UnitTest() {
-  impl_ = new internal::UnitTestImpl(this);
-}
-
-// Destructor of UnitTest.
-UnitTest::~UnitTest() {
-  delete impl_;
-}
-
-// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
-// Google Test trace stack.
-void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  impl_->gtest_trace_stack().push_back(trace);
-}
-
-// Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace()
-    GTEST_LOCK_EXCLUDED_(mutex_) {
-  internal::MutexLock lock(&mutex_);
-  impl_->gtest_trace_stack().pop_back();
-}
-
-namespace internal {
-
-UnitTestImpl::UnitTestImpl(UnitTest* parent)
-    : parent_(parent),
-      GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
-          default_global_test_part_result_reporter_(this),
-      default_per_thread_test_part_result_reporter_(this),
-      GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_(
-          &default_global_test_part_result_reporter_),
-      per_thread_test_part_result_reporter_(
-          &default_per_thread_test_part_result_reporter_),
-      parameterized_test_registry_(),
-      parameterized_tests_registered_(false),
-      last_death_test_suite_(-1),
-      current_test_suite_(nullptr),
-      current_test_info_(nullptr),
-      ad_hoc_test_result_(),
-      os_stack_trace_getter_(nullptr),
-      post_flag_parse_init_performed_(false),
-      random_seed_(0),  // Will be overridden by the flag before first use.
-      random_(0),       // Will be reseeded before first use.
-      start_timestamp_(0),
-      elapsed_time_(0),
-#if GTEST_HAS_DEATH_TEST
-      death_test_factory_(new DefaultDeathTestFactory),
-#endif
-      // Will be overridden by the flag before first use.
-      catch_exceptions_(false) {
-  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
-}
-
-UnitTestImpl::~UnitTestImpl() {
-  // Deletes every TestSuite.
-  ForEach(test_suites_, internal::Delete<TestSuite>);
-
-  // Deletes every Environment.
-  ForEach(environments_, internal::Delete<Environment>);
-
-  delete os_stack_trace_getter_;
-}
-
-// Adds a TestProperty to the current TestResult object when invoked in a
-// context of a test, to current test suite's ad_hoc_test_result when invoke
-// from SetUpTestSuite/TearDownTestSuite, or to the global property set
-// otherwise.  If the result already contains a property with the same key,
-// the value will be updated.
-void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
-  std::string xml_element;
-  TestResult* test_result;  // TestResult appropriate for property recording.
-
-  if (current_test_info_ != nullptr) {
-    xml_element = "testcase";
-    test_result = &(current_test_info_->result_);
-  } else if (current_test_suite_ != nullptr) {
-    xml_element = "testsuite";
-    test_result = &(current_test_suite_->ad_hoc_test_result_);
-  } else {
-    xml_element = "testsuites";
-    test_result = &ad_hoc_test_result_;
-  }
-  test_result->RecordProperty(xml_element, test_property);
-}
-
-#if GTEST_HAS_DEATH_TEST
-// Disables event forwarding if the control is currently in a death test
-// subprocess. Must not be called before InitGoogleTest.
-void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
-  if (internal_run_death_test_flag_.get() != nullptr)
-    listeners()->SuppressEventForwarding();
-}
-#endif  // GTEST_HAS_DEATH_TEST
-
-// Initializes event listeners performing XML output as specified by
-// UnitTestOptions. Must not be called before InitGoogleTest.
-void UnitTestImpl::ConfigureXmlOutput() {
-  const std::string& output_format = UnitTestOptions::GetOutputFormat();
-  if (output_format == "xml") {
-    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
-        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
-  } else if (output_format == "json") {
-    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
-        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
-  } else if (output_format != "") {
-    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
-                        << output_format << "\" ignored.";
-  }
-}
-
-#if GTEST_CAN_STREAM_RESULTS_
-// Initializes event listeners for streaming test results in string form.
-// Must not be called before InitGoogleTest.
-void UnitTestImpl::ConfigureStreamingOutput() {
-  const std::string& target = GTEST_FLAG(stream_result_to);
-  if (!target.empty()) {
-    const size_t pos = target.find(':');
-    if (pos != std::string::npos) {
-      listeners()->Append(new StreamingListener(target.substr(0, pos),
-                                                target.substr(pos+1)));
-    } else {
-      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
-                          << "\" ignored.";
-    }
-  }
-}
-#endif  // GTEST_CAN_STREAM_RESULTS_
-
-// Performs initialization dependent upon flag values obtained in
-// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
-// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
-// this function is also called from RunAllTests.  Since this function can be
-// called more than once, it has to be idempotent.
-void UnitTestImpl::PostFlagParsingInit() {
-  // Ensures that this function does not execute more than once.
-  if (!post_flag_parse_init_performed_) {
-    post_flag_parse_init_performed_ = true;
-
-#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
-    // Register to send notifications about key process state changes.
-    listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
-#endif  // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
-
-#if GTEST_HAS_DEATH_TEST
-    InitDeathTestSubprocessControlInfo();
-    SuppressTestEventsIfInSubprocess();
-#endif  // GTEST_HAS_DEATH_TEST
-
-    // Registers parameterized tests. This makes parameterized tests
-    // available to the UnitTest reflection API without running
-    // RUN_ALL_TESTS.
-    RegisterParameterizedTests();
-
-    // Configures listeners for XML output. This makes it possible for users
-    // to shut down the default XML output before invoking RUN_ALL_TESTS.
-    ConfigureXmlOutput();
-
-#if GTEST_CAN_STREAM_RESULTS_
-    // Configures listeners for streaming test results to the specified server.
-    ConfigureStreamingOutput();
-#endif  // GTEST_CAN_STREAM_RESULTS_
-
-#if GTEST_HAS_ABSL
-    if (GTEST_FLAG(install_failure_signal_handler)) {
-      absl::FailureSignalHandlerOptions options;
-      absl::InstallFailureSignalHandler(options);
-    }
-#endif  // GTEST_HAS_ABSL
-  }
-}
-
-// A predicate that checks the name of a TestSuite against a known
-// value.
-//
-// This is used for implementation of the UnitTest class only.  We put
-// it in the anonymous namespace to prevent polluting the outer
-// namespace.
-//
-// TestSuiteNameIs is copyable.
-class TestSuiteNameIs {
- public:
-  // Constructor.
-  explicit TestSuiteNameIs(const std::string& name) : name_(name) {}
-
-  // Returns true iff the name of test_suite matches name_.
-  bool operator()(const TestSuite* test_suite) const {
-    return test_suite != nullptr &&
-           strcmp(test_suite->name(), name_.c_str()) == 0;
-  }
-
- private:
-  std::string name_;
-};
-
-// Finds and returns a TestSuite with the given name.  If one doesn't
-// exist, creates one and returns it.  It's the CALLER'S
-// RESPONSIBILITY to ensure that this function is only called WHEN THE
-// TESTS ARE NOT SHUFFLED.
-//
-// Arguments:
-//
-//   test_suite_name: name of the test suite
-//   type_param:     the name of the test suite's type parameter, or NULL if
-//                   this is not a typed or a type-parameterized test suite.
-//   set_up_tc:      pointer to the function that sets up the test suite
-//   tear_down_tc:   pointer to the function that tears down the test suite
-TestSuite* UnitTestImpl::GetTestSuite(
-    const char* test_suite_name, const char* type_param,
-    internal::SetUpTestSuiteFunc set_up_tc,
-    internal::TearDownTestSuiteFunc tear_down_tc) {
-  // Can we find a TestSuite with the given name?
-  const auto test_suite =
-      std::find_if(test_suites_.rbegin(), test_suites_.rend(),
-                   TestSuiteNameIs(test_suite_name));
-
-  if (test_suite != test_suites_.rend()) return *test_suite;
-
-  // No.  Let's create one.
-  auto* const new_test_suite =
-      new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
-
-  // Is this a death test suite?
-  if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
-                                               kDeathTestSuiteFilter)) {
-    // Yes.  Inserts the test suite after the last death test suite
-    // defined so far.  This only works when the test suites haven't
-    // been shuffled.  Otherwise we may end up running a death test
-    // after a non-death test.
-    ++last_death_test_suite_;
-    test_suites_.insert(test_suites_.begin() + last_death_test_suite_,
-                        new_test_suite);
-  } else {
-    // No.  Appends to the end of the list.
-    test_suites_.push_back(new_test_suite);
-  }
-
-  test_suite_indices_.push_back(static_cast<int>(test_suite_indices_.size()));
-  return new_test_suite;
-}
-
-// Helpers for setting up / tearing down the given environment.  They
-// are for use in the ForEach() function.
-static void SetUpEnvironment(Environment* env) { env->SetUp(); }
-static void TearDownEnvironment(Environment* env) { env->TearDown(); }
-
-// Runs all tests in this UnitTest object, prints the result, and
-// returns true if all tests are successful.  If any exception is
-// thrown during a test, the test is considered to be failed, but the
-// rest of the tests will still be run.
-//
-// When parameterized tests are enabled, it expands and registers
-// parameterized tests first in RegisterParameterizedTests().
-// All other functions called from RunAllTests() may safely assume that
-// parameterized tests are ready to be counted and run.
-bool UnitTestImpl::RunAllTests() {
-  // True iff Google Test is initialized before RUN_ALL_TESTS() is called.
-  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
-
-  // Do not run any test if the --help flag was specified.
-  if (g_help_flag)
-    return true;
-
-  // Repeats the call to the post-flag parsing initialization in case the
-  // user didn't call InitGoogleTest.
-  PostFlagParsingInit();
-
-  // Even if sharding is not on, test runners may want to use the
-  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
-  // protocol.
-  internal::WriteToShardStatusFileIfNeeded();
-
-  // True iff we are in a subprocess for running a thread-safe-style
-  // death test.
-  bool in_subprocess_for_death_test = false;
-
-#if GTEST_HAS_DEATH_TEST
-  in_subprocess_for_death_test =
-      (internal_run_death_test_flag_.get() != nullptr);
-# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
-  if (in_subprocess_for_death_test) {
-    GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
-  }
-# endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
-#endif  // GTEST_HAS_DEATH_TEST
-
-  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
-                                        in_subprocess_for_death_test);
-
-  // Compares the full test names with the filter to decide which
-  // tests to run.
-  const bool has_tests_to_run = FilterTests(should_shard
-                                              ? HONOR_SHARDING_PROTOCOL
-                                              : IGNORE_SHARDING_PROTOCOL) > 0;
-
-  // Lists the tests and exits if the --gtest_list_tests flag was specified.
-  if (GTEST_FLAG(list_tests)) {
-    // This must be called *after* FilterTests() has been called.
-    ListTestsMatchingFilter();
-    return true;
-  }
-
-  random_seed_ = GTEST_FLAG(shuffle) ?
-      GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
-
-  // True iff at least one test has failed.
-  bool failed = false;
-
-  TestEventListener* repeater = listeners()->repeater();
-
-  start_timestamp_ = GetTimeInMillis();
-  repeater->OnTestProgramStart(*parent_);
-
-  // How many times to repeat the tests?  We don't want to repeat them
-  // when we are inside the subprocess of a death test.
-  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
-  // Repeats forever if the repeat count is negative.
-  const bool gtest_repeat_forever = repeat < 0;
-  for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
-    // We want to preserve failures generated by ad-hoc test
-    // assertions executed before RUN_ALL_TESTS().
-    ClearNonAdHocTestResult();
-
-    const TimeInMillis start = GetTimeInMillis();
-
-    // Shuffles test suites and tests if requested.
-    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
-      random()->Reseed(static_cast<UInt32>(random_seed_));
-      // This should be done before calling OnTestIterationStart(),
-      // such that a test event listener can see the actual test order
-      // in the event.
-      ShuffleTests();
-    }
-
-    // Tells the unit test event listeners that the tests are about to start.
-    repeater->OnTestIterationStart(*parent_, i);
-
-    // Runs each test suite if there is at least one test to run.
-    if (has_tests_to_run) {
-      // Sets up all environments beforehand.
-      repeater->OnEnvironmentsSetUpStart(*parent_);
-      ForEach(environments_, SetUpEnvironment);
-      repeater->OnEnvironmentsSetUpEnd(*parent_);
-
-      // Runs the tests only if there was no fatal failure or skip triggered
-      // during global set-up.
-      if (Test::IsSkipped()) {
-        // Emit diagnostics when global set-up calls skip, as it will not be
-        // emitted by default.
-        TestResult& test_result =
-            *internal::GetUnitTestImpl()->current_test_result();
-        for (int j = 0; j < test_result.total_part_count(); ++j) {
-          const TestPartResult& test_part_result =
-              test_result.GetTestPartResult(j);
-          if (test_part_result.type() == TestPartResult::kSkip) {
-            const std::string& result = test_part_result.message();
-            printf("%s\n", result.c_str());
-          }
-        }
-        fflush(stdout);
-      } else if (!Test::HasFatalFailure()) {
-        for (int test_index = 0; test_index < total_test_suite_count();
-             test_index++) {
-          GetMutableSuiteCase(test_index)->Run();
-        }
-      }
-
-      // Tears down all environments in reverse order afterwards.
-      repeater->OnEnvironmentsTearDownStart(*parent_);
-      std::for_each(environments_.rbegin(), environments_.rend(),
-                    TearDownEnvironment);
-      repeater->OnEnvironmentsTearDownEnd(*parent_);
-    }
-
-    elapsed_time_ = GetTimeInMillis() - start;
-
-    // Tells the unit test event listener that the tests have just finished.
-    repeater->OnTestIterationEnd(*parent_, i);
-
-    // Gets the result and clears it.
-    if (!Passed()) {
-      failed = true;
-    }
-
-    // Restores the original test order after the iteration.  This
-    // allows the user to quickly repro a failure that happens in the
-    // N-th iteration without repeating the first (N - 1) iterations.
-    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
-    // case the user somehow changes the value of the flag somewhere
-    // (it's always safe to unshuffle the tests).
-    UnshuffleTests();
-
-    if (GTEST_FLAG(shuffle)) {
-      // Picks a new random seed for each iteration.
-      random_seed_ = GetNextRandomSeed(random_seed_);
-    }
-  }
-
-  repeater->OnTestProgramEnd(*parent_);
-
-  if (!gtest_is_initialized_before_run_all_tests) {
-    ColoredPrintf(
-        COLOR_RED,
-        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
-        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
-        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
-        " will start to enforce the valid usage. "
-        "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
-#if GTEST_FOR_GOOGLE_
-    ColoredPrintf(COLOR_RED,
-                  "For more details, see http://wiki/Main/ValidGUnitMain.\n");
-#endif  // GTEST_FOR_GOOGLE_
-  }
-
-  return !failed;
-}
-
-// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
-// if the variable is present. If a file already exists at this location, this
-// function will write over it. If the variable is present, but the file cannot
-// be created, prints an error and exits.
-void WriteToShardStatusFileIfNeeded() {
-  const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
-  if (test_shard_file != nullptr) {
-    FILE* const file = posix::FOpen(test_shard_file, "w");
-    if (file == nullptr) {
-      ColoredPrintf(COLOR_RED,
-                    "Could not write to the test shard status file \"%s\" "
-                    "specified by the %s environment variable.\n",
-                    test_shard_file, kTestShardStatusFile);
-      fflush(stdout);
-      exit(EXIT_FAILURE);
-    }
-    fclose(file);
-  }
-}
-
-// Checks whether sharding is enabled by examining the relevant
-// environment variable values. If the variables are present,
-// but inconsistent (i.e., shard_index >= total_shards), prints
-// an error and exits. If in_subprocess_for_death_test, sharding is
-// disabled because it must only be applied to the original test
-// process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
-                 const char* shard_index_env,
-                 bool in_subprocess_for_death_test) {
-  if (in_subprocess_for_death_test) {
-    return false;
-  }
-
-  const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
-  const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
-
-  if (total_shards == -1 && shard_index == -1) {
-    return false;
-  } else if (total_shards == -1 && shard_index != -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestShardIndex << " = " << shard_index
-      << ", but have left " << kTestTotalShards << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  } else if (total_shards != -1 && shard_index == -1) {
-    const Message msg = Message()
-      << "Invalid environment variables: you have "
-      << kTestTotalShards << " = " << total_shards
-      << ", but have left " << kTestShardIndex << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  } else if (shard_index < 0 || shard_index >= total_shards) {
-    const Message msg = Message()
-      << "Invalid environment variables: we require 0 <= "
-      << kTestShardIndex << " < " << kTestTotalShards
-      << ", but you have " << kTestShardIndex << "=" << shard_index
-      << ", " << kTestTotalShards << "=" << total_shards << ".\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
-    fflush(stdout);
-    exit(EXIT_FAILURE);
-  }
-
-  return total_shards > 1;
-}
-
-// Parses the environment variable var as an Int32. If it is unset,
-// returns default_val. If it is not an Int32, prints an error
-// and aborts.
-Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
-  const char* str_val = posix::GetEnv(var);
-  if (str_val == nullptr) {
-    return default_val;
-  }
-
-  Int32 result;
-  if (!ParseInt32(Message() << "The value of environment variable " << var,
-                  str_val, &result)) {
-    exit(EXIT_FAILURE);
-  }
-  return result;
-}
-
-// Given the total number of shards, the shard index, and the test id,
-// returns true iff the test should be run on this shard. The test id is
-// some arbitrary but unique non-negative integer assigned to each test
-// method. Assumes that 0 <= shard_index < total_shards.
-bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
-  return (test_id % total_shards) == shard_index;
-}
-
-// Compares the name of each test with the user-specified filter to
-// decide whether the test should be run, then records the result in
-// each TestSuite and TestInfo object.
-// If shard_tests == true, further filters tests based on sharding
-// variables in the environment - see
-// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
-// . Returns the number of tests that should run.
-int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
-  const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
-  const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
-      Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
-
-  // num_runnable_tests are the number of tests that will
-  // run across all shards (i.e., match filter and are not disabled).
-  // num_selected_tests are the number of tests to be run on
-  // this shard.
-  int num_runnable_tests = 0;
-  int num_selected_tests = 0;
-  for (auto* test_suite : test_suites_) {
-    const std::string& test_suite_name = test_suite->name();
-    test_suite->set_should_run(false);
-
-    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
-      TestInfo* const test_info = test_suite->test_info_list()[j];
-      const std::string test_name(test_info->name());
-      // A test is disabled if test suite name or test name matches
-      // kDisableTestFilter.
-      const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
-                                   test_suite_name, kDisableTestFilter) ||
-                               internal::UnitTestOptions::MatchesFilter(
-                                   test_name, kDisableTestFilter);
-      test_info->is_disabled_ = is_disabled;
-
-      const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
-          test_suite_name, test_name);
-      test_info->matches_filter_ = matches_filter;
-
-      const bool is_runnable =
-          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
-          matches_filter;
-
-      const bool is_in_another_shard =
-          shard_tests != IGNORE_SHARDING_PROTOCOL &&
-          !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests);
-      test_info->is_in_another_shard_ = is_in_another_shard;
-      const bool is_selected = is_runnable && !is_in_another_shard;
-
-      num_runnable_tests += is_runnable;
-      num_selected_tests += is_selected;
-
-      test_info->should_run_ = is_selected;
-      test_suite->set_should_run(test_suite->should_run() || is_selected);
-    }
-  }
-  return num_selected_tests;
-}
-
-// Prints the given C-string on a single line by replacing all '\n'
-// characters with string "\\n".  If the output takes more than
-// max_length characters, only prints the first max_length characters
-// and "...".
-static void PrintOnOneLine(const char* str, int max_length) {
-  if (str != nullptr) {
-    for (int i = 0; *str != '\0'; ++str) {
-      if (i >= max_length) {
-        printf("...");
-        break;
-      }
-      if (*str == '\n') {
-        printf("\\n");
-        i += 2;
-      } else {
-        printf("%c", *str);
-        ++i;
-      }
-    }
-  }
-}
-
-// Prints the names of the tests matching the user-specified filter flag.
-void UnitTestImpl::ListTestsMatchingFilter() {
-  // Print at most this many characters for each type/value parameter.
-  const int kMaxParamLength = 250;
-
-  for (auto* test_suite : test_suites_) {
-    bool printed_test_suite_name = false;
-
-    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
-      const TestInfo* const test_info = test_suite->test_info_list()[j];
-      if (test_info->matches_filter_) {
-        if (!printed_test_suite_name) {
-          printed_test_suite_name = true;
-          printf("%s.", test_suite->name());
-          if (test_suite->type_param() != nullptr) {
-            printf("  # %s = ", kTypeParamLabel);
-            // We print the type parameter on a single line to make
-            // the output easy to parse by a program.
-            PrintOnOneLine(test_suite->type_param(), kMaxParamLength);
-          }
-          printf("\n");
-        }
-        printf("  %s", test_info->name());
-        if (false && test_info->value_param() != nullptr) {
-          printf("  # %s = ", kValueParamLabel);
-          // We print the value parameter on a single line to make the
-          // output easy to parse by a program.
-          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
-        }
-        printf("\n");
-      }
-    }
-  }
-  fflush(stdout);
-  const std::string& output_format = UnitTestOptions::GetOutputFormat();
-  if (output_format == "xml" || output_format == "json") {
-    FILE* fileout = OpenFileForWriting(
-        UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
-    std::stringstream stream;
-    if (output_format == "xml") {
-      XmlUnitTestResultPrinter(
-          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
-          .PrintXmlTestsList(&stream, test_suites_);
-    } else if (output_format == "json") {
-      JsonUnitTestResultPrinter(
-          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
-          .PrintJsonTestList(&stream, test_suites_);
-    }
-    fprintf(fileout, "%s", StringStreamToString(&stream).c_str());
-    fclose(fileout);
-  }
-}
-
-// Sets the OS stack trace getter.
-//
-// Does nothing if the input and the current OS stack trace getter are
-// the same; otherwise, deletes the old getter and makes the input the
-// current getter.
-void UnitTestImpl::set_os_stack_trace_getter(
-    OsStackTraceGetterInterface* getter) {
-  if (os_stack_trace_getter_ != getter) {
-    delete os_stack_trace_getter_;
-    os_stack_trace_getter_ = getter;
-  }
-}
-
-// Returns the current OS stack trace getter if it is not NULL;
-// otherwise, creates an OsStackTraceGetter, makes it the current
-// getter, and returns it.
-OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
-  if (os_stack_trace_getter_ == nullptr) {
-#ifdef GTEST_OS_STACK_TRACE_GETTER_
-    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
-#else
-    os_stack_trace_getter_ = new OsStackTraceGetter;
-#endif  // GTEST_OS_STACK_TRACE_GETTER_
-  }
-
-  return os_stack_trace_getter_;
-}
-
-// Returns the most specific TestResult currently running.
-TestResult* UnitTestImpl::current_test_result() {
-  if (current_test_info_ != nullptr) {
-    return &current_test_info_->result_;
-  }
-  if (current_test_suite_ != nullptr) {
-    return &current_test_suite_->ad_hoc_test_result_;
-  }
-  return &ad_hoc_test_result_;
-}
-
-// Shuffles all test suites, and the tests within each test suite,
-// making sure that death tests are still run first.
-void UnitTestImpl::ShuffleTests() {
-  // Shuffles the death test suites.
-  ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_);
-
-  // Shuffles the non-death test suites.
-  ShuffleRange(random(), last_death_test_suite_ + 1,
-               static_cast<int>(test_suites_.size()), &test_suite_indices_);
-
-  // Shuffles the tests inside each test suite.
-  for (auto& test_suite : test_suites_) {
-    test_suite->ShuffleTests(random());
-  }
-}
-
-// Restores the test suites and tests to their order before the first shuffle.
-void UnitTestImpl::UnshuffleTests() {
-  for (size_t i = 0; i < test_suites_.size(); i++) {
-    // Unshuffles the tests in each test suite.
-    test_suites_[i]->UnshuffleTests();
-    // Resets the index of each test suite.
-    test_suite_indices_[i] = static_cast<int>(i);
-  }
-}
-
-// Returns the current OS stack trace as an std::string.
-//
-// The maximum number of stack frames to be included is specified by
-// the gtest_stack_trace_depth flag.  The skip_count parameter
-// specifies the number of top frames to be skipped, which doesn't
-// count against the number of frames to be included.
-//
-// For example, if Foo() calls Bar(), which in turn calls
-// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
-// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
-                                            int skip_count) {
-  // We pass skip_count + 1 to skip this wrapper function in addition
-  // to what the user really wants to skip.
-  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
-}
-
-// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
-// suppress unreachable code warnings.
-namespace {
-class ClassUniqueToAlwaysTrue {};
-}
-
-bool IsTrue(bool condition) { return condition; }
-
-bool AlwaysTrue() {
-#if GTEST_HAS_EXCEPTIONS
-  // This condition is always false so AlwaysTrue() never actually throws,
-  // but it makes the compiler think that it may throw.
-  if (IsTrue(false))
-    throw ClassUniqueToAlwaysTrue();
-#endif  // GTEST_HAS_EXCEPTIONS
-  return true;
-}
-
-// If *pstr starts with the given prefix, modifies *pstr to be right
-// past the prefix and returns true; otherwise leaves *pstr unchanged
-// and returns false.  None of pstr, *pstr, and prefix can be NULL.
-bool SkipPrefix(const char* prefix, const char** pstr) {
-  const size_t prefix_len = strlen(prefix);
-  if (strncmp(*pstr, prefix, prefix_len) == 0) {
-    *pstr += prefix_len;
-    return true;
-  }
-  return false;
-}
-
-// Parses a string as a command line flag.  The string should have
-// the format "--flag=value".  When def_optional is true, the "=value"
-// part can be omitted.
-//
-// Returns the value of the flag, or NULL if the parsing failed.
-static const char* ParseFlagValue(const char* str, const char* flag,
-                                  bool def_optional) {
-  // str and flag must not be NULL.
-  if (str == nullptr || flag == nullptr) return nullptr;
-
-  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
-  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
-  const size_t flag_len = flag_str.length();
-  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
-
-  // Skips the flag name.
-  const char* flag_end = str + flag_len;
-
-  // When def_optional is true, it's OK to not have a "=value" part.
-  if (def_optional && (flag_end[0] == '\0')) {
-    return flag_end;
-  }
-
-  // If def_optional is true and there are more characters after the
-  // flag name, or if def_optional is false, there must be a '=' after
-  // the flag name.
-  if (flag_end[0] != '=') return nullptr;
-
-  // Returns the string after "=".
-  return flag_end + 1;
-}
-
-// Parses a string for a bool flag, in the form of either
-// "--flag=value" or "--flag".
-//
-// In the former case, the value is taken as true as long as it does
-// not start with '0', 'f', or 'F'.
-//
-// In the latter case, the value is taken as true.
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, true);
-
-  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
-
-  // Converts the string value to a bool.
-  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
-  return true;
-}
-
-// Parses a string for an Int32 flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
-
-  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
-
-  // Sets *value to the value of the flag.
-  return ParseInt32(Message() << "The value of flag --" << flag,
-                    value_str, value);
-}
-
-// Parses a string for a string flag, in the form of
-// "--flag=value".
-//
-// On success, stores the value of the flag in *value, and returns
-// true.  On failure, returns false without changing *value.
-template <typename String>
-static bool ParseStringFlag(const char* str, const char* flag, String* value) {
-  // Gets the value of the flag as a string.
-  const char* const value_str = ParseFlagValue(str, flag, false);
-
-  // Aborts if the parsing failed.
-  if (value_str == nullptr) return false;
-
-  // Sets *value to the value of the flag.
-  *value = value_str;
-  return true;
-}
-
-// Determines whether a string has a prefix that Google Test uses for its
-// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
-// If Google Test detects that a command line flag has its prefix but is not
-// recognized, it will print its help message. Flags starting with
-// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
-// internal flags and do not trigger the help message.
-static bool HasGoogleTestFlagPrefix(const char* str) {
-  return (SkipPrefix("--", &str) ||
-          SkipPrefix("-", &str) ||
-          SkipPrefix("/", &str)) &&
-         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
-         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
-          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
-}
-
-// Prints a string containing code-encoded text.  The following escape
-// sequences can be used in the string to control the text color:
-//
-//   @@    prints a single '@' character.
-//   @R    changes the color to red.
-//   @G    changes the color to green.
-//   @Y    changes the color to yellow.
-//   @D    changes to the default terminal text color.
-//
-static void PrintColorEncoded(const char* str) {
-  GTestColor color = COLOR_DEFAULT;  // The current color.
-
-  // Conceptually, we split the string into segments divided by escape
-  // sequences.  Then we print one segment at a time.  At the end of
-  // each iteration, the str pointer advances to the beginning of the
-  // next segment.
-  for (;;) {
-    const char* p = strchr(str, '@');
-    if (p == nullptr) {
-      ColoredPrintf(color, "%s", str);
-      return;
-    }
-
-    ColoredPrintf(color, "%s", std::string(str, p).c_str());
-
-    const char ch = p[1];
-    str = p + 2;
-    if (ch == '@') {
-      ColoredPrintf(color, "@");
-    } else if (ch == 'D') {
-      color = COLOR_DEFAULT;
-    } else if (ch == 'R') {
-      color = COLOR_RED;
-    } else if (ch == 'G') {
-      color = COLOR_GREEN;
-    } else if (ch == 'Y') {
-      color = COLOR_YELLOW;
-    } else {
-      --str;
-    }
-  }
-}
-
-static const char kColorEncodedHelpMessage[] =
-"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
-"following command line flags to control its behavior:\n"
-"\n"
-"Test Selection:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
-"      List the names of all tests instead of running them. The name of\n"
-"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
-"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
-    "[@G-@YNEGATIVE_PATTERNS]@D\n"
-"      Run only the tests whose name matches one of the positive patterns but\n"
-"      none of the negative patterns. '?' matches any single character; '*'\n"
-"      matches any substring; ':' separates two patterns.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
-"      Run all disabled tests too.\n"
-"\n"
-"Test Execution:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
-"      Run the tests repeatedly; use a negative count to repeat forever.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
-"      Randomize tests' orders on every iteration.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
-"      Random number seed to use for shuffling test orders (between 1 and\n"
-"      99999, or 0 to use a seed based on the current time).\n"
-"\n"
-"Test Output:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
-"      Enable/disable colored output. The default is @Gauto@D.\n"
-"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
-"      Don't print the elapsed time of each test.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G"
-    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
-"      Generate a JSON or XML report in the given directory or with the given\n"
-"      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
-# if GTEST_CAN_STREAM_RESULTS_
-"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
-"      Stream test results to the given server.\n"
-# endif  // GTEST_CAN_STREAM_RESULTS_
-"\n"
-"Assertion Behavior:\n"
-# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
-"      Set the default death test style.\n"
-# endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
-"      Turn assertion failures into debugger break-points.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
-"      Turn assertion failures into C++ exceptions for use by an external\n"
-"      test framework.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
-"      Do not report exceptions as test failures. Instead, allow them\n"
-"      to crash the program or throw a pop-up (on Windows).\n"
-"\n"
-"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
-    "the corresponding\n"
-"environment variable of a flag (all letters in upper-case). For example, to\n"
-"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
-    "color=no@D or set\n"
-"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
-"\n"
-"For more information, please read the " GTEST_NAME_ " documentation at\n"
-"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
-"(not one in your own code or tests), please report it to\n"
-"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
-
-static bool ParseGoogleTestFlag(const char* const arg) {
-  return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
-                       &GTEST_FLAG(also_run_disabled_tests)) ||
-      ParseBoolFlag(arg, kBreakOnFailureFlag,
-                    &GTEST_FLAG(break_on_failure)) ||
-      ParseBoolFlag(arg, kCatchExceptionsFlag,
-                    &GTEST_FLAG(catch_exceptions)) ||
-      ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
-      ParseStringFlag(arg, kDeathTestStyleFlag,
-                      &GTEST_FLAG(death_test_style)) ||
-      ParseBoolFlag(arg, kDeathTestUseFork,
-                    &GTEST_FLAG(death_test_use_fork)) ||
-      ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
-      ParseStringFlag(arg, kInternalRunDeathTestFlag,
-                      &GTEST_FLAG(internal_run_death_test)) ||
-      ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
-      ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
-      ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
-      ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
-      ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
-      ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
-      ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
-      ParseInt32Flag(arg, kStackTraceDepthFlag,
-                     &GTEST_FLAG(stack_trace_depth)) ||
-      ParseStringFlag(arg, kStreamResultToFlag,
-                      &GTEST_FLAG(stream_result_to)) ||
-      ParseBoolFlag(arg, kThrowOnFailureFlag,
-                    &GTEST_FLAG(throw_on_failure));
-}
-
-#if GTEST_USE_OWN_FLAGFILE_FLAG_
-static void LoadFlagsFromFile(const std::string& path) {
-  FILE* flagfile = posix::FOpen(path.c_str(), "r");
-  if (!flagfile) {
-    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
-                      << "\"";
-  }
-  std::string contents(ReadEntireFile(flagfile));
-  posix::FClose(flagfile);
-  std::vector<std::string> lines;
-  SplitString(contents, '\n', &lines);
-  for (size_t i = 0; i < lines.size(); ++i) {
-    if (lines[i].empty())
-      continue;
-    if (!ParseGoogleTestFlag(lines[i].c_str()))
-      g_help_flag = true;
-  }
-}
-#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.  The type parameter CharType can be
-// instantiated to either char or wchar_t.
-template <typename CharType>
-void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
-  for (int i = 1; i < *argc; i++) {
-    const std::string arg_string = StreamableToString(argv[i]);
-    const char* const arg = arg_string.c_str();
-
-    using internal::ParseBoolFlag;
-    using internal::ParseInt32Flag;
-    using internal::ParseStringFlag;
-
-    bool remove_flag = false;
-    if (ParseGoogleTestFlag(arg)) {
-      remove_flag = true;
-#if GTEST_USE_OWN_FLAGFILE_FLAG_
-    } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
-      LoadFlagsFromFile(GTEST_FLAG(flagfile));
-      remove_flag = true;
-#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
-    } else if (arg_string == "--help" || arg_string == "-h" ||
-               arg_string == "-?" || arg_string == "/?" ||
-               HasGoogleTestFlagPrefix(arg)) {
-      // Both help flag and unrecognized Google Test flags (excluding
-      // internal ones) trigger help display.
-      g_help_flag = true;
-    }
-
-    if (remove_flag) {
-      // Shift the remainder of the argv list left by one.  Note
-      // that argv has (*argc + 1) elements, the last one always being
-      // NULL.  The following loop moves the trailing NULL element as
-      // well.
-      for (int j = i; j != *argc; j++) {
-        argv[j] = argv[j + 1];
-      }
-
-      // Decrements the argument count.
-      (*argc)--;
-
-      // We also need to decrement the iterator as we just removed
-      // an element.
-      i--;
-    }
-  }
-
-  if (g_help_flag) {
-    // We print the help here instead of in RUN_ALL_TESTS(), as the
-    // latter may not be called at all if the user is using Google
-    // Test with another testing framework.
-    PrintColorEncoded(kColorEncodedHelpMessage);
-  }
-}
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.
-void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-
-  // Fix the value of *_NSGetArgc() on macOS, but iff
-  // *_NSGetArgv() == argv
-  // Only applicable to char** version of argv
-#if GTEST_OS_MAC
-#ifndef GTEST_OS_IOS
-  if (*_NSGetArgv() == argv) {
-    *_NSGetArgc() = *argc;
-  }
-#endif
-#endif
-}
-void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-}
-
-// The internal implementation of InitGoogleTest().
-//
-// The type parameter CharType can be instantiated to either char or
-// wchar_t.
-template <typename CharType>
-void InitGoogleTestImpl(int* argc, CharType** argv) {
-  // We don't want to run the initialization code twice.
-  if (GTestIsInitialized()) return;
-
-  if (*argc <= 0) return;
-
-  g_argvs.clear();
-  for (int i = 0; i != *argc; i++) {
-    g_argvs.push_back(StreamableToString(argv[i]));
-  }
-
-#if GTEST_HAS_ABSL
-  absl::InitializeSymbolizer(g_argvs[0].c_str());
-#endif  // GTEST_HAS_ABSL
-
-  ParseGoogleTestFlagsOnly(argc, argv);
-  GetUnitTestImpl()->PostFlagParsingInit();
-}
-
-}  // namespace internal
-
-// Initializes Google Test.  This must be called before calling
-// RUN_ALL_TESTS().  In particular, it parses a command line for the
-// flags that Google Test recognizes.  Whenever a Google Test flag is
-// seen, it is removed from argv, and *argc is decremented.
-//
-// No value is returned.  Instead, the Google Test flag variables are
-// updated.
-//
-// Calling the function for the second time has no user-visible effect.
-void InitGoogleTest(int* argc, char** argv) {
-#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-  internal::InitGoogleTestImpl(argc, argv);
-#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-}
-
-// This overloaded version can be used in Windows programs compiled in
-// UNICODE mode.
-void InitGoogleTest(int* argc, wchar_t** argv) {
-#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-  internal::InitGoogleTestImpl(argc, argv);
-#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-}
-
-// This overloaded version can be used on Arduino/embedded platforms where
-// there is no argc/argv.
-void InitGoogleTest() {
-  // Since Arduino doesn't have a command line, fake out the argc/argv arguments
-  int argc = 1;
-  const auto arg0 = "dummy";
-  char* argv0 = const_cast<char*>(arg0);
-  char** argv = &argv0;
-
-#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
-#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-  internal::InitGoogleTestImpl(&argc, argv);
-#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
-}
-
-std::string TempDir() {
-#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
-  return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
-#endif
-
-#if GTEST_OS_WINDOWS_MOBILE
-  return "\\temp\\";
-#elif GTEST_OS_WINDOWS
-  const char* temp_dir = internal::posix::GetEnv("TEMP");
-  if (temp_dir == nullptr || temp_dir[0] == '\0')
-    return "\\temp\\";
-  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
-    return temp_dir;
-  else
-    return std::string(temp_dir) + "\\";
-#elif GTEST_OS_LINUX_ANDROID
-  return "/sdcard/";
-#else
-  return "/tmp/";
-#endif  // GTEST_OS_WINDOWS_MOBILE
-}
-
-// Class ScopedTrace
-
-// Pushes the given source file location and message onto a per-thread
-// trace stack maintained by Google Test.
-void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
-  internal::TraceInfo trace;
-  trace.file = file;
-  trace.line = line;
-  trace.message.swap(message);
-
-  UnitTest::GetInstance()->PushGTestTrace(trace);
-}
-
-// Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace()
-    GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
-  UnitTest::GetInstance()->PopGTestTrace();
-}
-
-}  // namespace testing
diff --git a/deps/googletest/src/gtest_main.cc b/deps/googletest/src/gtest_main.cc
deleted file mode 100644
index f6e1dd96f..000000000
--- a/deps/googletest/src/gtest_main.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright 2006, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <cstdio>
-#include "gtest/gtest.h"
-
-#ifdef ARDUINO
-void setup() {
-  testing::InitGoogleTest();
-}
-
-void loop() { RUN_ALL_TESTS(); }
-
-#else
-
-GTEST_API_ int main(int argc, char **argv) {
-  printf("Running main() from %s\n", __FILE__);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-#endif
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
deleted file mode 100644
index 87f9cd738..000000000
--- a/docs/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-cmake_minimum_required (VERSION 3.13)
-
-set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
-find_package(Sphinx REQUIRED)
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/Documentation/html")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
-    "${BINARY_BUILD_DIR}/conf.py"
-    @ONLY)
-
-add_custom_target(Documentation ALL
-    ${SPHINX_EXECUTABLE}
-        -Q -b html
-        -c "${BINARY_BUILD_DIR}"
-        -d "${SPHINX_CACHE_DIR}"
-        "${CMAKE_CURRENT_SOURCE_DIR}"
-        "${SPHINX_HTML_DIR}"
-    COMMENT "Building HTML documentation with Sphinx")
-
-install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/Documentation/html
-    DESTINATION "docs"
-    PATTERN "_static" EXCLUDE
-    PATTERN "_sources" EXCLUDE
-)
diff --git a/docs/README.md b/docs/README.md
deleted file mode 100644
index 040d22aac..000000000
--- a/docs/README.md
+++ /dev/null
@@ -1,16 +0,0 @@
-### oneMKL documentation
-
-This folder contains oneMKL documentation in reStructuredText (rST) format.
-
-The documentation build step is skipped by default.
-To enable building documentation from the main build, set `-DBUILD_DOC=ON`.
-For more information see [Building with CMake](../README.md#building-with-cmake).
-
-To build documentation only, use the following commands from the current folder:
-```bash
-# Inside <path to onemkl>/docs
-mkdir build && cd build
-cmake ..
-cmake --build .
-```
-Generated documentation can be found in `<path to onemkl>/docs/build/Documentation`
diff --git a/docs/_static/favicons.png b/docs/_static/favicons.png
deleted file mode 100644
index f450376b19e2d945e6d2da8b329de8aed320a9b7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 467
zcmeAS@N?(olHy`uVBq!ia0vp^0wB!63?wyl`GbKJbFq_W2nPqp?T7vkfZQzsJ|V8|
z1wPZ8J^^{vNsoafCh*ie2o%(1S)bwZ*nr{i!pV<{eBW$c`~#v=hGA`t*<A*PHR&Gr
z85kxoFsx)?FosBNUHa9I;Sz*Bq2_Ih`;}R3pO*B0%W}QVz_6=0;+hr1rFq?-v;FTz
znq6y7d8@{-jget`XX&fiZJ(W_j{-F|B)yhlI9(8OUzXtvGs7eXhJGG~C7Txh-0CsI
z3n*<<666=m@bTJ%%=6PWtg7m`^Jw|A`Lp(JKeqqir|u7ZO<W}nYxXaQd;IMA#aYu9
zoq2Tg&*pOrem{F(v~%ODz3oj~4{(;*{REoR?CIhdA~Ci0^id%uLjl)|=X@sl9lKjl
zUjF0%|0HL}s>65p27hI++iAAt!F9FH-3in5x7jA&dYmBX!*k>8p9}maQ@6zPbDmk8
z>2hFgpElF;@Hqk%_dazSu3wYHs?8U_V)m|`)BA<~sXKg?ZB<y)_k6C)GS%aZMZD8O
iB3A{mt17i`t7CS$$kD(2WV8n;xIA6`T-G@yGywpIRl08g

diff --git a/docs/_static/oneAPI-rgb-rev-100.png b/docs/_static/oneAPI-rgb-rev-100.png
deleted file mode 100644
index 58d2d5c54e586b53d027d39236038dc60f5fea94..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7414
zcmZ`;Wl$7Q*QT4LySt^OL11Z?TtHf+mr^=cP&%YhKtf7FQc`wl1f;uUK~i#;ZuoeA
zee?c!=gzs$oafGS|C}@ToHIAhKu?31fSv#i4UJe!6KM2=CQoL8hx4=#71Jj_0hY5e
zNEr>SKAG^&4*SVxwbwKPp`itHqoIXIqM_Y9QQ`m4(ELQu(Dq)Tp~-$mL!)`~<)@+i
z(*e$F9StDb<9}8m{9F1Hga1a;+y@Qq8OMJNJ+M^C|B1x))dH#EZlW_$J!eCYZ+fB_
zwzPoC;J~HByr9q4CfP^t+z#R6GJX(6$&yMjuu*18L<;E(97j?{0~gYN{vFkaZzerG
zs63GH!g4dq`u_blmhgByVj5>kHBx2~UOhZJ0G1}67IB3($eH<~wd29QAdOv&mM!1u
zDt||Qr~*E>v>H4w1#?3-jTy-&Y?hQM{IMmRNuc4e&Ds5-_=RplqtVH?Q1E%X$+gug
z`E5tR{T0apb35@Xz6n4^giImbAUfh=?GCf@+pJ6BgO!ZvxrhQLU$DE6w^`?XHC2qX
z#&;S1Nf^Yqs3{=3X|h4a8zO(RGEo_k9dd-gr~^m?r15AmYybyXCqCPsaKUCPE_Dx9
zao-t>i`(Tgj1_9R?6drYCj~8>CR=)5zo}MT6}!btlHFNe{cf=f=IO!{AxK2WrC><8
zlk{z3Af77_u*v9u>Eu`8^jr@)+jv;#3TaL;uv^YKC}&q<8}OjjI}7&i*=(*a*bKLf
zzzZbBi^%Sbt{JB{aGR8PmD3ZSiwbblHt+8PsE!v=Rz#{nWnV+QQoQd)zWUKDe>k;%
zRb&&kS6i|F0|#tdh9+KCB1Ux++@@5;yUDH|Ao16Qkd4y_@$B%kOi8`rp*bd`ORyvH
zP`mOkGWN4irb^t~<9jT6%->b~9?z2g)E!7yMDN{vA9|2EwG_Jn#_=LbDe+5tObpz_
zDp|j7{$kfI0F;JSe5LoO$U_)dn-q5kgAuCnr+9pPQAX%h#CN2DhxXd%r~oX=1AyPc
z&yqbvq2^Twz9);_a*e52+Rkaxk>-Z7bw4db*1`-VZAh?T?4N}@(<-8viKm|l+C_4k
zwP4@kUh}5k<DStzsUE5-ZcOno^^tl1`<UW>U6$UKTkfjq@Z@Yo<CwLNM_kD7593}2
z`n&WTXQiE0tJlA$B*)yMrDRQ!g-H!5p<{nc2XjasP^VFr(eqr6XC0a?PBWs1D>sBX
zfQDqN4n&z-7dug`6L(R8UQ#sjEja`^AUrD3s85_!G4R~O&gZQVt{?0cc7(oj@bz2^
z9t!VW&wRd;Ae0H2)VWJB3m{o=tHSn)orCo^%m!Iw&gD?;Gnq23*PXHE=5u8|XLH-R
zpnbIe<d2YOx`zKz47TgNnVmRb(fr8qWk0wQdNK1_D>cu5P=B$p5x>Ie%@2cor!FBB
zfFc~(7E2o{rPkzP$oCRVNUtLRdtW5a@vXyd_Ke0D1Zhsahwzr?XRC<_mFBF{CGtbK
zMh`wubV{iC{7{qONj&f!8pQ#}qtCvg25VN&smQGU9?{In@8E296cuL{tryrkTPf!v
z${#4a5&J+&2?pl?tbWWaR}GHR`>!C=eHn{*OZGBRy<%UF@dh#JYbg!*V-A=IU9`*I
zG=gZ=u+~BLG1%v!+>ae2tW#NPAS+ibk91N9#SE&))ZLM72Co$Lg0+nW2o9iwsOHS<
zyJwo)_wk%r^FuL`!Y>l1{$@6X^35+z%^Dam<dipT&o}OVh~-@&vBSl}71)w1rzdNh
z__Vkmx{a80%k$m}^c_-2IFx#*eC64$^~Nl`T?K5yKoIFr^_8-ND(7UUF7$G4sfwNe
z(#uEH<-H$9d&eD9vmt0<`<-i-Ry07ZHCHIdN2V#8?o@nf=IhzsG~Qr<tb>iyWP_F#
zyTR*wJ-&NOgC>t{7KW8P%$*0G(o2IHdbn>kyav83+rd>VYa|pL5_wnIGM*2A*u&`j
z603D{)2#nDgn#5L>Uxej-D9*IXk3*p{_7%*@9%2ORY)PBhNrWtKHa!6G(`Wi+kUE*
zPy5ATq)B;(uZvvW;mxlP!7l{QH!nmRq9&N>_3keb%C{T|W=jbnSr(j@U|Qf^nbC6=
z==i|$Q6RF)nYLYy{Y`0clTN{M`-p7FB7J+Yb4Ap23B-ogER(<+k!Er+*fKVnuIKV6
zo~Susm+;1P!CBDSUdr)Cut)Rc+Who<_ZT%DSHk~pFMj8xuEH}vsY7e6v>{YxLKfG<
z>QZqU-0h>TNb8v0tHHTLc(9tw%rNw%T06w9#}~*ZTra4IK-j~B`n&alXa53JW2DbW
zmi2soSs-s2Z@Yw`=>#UVwi_nR#fo`!b$ZN*f6`R|pO^KBGOr0ex?aP6x@^V0j7^V2
z@tr!yO&Q>WLiYNnVyWF;+2pWd8Ve?MBNrXJMFJq1N%yGecVO6**4_lItp>SkDHG6L
z7wh#46*U8uwji65CfOdn^1;tt7>3$Br1k?_iSf%chpj<~D;5?$Tg0C-LSogk7vc1p
zAC=qND-7~1jtBieSVrxxcn}eByFwax#|Y$D&Ia{xGk&C63+}{~MDLX1!>IWeC5+|L
zIE|l+Z$z)i$;U4mWHv-tS2`(21wzG@tD~k8C)$B^jZP4=zRljrbzYIYOqhJji*v4o
z@=BH*i)ZDLGyc3~>Kscs(<SNn?^%n$6@RZy-Vbu5$m6g`A&vO4518;>T=mlHJSsgk
zn@oEPgmnPQ<)gurCu|#XsuojxjKe)X#Gd~$69H9~-6h*-FZYOWFnDd<3>Er64+7j2
zsamKsxD5C?GgL-DwOJC+R;PkPlv%v0bl*`%T~QUhE{!*(FU3&da|p}6&Fp51D`~b$
zmF_mp)}@RN<Vp_Zi!&{3Bw6ZYn_iYQ9?7C7*b~n$-yW%TF^(ZMTw=QlTk|sDpWr1?
zjeFt5^K?Z4)gsh}a33m57q-K1DY;72t&RGWZ~_Kq$6`b{ATJ${R?M$gqO{Na)Y>)%
zsE68#kGM1DIa+~~c-iSM8anYx?^2CiWtA+h`M5m8j=~8H>;*jbT*eJ<GmYl-O4(Np
zmMIKj&sZ-f;}2TZ)6bfty}oJ@YyY6nM`gEluYuH!Du70z(iuG@7$+=;o1ZO*<z1Ko
zO6s|YS6Q-1sI(gtzNjfgqg6_)wFTv9N+3pb(jO13xO}lbsU^mw1fUWtHS&61oDfs?
zR%aRgzHt0yCM`y7bN1Q07DU)$`nSBh?E&n3KC$gjK3ct79hO~VotcxWe}|f^SNb8u
z$#2PD?*QRlJgU)eS%MlHpRU+G00%aM3L*(T-S0T-|I))6{Jt0sEnp6R>9mz_c^ppr
zaPakrE6WrR;%3s;3YM_dE(7nv#%;%k-~$wajm>`hBxi#42pJLgOFzM>U$o48G$T<V
z{!wO)ja8^0O8-1v3g~ZMY(>t5IRsBHG(WajhmFlgOL1p0p0N)w%l6gZ#C*R1S1Fn{
zTF5lX{U%u&Cg%pn$ZDO(=@vjQQR+NDZU!TpP7DT)Z0#=bzWg(D3|aLKSti&~bzeMZ
zWPiNnQ1p>+2x(}mYQJ@7Tq!%mYOdBSp&V)~s<xI{bPZA!tJc?V*)!tWY2jCs&7R}T
zr5xf`<7d(rw6OSoQAnQ;zdl(Sm!mPCRllgJFU*nA{CxD<#55_HzpJN~*E@uDrHF5Y
zK&}N_HyL-g$2Vn+=A=+1(y0-`eZtx~&^?tC#}vJ*F~aXLX+4xXxg%7bKZ0gRi#6Ld
z&R@wjIurZ#hXQ9_!?9JbRgFsZ_Fr7)juG_e%KfxFLfR1Cl4_wS`$e!0*;XY@WaL+)
zanX}brS)M!gWTwCOVwjD^^Wf0n@yil=_U0H{Z}VY*6YLQB?sP2Xf-*m>vT3Age=pd
z>&%y2htB|g0a2|tmH*sMRn$YUoN7`1ea8#lcaui5y=3o*D-$QwUr9vo>2hZ3w;x=4
zf|&WFeVgTcJBG?m$SnQ*2}gmiBO+r7n8E0_Jv>BPGS7>bSonrabIwI{4g)}0BUzRi
zX_Qy^s$2|dw;q~Du*!pnd{ez3=d0avYbMi*?3ETi#})hXCUr0v_*S+kw3_VlU3kaz
zC%PkI#N1OQ95Y78p>+n$V+Pt3HC4<qe$lMJvhc8M29G14&Vz^j6pt@2SZj7#L43@i
z3rH==Ww*S@!vw8A1-j9tLUOm+4y}_bu6IVi(~Uw}heqFmA@|2Cy>qy?F$bV*=<pQ2
ze_%&wwU4XstZRk))v=dTpx*#iQ$zi!`TqA0^X88;axS&*$hlPBwknIPN1yYt<5k$>
z3|&X4hra6VjDd~<Tib_^x&*7q9}np?ZR)*^z27|5g0NwaMogdSG7$y3u(y{A-Hz=@
zqHP%cN-Rur+Br|WanjW?s9bg2D1n#!S3o2O2T8P-UQ6X_l050jtgQrK;n%a<Nn?32
zb1kAXBAC)-5w4|ddZWL~($Cl4UOavwqg!`#o~I<)Tu#CMf(QMBQ`-Q?rU1945Q)45
ziRS8ZDmunRdiW#F)3b;(zXavDoqcsuz?CWjFI_sY3*n)xxD{-i3v^U7_R)O^@JRSR
zv@&^e2W*6W7Qi4Ib9l;26|o9Deu8SsbP@ue;_bSRo-3AogIA~2;PDZRQ~)gJ;*$S2
z_9((yB_M!e)HN5fDALrTU3#$b6AHV~j>@PKa&Qj)#q9o7oI>eZ*omXT-1e3mycEWq
z{`kU)^p4JDiCX>dNgX@5ifx+0m2p@dbzEG8RhE)mzNPw`N&Cx35u(gb$-I;*<n3uM
zmoPB;PMTd~vUkqa3Cu1EwdNz&=LJpV$nQK;34S%&^>3~c^I2~XAM}%z!r7Gzo_ar5
z<FKk~StIq|SgFK@Sif^kqHE)vtsEuij2N^ZPQ3+$f+cGb4;r<QVBiPs>uMD)q9Fyw
z6nK~O1x6Ho80*!GdSey^GEM&<q$BuEThZFaz<MX%KNMQ$e<mnvpZbM>;M%a8-wMRn
z!paYl!_sp%WE|VyvQ3>|eJITX$_%cyh~J<hN2~DK9+Dl6VR)Z}h<1~OJpFG-Or@cV
z!mD!bEO~IzE1I>Iq$9lzWrbf$y?cJo7hT0Oc3TjL_7UjY9d-{(t)p|yw#_>)-PNG6
zZ0pd!m7X?Ph~90;M{%5=0Hl=z93E01rs<|0<8mEV4e7|-t@mHK$Nl8BI(d1AeJ(Aj
z7Sf_m`+3&-KJM{lz2e)m-ylFj)-o?c!w)Aje{R{!h}B!A+u!Fav(wavpylV^nlAql
zX0aT6r(If`fCDLTL}bVjV)f!TF+hst044CodM=IBcUxi*IBlt)Nt`3t9vSU3#WXi*
zogk^8BN+V2oHK_~i0+4q1kGh_qu1>>3g>~GWpn;$j^%bgou^b+8y2D_wm42>mv-7C
zJB@e@G5ia6?r2?<Xs{*Tt}v&T8--H^97VRt^ly6R&H)IA=Ojucqs%`e*o30+Wrl?4
z`M3CA@$MLJUhp@BNPMiWm!Q1hB}hM+qKL^2uEJitg<;PoOBt1@%<J5481T82`Qs<x
z7)gBd;8*Ltiq#QC8dqbr+QCwbj9l52mVftMIQ^vQXW(lH;V9#N?=(-6-f=0<{dk^4
zX<i~tk2Bf8TE$DH@+;t%TSX%N+g@@gIFbD=JVk}PUTBrLTrYY=FNX7w^P?k0Hfb@O
z%rW=iH0SPiB%Cu|<!?jm9$C@*jJQgyGAe;O(n{`(_0b3iVNtLK(UCW3dfDAXP8sD(
zIs1kKHwu00r*`ge5yU{7GlziyqB`_$>@Ld`o)z2k9J_=4<+qI0_m!0?R*ETYI+zW}
z=@L&mVqtm@L${;zY`D!EOQNO^KojHW_rSCj2E!XG-YqYB$lt7`RSC+OP8C$!PbdTN
z7XldW)t<ff*@*?lwy#`Ft=_*H8b0#4uE2mDy7u|6iv{bGQDpb9sFc@5YSoMX%16}I
zdPtHM(;T<IVrhx@^IWt&YD5%Z7T|<-p*b^8FtdnUDwX<Rtfo7Zt;*IHYjieLTLzR_
zY%@#5S1F#ZfR@J<kA0urI0|DPtPOTNmmhERy;z9xz1Zbg`q`AODL@4j%ipcPe1M+H
z++97Of1I-?B9TfRkG9T(&y|fq-Ha_i0lrAoAafOlGM+)V8m2#s#`vgsf60Zn2Z0{{
zR>=iFTqk~13kLlJ?O=KGQaM3)rGMW}`QbVRO`tDnipE<?f)p_0PH$trTngDDB`&M$
zG)t&58{}UK=8&xVrH5GScQS$g?4;+WQdQH1DL)e2e&5>naGoqoYECs8x+{B`LXvly
zt>O7@`UdKxA@TY>hivdUO;)wvM<&F-@}pAfCGK+YmWGe<WE;zs?(Na^Eq4V~Kjd)c
zo3G#$%`?l5?>5bIWpIrR(-JXpvalfckgJL^(=F(B`}kd1g0p-XbMDWitPAeCKlA=)
zvIW-lirF?n<-KdNF&$gfLUb!k_&jp>aDRVJ`bz38-_q-s8p(d^Yu(L~FwyRYrsoMb
zPowtq2j#w@7~*E)HZ}7N(;g|EzUyp7x84aHjcWNByG2M0!m&A1bN_=IRnTH`hlNeN
zAw#ASQoCd(hK6`aC+4~Icj-Q-A^zMNbOA$wVX`RKpmGKK6{#;*qydQvKlW<?UFz}X
z&HC~V!Vm`8DkBb)L0L*C57}U>YD#6TzuVwXmYdb&d>BfsD9j$#IJ5flOsp0n;M+xu
zEalD-B^@l~H8Q){B2Lf6G7C}&7uy@EYf-}}Er4i*Y#;DFa-Nj3<5Q2&a;syvfGTCU
z?Gz|eMrJ=6`KiVV{_HwY7%t7qz?Y`?S+IRNtUGeeM94c`#NRf)8Ng7BQ7jkc#I5i<
zDmFg1Uo#0J1ZtYm_AJVt<kB+Y%QZ4xxUbXk=PootRRo$xrocB`dss`PnI_w%qpF3t
z)m69$<oTmgyoZXZM+%$b1FW1avk|kvG9L6)r}fm7-O2LWT3a4S=D$TW2({`EG03my
zQWZuAVsoYqE6)b^C?|ehH$lA@v)d#y>LOh?k>{U{-Hl7}t&EHGponIhy>9r$!AZjV
zK3?B3pl<}IX)M+?ML&Oe{GI^%l1<_n8#Z`z%?XnrD)v|dV`_*qs{!>PoYYH2O3kKv
zTYj39@pBmXeSBeXDOYAESx+Or6|9c9J?#Zwa!QXc>va`-#UHnFKY=qh!OTSmPJ;-f
z;raf4F$$mW;B53DnfQn!dMSlHD63km_ZL=Vjm~0{lu@BPzV4BpZG#yh!acHmvgn#f
z=NfKExi$~UfB-D$COl1CeM;YgHW!AR8E2g34IM{oU5)qghYqodyOne+SmV4!DN)%g
zdpZ{^-$scaYbIJs&rfozPEg})|ET{+mREZ4lBFhNG)P^ig$?xQJs!pnSJA9I+(+pD
zxw@Zfk|IR#cQZ&-mT|-bHx+CfRxcd45B&WW#>++}El(zq(0l{yOM)YH6m?uBX)1?9
z4XyUy)2|uAaV3oRUxQD?0ek2ikH=#!!l?#u;d+yO0f`EL0SVn*mD#_`qhOI6-u52C
z$r9QIB_L;0R^(oAl<DpUWi#d<J5)`RYeBYT?jl22@-shj_yymU*tFa9+se`$0Hy%i
zH3)d94r+iY$QNj86aAC&R=JV;vz;&JFb&Nq%`p`q4~p_DR=EeDT5z0M%8xo(g$D^w
z-oySM-pROL5oCVjV8@ssfKk1~a>uU08re83t^8z5a(B3KW*yFt!h0ikEL@vl1nEG~
zuldQJ3y(?pj&@IzlVv}`tbp3&!<g(1jes7C*&ui<@T8ACZV*6r9Udzo)H+VbPCjU?
zGl5E?)HkL?(HBNtecLfZ=fU`tfT!PZs*fJD#_*!~!5ZDVODCKL{Sk*VoG`BX3c4o$
zS|=$S)Rh~KfjY-Nl;0QfGheCX25x@y)48sX<zyMRpm|vtJ6IkoSRmZe#V)6|S#>Vf
zuC`eiwjBJ$-&9c0ew?N<0aK7$F`y*w>QAcZ`s#4<wgy!BIxD!XT=6y8F&&pcgiRNh
zqJ@20aOjd*+fQG#;Rxn+Rlu(}{R|X(ehEVBJS2>y6)%wQc#?TPy#2yCh5AfL<L>9B
z?yHmqNDETm(%)Lyr1s8cEwHd15cnb#)s@s`rncD;w)YM%&>ntji*balhy(kqVeO4|
zCH5vR7K;!=sZVQrw97Jl_8eWbuf~_6e%~Vu!%Xk^U;DOOxMeKGh*lr62N@hsigFOc
zKfu0;h`6t@mfD)$ce-;p36Ev$>;jRddI`2d6Jp)N-v3l1lkq9|dC2aa8hfShJ>Cv<
zSP9Jw^Ho#<ZUY@|L%l0`Xvk76)i_U`TMtioc&EA8FK33s-n?6}r7%t$CvTVc6S#it
zdgOipU0tvIa0?$l6!#`X`g0yX4fr^UTCjNbm#<R2*6Y3K5!kRi>CKsNFH9tsqV5+;
zGywLPU3c#p_U{4$hgi4^G+~^$?<<7k&4}XGqabG0u=Xqbx4cW|pI+|K(7aR!0TF!c
zKNaae#@-2&;qT9LkOj?SxZU0;W}PoybF#5Bu{Q*DH-+p`hsO$1r+N_9gas)p05=`k
zjmmA8n<mKj=AWYLPxe-Aa})cVk?h(Nj*!~9JqBm}T(eIYv>>})#K@yT)|M`H^<3g#
z)`iO_Xrfu6!;-G7ctBr>o^gNtMCe%Q`Pb$?gt;|Y`j;{zCdK)qSGlF+izsQR|I9hr
ztE?+v{`2+77NUiw7soX7%dHfx9uWX^6va9Pjg3v`KyR|Mj-NnhiiU#W>|=A>g4>TS
z?5_NKNpO1&3U<Icwod4XvtW&z8GIe&RA=B6u^aZ#j5<g5B-S&2ez%c@@sOvXirsgF
z=zK9y7FLs|%=r9>dpqlt!I1cP?WkgPgQ~0d<Qu+DZ2J1D0$@RHUHO6i4O89f(Ac)&
zeuQ-MAwwklYlws}KJ#PL*T4F4ND9~VKi_nhH)ir$3i*?n{@F1%JxHNDE9l)aoxiii
zv@-Rww}o%Vp8h(+P;l#`^d7joe<Z(nMUgym_r6G%sV=G}&#I8~{6s%M)#GCzYTNm_
z7mxq+K(3jF{Bv_y+Giy_GM@5*++HM?-ggZ$p7vEGK)K19bJvmcjg*=Dx^775{ViLc
zxtD|T9sRc9Jnn+Je38CJGX;rW$o(|o9cBK4*xpSesU=Ha;@0bj8MhL|Hc5DK#B{_e
zl7ukuXHlH%G;FIHV~p!#?&vv^1$$eH+oXIT`y1aS8HL|o;TJ*}^FON8ZP@|&vN4Gz
zYP5#9g_~dd!5NuGQBf@G-Sn6>)$=lr5fAUE>LZZk?K7@@p2vFYBQl^7EgJ3yfe7by
z-T`tH_Urir3{$jE<n2Kp7ua#*D8R(6FdL-r>#-2Y<FiYBhwj9~;5E!Hi`V*h-L{qP
zaJcfCqhBKXYvJ&PD;wOA*jS8*!EsaVyB9qn@3TQBz-Kh6=t1jrvv9(|e|I~Cgf(6?
zsSnumjj=Fup#N_Q9W|h0bNDjCc4wqEJ<Q4c_C4#deY@M<VQB4`U4ggyIp^WwWpQUr
zT-(3>IM-kFtRUUoW^$#v)_4u^6PP=jo~Md#`vvG#09gzTTmJ40^~-eh4pl}1g!bK_
zCw~3qX6)`%K-_#5rZ4&Flr3gww%3kU367P~V}RF6&yPNmf=+#1bb(vDx{bcM2e3|Q
z(IZyX%G5CFHNET8A0U^n+Dl&tJ6}gxdvC`lKob)Y6A=;-6%vyHi;2ogO2~?f3yDa|
liiiy6IiLKOz{AVI)hXzI3);F?^_~Q1T55X08kJY?{|_}8aQy%P

diff --git a/docs/_static/style.css b/docs/_static/style.css
deleted file mode 100644
index 23c9885b4..000000000
--- a/docs/_static/style.css
+++ /dev/null
@@ -1,141 +0,0 @@
-/* override table width restrictions */
-@media screen and (min-width: 767px) {
-
-   .wy-table-responsive table td {
-      /* !important prevents the common CSS stylesheets from overriding
-         this as on RTD they are loaded after this stylesheet */
-         
-      white-space: normal !important;
-   }
-}
-/*
-   .wy-table-responsive {
-      overflow: visible !important;
-   }
-*/
-
-/* make the page width fill the window */
-.wy-nav-content {
-   max-width: none;
-}
-
-
-.code-block-caption {
-    color: #000;
-    font: italic 85%/1 arial,sans-serif;
-    padding: 1em 0;
-    text-align: center;
-}
-
-
-.collapsible {
-  margin-left: -10px;
-  background-color: #f1f1f1;
-  cursor: pointer;
-  padding: 18px 18px 18px 10px;
-  width: 100%;
-  border: none;
-  text-align: left;
-  outline: none;
-  font-weight: 700;
-  font-family: "Roboto Slab","ff-tisa-web-pro","Georgia",Arial,sans-serif;
-}
-
-code.sig-name.descname {
-   white-space: initial;
-}
-
-table.optimization-notice td {
-   white-space: initial;
-}
-
-table.optimization-notice td p:last-child {
-   text-align: right;
-}
-
-img.with-border {
-   border:1px solid #021a40;
-}
-
-div.column {
-   float: left;
-   width: 50%;
-   padding: 10px;
- }
-
-/* Clear floats after the columns */
-.column:after {
- content: "";
- display: table;
- clear: both;
-}
-
-.comparison:after {
-   content: "";
-   display: table;
-   clear: both;
-  }
-
-div.admonition-container {
- width: 49%;
- padding: 0 3px 0 0;
-}
-
-/* Clear floats after the columns */
-.admonition-container:after {
- content: "";
- display: table;
- clear: both;
-}
-
-div.quotation {
-  background-color: #fffff1;
-  padding: 1em 0 0 1em;
-}
-
-.rst-content div[class^='highlight'] pre {
-   white-space: pre-wrap;
-}
-
-/* A workaround for https://github.com/readthedocs/sphinx_rtd_theme/issues/647
- * Override display for function signatures so that there is spacing between
- * types and arguments */
- .rst-content dl:not(.docutils) dt {
-   display: table-cell !important;
-}
-.rst-content dl:not(.docutils) dd {
-   margin-top: 6px;
-}
-
-/*
-.rst-content tt.literal, .rst-content code.literal, .highlight {
-   background: #f0f0f0;
-}
-.rst-content tt.literal, .rst-content code.literal {
-    color: #000000;
-}*/
-
-
-.eqno {
-   margin-left: 5px;
-   float: right;
-}
-.math .headerlink {
-   display: none;
-   visibility: hidden;
-}
-.math:hover .headerlink {
-   display: inline-block;
-   visibility: visible;
-   margin-right: -0.7em;
-}
-
-/* A workaround for https://github.com/readthedocs/sphinx_rtd_theme/issues/647
-* Override display for function signatures so that there is spacing between
-* types and arguments */
-.rst-content dl:not(.docutils) dt {
-   display: table-cell !important;
-}
-.rst-content dl:not(.docutils) dd {
-   margin-top: 6px;
-}
\ No newline at end of file
diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html
deleted file mode 100644
index cae88b2cc..000000000
--- a/docs/_templates/layout.html
+++ /dev/null
@@ -1,18 +0,0 @@
-{% extends "!layout.html" %}
-{% block extrahead %}
-<script type="text/javascript">
-  // Configure TMS settings
-  var wapLocalCode = 'us-en'; // Dynamically set per localized site; see mapping table for values
-  var wapSection = "oneapi-mkl"; // WAP team will give you a unique section for your site
-  // Load TMS
-  if (document.location.href.includes("oneapi-src.github.io")) {
-    (function () {
-      var url = 'https://www.intel.com/content/dam/www/global/wap/tms-loader.js'; // WAP file URL
-      var po = document.createElement('script'); po.type = 'text/javascript'; po.async = true; po.src = url;
-      var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(po, s);
-    })();
-  }
-</script>
-{% endblock %}
-<!-- any other content blocks -->
-<!--...-->
diff --git a/docs/building_and_running_tests.rst b/docs/building_and_running_tests.rst
deleted file mode 100644
index 43d3431af..000000000
--- a/docs/building_and_running_tests.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-.. _building_and_running_tests:
-
-Building and Running Tests
-==========================
-
-The functional tests are enabled by default, and can be enabled/disabled
-with the CMake build parameter ``-DBUILD_FUNCTIONAL_TESTS=True/False``. Only
-the tests relevant to the enabled backends and target domains will be built.
-
-Building tests for BLAS and LAPACK domains requires additional libraries for
-reference.
-
-* BLAS: Requires a reference BLAS library.
-* LAPACK: Requires a reference LAPACK library.
-
-For both BLAS and LAPACK, shared libraries supporting both 32 and 64 bit
-indexing are required.
-
-A reference LAPACK implementation (including BLAS) can be built as
-follows:
-
-.. code-block:: bash
-
-  git clone https://github.com/Reference-LAPACK/lapack.git 
-  cd lapack; mkdir -p build; cd build 
-  cmake -DCMAKE_INSTALL_PREFIX=~/lapack -DCBLAS=True -DLAPACK=True -DLAPACKE=True -DBUILD_INDEX64=True -DBUILD_SHARED_LIBS=True .. 
-  cmake --build . -j --target install 
-  cmake -DCMAKE_INSTALL_PREFIX=~/lapack -DCBLAS=True -DLAPACK=True -DLAPACKE=True -DBUILD_INDEX64=False -DBUILD_SHARED_LIBS=True .. 
-  cmake --build . -j --target install
-
-and then used in oneMKL by setting ``-DREF_BLAS_ROOT=/path/to/lapack/install``
-and ``-DREF_LAPACK_ROOT=/path/to/lapack/install``.
-
-You can re-run tests without re-building the entire project.
-
-To run the tests, either run test binaries individually, or use ``ctest`` CMake test driver program.
-
-.. code-block:: bash
-
-  # Run all tests
-  ctest
-  # Run only Gpu specific tests
-  ctest -R Gpu
-  # Exclude Cpu tests
-  ctest -E Cpu
-
-For more ``ctest`` options, refer to `ctest manual page <https://cmake.org/cmake/help/v3.13/manual/ctest.1.html>`_.
-
-If you encounter the ``BACKEND NOT FOUND EXCEPTION`` issue when running tests,
-you may need to add your ``<oneMKL build directory>/lib`` to your
-``LD_LIBRARY_PATH`` on Linux.
diff --git a/docs/building_the_project_with_adaptivecpp.rst b/docs/building_the_project_with_adaptivecpp.rst
deleted file mode 100644
index 98c763b90..000000000
--- a/docs/building_the_project_with_adaptivecpp.rst
+++ /dev/null
@@ -1,171 +0,0 @@
-.. _building_the_project_with_adaptivecpp:
-
-Building the Project with AdaptiveCpp
-=====================================
-
-.. _build_setup_with_adaptivecpp:
-
-Environment Setup
-#################
-
-#. 
-   Build and install AdaptiveCpp. For a detailed description of available
-   AdaptiveCpp backends, their dependencies, and installation, see the
-   `AdaptiveCpp installation readme
-   <https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/installing.md#compilation-flows>`_.
-
-#. 
-   Clone this project. The root directory of the cloned repository will be
-   referred to as ``<path to onemkl>``.
-
-#. 
-   Download and install the `required dependencies
-   <https://github.com/oneapi-src/oneMKL?tab=readme-ov-file#software-requirements>`_
-   manually.
-
-Build Commands
-###############
-
-In most cases, building oneMKL Interfaces is as simple as setting the compiler and
-selecting the desired backends to build with.
-
-On Linux (other OSes are not supported with the AdaptiveCpp compiler):
-
-.. code-block:: bash
-
-  # Inside <path to onemkl>
-  mkdir build && cd build
-  cmake .. -DONEMKL_SYCL_IMPLEMENTATION=hipsycl    \ # Indicate that AdaptiveCpp is being used.
-          -DENABLE_MKLGPU_BACKEND=False            \ # MKLGPU backend is not supported by AdaptiveCpp
-          -DENABLE_<BACKEND_NAME>_BACKEND=True     \ # Enable backend(s) (optional)
-          -DENABLE_<BACKEND_NAME_2>_BACKEND=True   \ # Multiple backends can be enabled at once.
-          -DHIPSYCL_TARGETS=omp\;hip:gfx90a,gfx906 \ # Set target architectures depending on supported devices.
-          -DBUILD_FUNCTIONAL_TESTS=False           \ # See section *Building the tests* for more on building tests. True by default.
-          -DBUILD_EXAMPLES=False                   # Optional: True by default.
-  cmake --build .
-  cmake --install . --prefix <path_to_install_dir> # required to have full package structure
-
-Backends should be enabled by setting ``-DENABLE_<BACKEND_NAME>_BACKEND=True`` for
-each desired backend. By default, the ``MKLGPU`` and ``MKLCPU`` backends are
-enabled, but ``MKLGPU`` must be disabled with AdaptiveCpp. The supported
-backends for the compilers are given in the table at `oneMKL supported
-configurations table
-<https://github.com/oneapi-src/oneMKL?tab=readme-ov-file#supported-configurations>`_,
-and the CMake option names are given in the table below. Some backends may
-require additional parameters to be set. See the relevant section below for
-additional guidance. The target architectures must be specified with
-``HIPSYCL_TARGETS``. See the `AdaptiveCpp documentation
-<https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md#adaptivecpp-targets-specification>`_.
-
-If a backend library supports multiple domains (i.e. BLAS, RNG), it may be
-desirable to only enable selected domains. For this, the ``TARGET_DOMAINS``
-variable should be set. For further details, see :ref:`build_target_domains`.
-
-By default, the library also additionally builds examples and tests. These can
-be disabled by setting the parameters ``BUILD_FUNCTIONAL_TESTS`` and
-``BUILD_EXAMPLES`` to False. Building the functional tests may require additional
-external libraries. See the section :ref:`building_and_running_tests` for more
-information.
-
-The most important supported build options are:
-
-.. list-table::
-   :header-rows: 1
-
-   * - CMake Option
-     - Supported Values
-     - Default Value 
-   * - ENABLE_MKLCPU_BACKEND
-     - True, False
-     - True      
-   * - ENABLE_CUBLAS_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_CURAND_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_NETLIB_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_ROCBLAS_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_ROCRAND_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_MKLCPU_THREAD_TBB
-     - True, False
-     - True      
-   * - BUILD_FUNCTIONAL_TESTS
-     - True, False
-     - True      
-   * - BUILD_EXAMPLES
-     - True, False
-     - True      
-   * - TARGET_DOMAINS (list)
-     - blas, rng
-     - All supported domains
-
-Some additional build options are given in
-:ref:`build_additional_options_dpcpp`.
-
-Backends
-########
-
-.. _build_for_cuda_adaptivecpp:
-
-Building for CUDA
-~~~~~~~~~~~~~~~~~
-
-The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND`` and
-``ENABLE_CURAND_BACKEND``.
-
-The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. For
-example, to target a Nvidia A100 (Ampere architecture), set
-``-DHIPSYCL_TARGETS=cuda:sm_80``, where the figure ``80`` corresponds to a CUDA
-compute capability of 8.0. The correspondence between compute capabilities and
-Nvidia GPU products is given on the `Nvidia website
-<https://developer.nvidia.com/cuda-gpus>`_. Multiple architectures can be
-enabled using a comma separated list. See the `AdaptiveCpp documentation
-<https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md#adaptivecpp-targets-specification>`_.
-
-No additional parameters are required for using CUDA libraries. In most cases,
-the CUDA libraries should be found automatically by CMake.
-
-.. _build_for_rocm_adaptivecpp:
-
-Building for ROCm
-~~~~~~~~~~~~~~~~~
-
-The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND`` and
-``ENABLE_ROCRAND_BACKEND``.
-
-The target architecture must be set using the ``HIPSYCL_TARGETS`` parameter. See
-the `AdaptiveCpp documentation
-<https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/using-hipsycl.md#adaptivecpp-targets-specification>`_.
-For example, to target the MI200 series, set ``-DHIPSYCL_TARGETS=hip:gfx90a``.
-Multiple architectures can be enabled using a comma separated list. For example,
-``-DHIPSYCL_TARGETS=hip:gfx906,gfx90a``, and multiple APIs with a semicolon
-(``-DHIPSYCL_TARGETS=omp\;hip:gfx906,gfx90a``).
-
-For common AMD GPU architectures, see the :ref:`build_for_ROCM_dpcpp` in the
-DPC++ build guide.
-
-.. _project_cleanup:
-
-Project Cleanup
-###############
-
-Most use-cases involve building the project without the need to clean up the
-build directory. However, if you wish to clean up the build directory, you can
-delete the ``build`` folder and create a new one. If you wish to clean up the
-build files but retain the build configuration, the following commands will help you
-do so.
-
-.. code-block:: sh
-
-  # If you use "GNU/Unix Makefiles" for building,
-  make clean
-
-  # If you use "Ninja" for building
-  ninja -t clean
diff --git a/docs/building_the_project_with_dpcpp.rst b/docs/building_the_project_with_dpcpp.rst
deleted file mode 100644
index 365028237..000000000
--- a/docs/building_the_project_with_dpcpp.rst
+++ /dev/null
@@ -1,475 +0,0 @@
-.. _building_the_project_with_dpcpp:
-
-Building the Project with DPC++
-===============================
-
-This page describes building the oneMKL Interfaces with either the Intel(R)
-oneAPI DPC++ Compiler or open-source oneAPI DPC++ Compiler. For guidance on
-building the project with AdaptiveCpp, see
-:ref:`building_the_project_with_adaptivecpp`.
-
-.. _build_setup_with_dpcpp:
-
-Environment Setup
-##################
-
-#. 
-   Install the required DPC++ compiler (Intel(R) DPC++ or Open DPC++ - see
-   :ref:`Selecting a Compiler<selecting_a_compiler>`).
-
-#. 
-   Clone this project. The root directory of the cloned repository will be
-   referred to as ``<path to onemkl>``.
-
-#. 
-   Build and install all `required dependencies
-   <https://github.com/oneapi-src/oneMKL?tab=readme-ov-file#software-requirements>`_. 
-
-.. _build_introduction_with_dpcpp:
-
-Build Commands
-###############
-
-The build commands for various compilers and backends differ mostly in setting
-the values of CMake options for compiler and backend. In this section, we
-describe the common build commands. We will discuss backend-specific details in
-the `Backends`_ section and provide examples in `CMake invocation examples`_.
-
-On Linux, the common form of the build command looks as follows (see `Building
-for Windows`_ for building on Windows):
-
-.. code-block:: bash
-
-  # Inside <path to onemkl>
-  mkdir build && cd build
-  cmake .. -DCMAKE_CXX_COMPILER=$CXX_COMPILER    \ # Should be icpx or clang++
-          -DCMAKE_C_COMPILER=$C_COMPILER         \ # Should be icx or clang
-          -DENABLE_MKLCPU_BACKEND=False          \ # Optional: The MKLCPU backend is True by default.
-          -DENABLE_MKLGPU_BACKEND=False          \ # Optional: The MKLGPU backend is True by default.
-          -DENABLE_<BACKEND_NAME>_BACKEND=True   \ # Enable any other backend(s) (optional)
-          -DENABLE_<BACKEND_NAME_2>_BACKEND=True \ # Multiple backends can be enabled at once.
-          -DBUILD_FUNCTIONAL_TESTS=False         \ # See page *Building and Running Tests* for more on building tests. True by default.
-          -DBUILD_EXAMPLES=False                   # Optional: True by default.
-  cmake --build .
-  cmake --install . --prefix <path_to_install_dir>  # required to have full package structure
-
-In the above, the ``$CXX_COMPILER`` and ``$C_COMPILER`` should be set to
-``icpx`` and ``icx`` respectively when using the Intel(R) oneAPI DPC++ Compiler,
-or ``clang++`` and ``clang`` respectively when using the Open DPC++ Compiler. 
-
-Backends should be enabled by setting ``-DENABLE_<BACKEND_NAME>_BACKEND=True`` for
-each desired backend. By default, only the ``MKLGPU`` and ``MKLCPU`` backends
-are enabled. Multiple backends for multiple device vendors can be enabled at
-once (albeit with limitations when using portBLAS and portFFT). The supported
-backends for the compilers are given in the table at `oneMKL supported
-configurations table
-<https://github.com/oneapi-src/oneMKL?tab=readme-ov-file#supported-configurations>`_,
-and the CMake option names are given in the table below. Some backends may
-require additional parameters to be set. See the relevant section below for
-additional guidance.
-
-If a backend library supports multiple domains (i.e., BLAS, LAPACK, DFT, RNG,
-sparse BLAS), it may be desirable to only enable selected domains. For this, the
-``TARGET_DOMAINS`` variable should be set. See the section `TARGET_DOMAINS`_.
-
-By default, the library also additionally builds examples and tests. These can
-be disabled by setting the parameters ``BUILD_FUNCTIONAL_TESTS`` and
-``BUILD_EXAMPLES`` to ``False``. Building the functional tests requires
-additional external libraries for the BLAS and LAPACK domains. See the section
-:ref:`building_and_running_tests` for more information.
-
-The most important supported build options are:
-
-.. list-table::
-   :header-rows: 1
-
-   * - CMake Option
-     - Supported Values
-     - Default Value 
-   * - ENABLE_MKLCPU_BACKEND
-     - True, False
-     - True      
-   * - ENABLE_MKLGPU_BACKEND
-     - True, False
-     - True      
-   * - ENABLE_CUBLAS_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_CUSOLVER_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_CUFFT_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_CURAND_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_NETLIB_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_ROCBLAS_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_ROCFFT_BACKEND
-     - True, False
-     - False    
-   * - ENABLE_ROCSOLVER_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_ROCRAND_BACKEND
-     - True, False
-     - False     
-   * - ENABLE_MKLCPU_THREAD_TBB
-     - True, False
-     - True      
-   * - ENABLE_PORTBLAS_BACKEND
-     - True, False
-     - False      
-   * - ENABLE_PORTFFT_BACKEND
-     - True, False
-     - False      
-   * - BUILD_FUNCTIONAL_TESTS
-     - True, False
-     - True      
-   * - BUILD_EXAMPLES
-     - True, False
-     - True      
-   * - TARGET_DOMAINS (list)
-     - blas, lapack, rng, dft, sparse_blas
-     - All domains 
-
-Some additional build options are given in the section `Additional build options`_.
-
-.. _build_target_domains:
-
-TARGET_DOMAINS
-^^^^^^^^^^^^^^
-
-oneMKL supports multiple domains: BLAS, DFT, LAPACK, RNG and sparse BLAS. The
-domains built by oneMKL can be selected using the ``TARGET_DOMAINS`` parameter.
-In most cases, ``TARGET_DOMAINS`` is set automatically according to the domains
-supported by the backend libraries enabled. However, while most backend
-libraries support only one of these domains, some may support multiple. For
-example, the ``MKLCPU`` backend supports every domain. To enable support for
-only the BLAS domain in the oneMKL Interfaces whilst compiling with ``MKLCPU``,
-``TARGET_DOMAINS`` could be set to ``blas``. To enable BLAS and DFT,
-``-DTARGET_DOMAINS="blas dft"`` would be used.
-
-
-Backends
-#########
-
-.. _build_for_intel_onemkl_dpcpp:
-
-Building for Intel(R) oneMKL
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The Intel(R) oneMKL backend supports multiple domains on both x86 CPUs and Intel
-GPUs. The MKLCPU backend using Intel(R) oneMKL for x86 CPU is enabled by
-default, and controlled with the parameter ``ENABLE_MKLCPU_BACKEND``. The MKLGPU
-backend using Intel(R) oneMKL for Intel GPU is enabled by default, and
-controlled with the parameter ``ENABLE_MKLGPU_BACKEND``.
-
-When using the Intel(R) oneAPI DPC++ Compiler, it is likely that Intel(R) oneMKL
-will be found automatically. If it is not, the parameter ``MKL_ROOT`` can be set
-to point to the installation prefix of Intel(R) oneMKL. Alternatively, the
-``MKLROOT`` environment variable can be set, either manually or by using an
-environment script provided by the package.
-
-
-.. _build_for_CUDA_dpcpp:
-
-Building for CUDA
-^^^^^^^^^^^^^^^^^
-
-The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``,
-``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``, and
-``ENABLE_CUSOLVER_BACKEND``.
-
-No additional parameters are required for using CUDA libraries. In most cases,
-the CUDA libraries should be found automatically by CMake.
-
-.. _build_for_ROCM_dpcpp:
-
-Building for ROCm
-^^^^^^^^^^^^^^^^^
-
-The ROCm backends can be enabled with ``ENABLE_ROCBLAS_BACKEND``,
-``ENABLE_ROCFFT_BACKEND``, ``ENABLE_ROCSOLVER_BACKEND`` and
-``ENABLE_ROCRAND_BACKEND``.
-
-For *RocBLAS*, *RocSOLVER* and *RocRAND*, the target device architecture must be
-set. This can be set using the ``HIP_TARGETS`` parameter. For example, to
-enable a build for MI200 series GPUs, ``-DHIP_TARGETS=gfx90a`` should be set.
-Currently, DPC++ can only build for a single HIP target at a time. This may
-change in future versions.
-
-A few often-used architectures are listed below:
-
-.. list-table::
-   :header-rows: 1
-
-   * - Architecture
-     - AMD GPU name
-   * - gfx90a
-     - AMD Instinct(TM) MI210/250/250X Accelerator
-   * - gfx908
-     - AMD Instinct(TM) MI 100 Accelerator
-   * - gfx906
-     - | AMD Radeon Instinct(TM) MI50/60 Accelerator
-       | AMD Radeon(TM) (Pro) VII Graphics Card
-   * - gfx900
-     - | Radeon Instinct(TM) MI 25 Accelerator
-       | Radeon(TM) RX Vega 64/56 Graphics
-
-For a host with ROCm installed, the device architecture can be retrieved via the
-``rocminfo`` tool. The architecture will be displayed in the ``Name:`` row.
-
-.. _build_for_portlibs_dpcpp:
-
-Pure SYCL backends: portBLAS and portFFT
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-`portBLAS <https://github.com/codeplaysoftware/portBLAS>`_ and `portFFT
-<https://github.com/codeplaysoftware/portFFT>`_ are experimental pure-SYCL
-backends that work on all SYCL targets supported by the DPC++ compiler. Since
-they support multiple targets, they cannot be enabled with other backends in the
-same domain, or the MKLCPU or MKLGPU backends. Both libraries are experimental
-and currently only support a subset of operations and features.
-
-For best performance, both libraries must be tuned. See the individual sections
-for more details.
-
-Both portBLAS and portFFT are used as header-only libraries, and will be
-downloaded automatically if not found.
-
-.. _build_for_portblas_dpcpp:
-
-Building for portBLAS
----------------------
-
-`portBLAS <https://github.com/codeplaysoftware/portBLAS>`_ is
-enabled by setting ``-DENABLE_PORTBLAS_BACKEND=True``.
-
-By default, the portBLAS backend is not tuned for any specific device.
-This tuning is required to achieve best performance.
-portBLAS can be tuned for a specific hardware target by adding compiler
-definitions in 2 ways:
-
-#.
-  Manually specify a tuning target with ``-DPORTBLAS_TUNING_TARGET=<target>``.
-  The list of portBLAS targets can be found
-  `here <https://github.com/codeplaysoftware/portBLAS#cmake-options>`_.
-  This will automatically set ``-fsycl-targets`` if needed.
-#.
-  If one target is set via ``-fsycl-targets`` the configuration step will
-  try to automatically detect the portBLAS tuning target. One can manually
-  specify ``-fsycl-targets`` via ``CMAKE_CXX_FLAGS``. See
-  `DPC++ User Manual <https://intel.github.io/llvm-docs/UsersManual.html>`_
-  for more information on ``-fsycl-targets``.
-
-portBLAS relies heavily on JIT compilation. This may cause time-outs on some
-systems. To avoid this issue, use ahead-of-time compilation through tuning
-targets or ``sycl-targets``.
-
-.. _build_for_portfft_dpcpp:
-
-Building for portFFT
----------------------
-
-`portFFT <https://github.com/codeplaysoftware/portFFT>`_ is enabled by setting
-``-DENABLE_PORTFFT_BACKEND=True``.
-
-By default, the portFFT backend is not tuned for any specific device. The tuning
-flags are detailed in the `portFFT
-<https://github.com/codeplaysoftware/portFFT>`_ repository, and can be set at
-configuration time. Note that some tuning configurations may be incompatible
-with some targets.
-
-The portFFT library is compiled using the same ``-fsycl-targets`` as specified
-by the ``CMAKE_CXX_FLAGS``. If none are found, it will compile for
-``-fsycl-targets=spir64``, and -if the compiler supports it-
-``nvptx64-nvidia-cuda``. To enable HIP targets, ``HIP_TARGETS`` must be
-specified. See `DPC++ User Manual
-<https://intel.github.io/llvm-docs/UsersManual.html>`_ for more information on
-``-fsycl-targets``.
-
-.. _build_additional_options_dpcpp:
-
-Additional Build Options
-##########################
-
-When building oneMKL the SYCL implementation can be specified by setting the
-``ONEMKL_SYCL_IMPLEMENTATION`` option. Possible values are:
-
-* ``dpc++`` (default) for the `Intel(R) oneAPI DPC++ Compiler
-  <https://software.intel.com/en-us/oneapi/dpc-compiler>`_ and for the `oneAPI
-  DPC++ Compiler <https://github.com/intel/llvm>`_ compilers.
-* ``hipsycl`` for the `AdaptiveCpp <https://github.com/illuhad/AdaptiveCpp>`_
-  SYCL implementation.
-Please see :ref:`building_the_project_with_adaptivecpp` if using this option.
-
-The following table provides details of CMake options and their default values:
-
-.. list-table::
-   :header-rows: 1
-
-   * - CMake Option
-     - Supported Values
-     - Default Value 
-   * - BUILD_SHARED_LIBS
-     - True, False
-     - True      
-   * - BUILD_DOC
-     - True, False
-     - False     
-
-
-.. note::
-  When building with ``clang++`` for AMD backends, you must additionally set
-  ``ONEAPI_DEVICE_SELECTOR`` to ``hip:gpu`` and provide ``-DHIP_TARGETS`` 
-  according to the targeted hardware. This backend has only been tested for the 
-  ``gfx90a`` architecture (MI210) at the time of writing. 
-
-.. note::
-  When building with ``BUILD_FUNCTIONAL_TESTS=True`` (default option) only a single CUDA backend can be built
-  (`#270 <https://github.com/oneapi-src/oneMKL/issues/270>`_).
-
-
-.. _build_invocation_examples_dpcpp:
-
-CMake invocation examples
-##########################
-
-Build oneMKL with support for Nvidia GPUs with tests
-disabled using the Ninja build system:
-
-.. code-block:: bash
-
-  cmake $ONEMKL_DIR \
-      -GNinja \
-      -DCMAKE_CXX_COMPILER=clang++ \
-      -DCMAKE_C_COMPILER=clang \
-      -DENABLE_MKLGPU_BACKEND=False \
-      -DENABLE_MKLCPU_BACKEND=False \
-      -DENABLE_CUFFT_BACKEND=True \
-      -DENABLE_CUBLAS_BACKEND=True \
-      -DENABLE_CUSOLVER_BACKEND=True \
-      -DENABLE_CURAND_BACKEND=True \
-      -DBUILD_FUNCTIONAL_TESTS=False
-
-``$ONEMKL_DIR`` points at the oneMKL source directory. The x86 CPU (``MKLCPU``)
-and Intel GPU (``MKLGPU``) backends are enabled by default, but are disabled
-here. The backends for Nvidia GPUs must all be explicitly enabled. The tests are
-disabled, but the examples will still be built.
-
-Building oneMKL with support for AMD GPUs with tests
-disabled:
-
-.. code-block:: bash
-
-  cmake $ONEMKL_DIR \
-      -DCMAKE_CXX_COMPILER=clang++ \ 
-      -DCMAKE_C_COMPILER=clang \
-      -DENABLE_MKLCPU_BACKEND=False \ 
-      -DENABLE_MKLGPU_BACKEND=False \
-      -DENABLE_ROCFFT_BACKEND=True  \ 
-      -DENABLE_ROCBLAS_BACKEND=True \
-      -DENABLE_ROCSOLVER_BACKEND=True \ 
-      -DHIP_TARGETS=gfx90a \
-      -DBUILD_FUNCTIONAL_TESTS=False
-
-``$ONEMKL_DIR`` points at the oneMKL source directory. The x86 CPU (``MKLCPU``)
-and Intel GPU (``MKLGPU``) backends are enabled by default, but are disabled
-here. The backends for AMD GPUs must all be explicitly enabled. The tests are
-disabled, but the examples will still be built.
-
-
-Build oneMKL for the DFT domain only with support for x86 CPU, Intel GPU, AMD
-GPU and Nvidia GPU with testing enabled:
-
-.. code-block:: bash
-
-  cmake $ONEMKL_DIR \ 
-      -DCMAKE_CXX_COMPILER=icpx \
-      -DCMAKE_C_COMPILER=icx \ 
-      -DENABLE_ROCFFT_BACKEND=True \
-      -DENABLE_CUFFT_BACKEND=True \
-      -DTARGET_DOMAINS=dft \
-      -DBUILD_EXAMPLES=False
-
-Note that this is not a supported configuration, and requires Codeplay's oneAPI
-for `AMD <https://developer.codeplay.com/products/oneapi/amd/home/>`_ and
-`Nvidia <https://developer.codeplay.com/products/oneapi/nvidia/home/>`_ GPU
-plugins. The MKLCPU and MKLGPU backends are enabled by
-default, with backends for Nvidia GPU and AMD GPU explicitly enabled.
-``-DTARGET_DOMAINS=dft`` causes only DFT backends to be built. If this was not
-set, the backend libraries to enable the use of BLAS, LAPACK and RNG with MKLGPU
-and MKLCPU would also be enabled. The build of examples is disabled. Since
-functional testing was not disabled, tests would be built.
-
-.. _project_cleanup:
-
-Project Cleanup
-###############
-
-Most use-cases involve building the project without the need to clean up the
-build directory. However, if you wish to clean up the build directory, you can
-delete the ``build`` folder and create a new one. If you wish to clean up the
-build files but retain the build configuration, the following commands will help you
-do so.
-
-.. code-block:: sh
-
-  # If you use "GNU/Unix Makefiles" for building,
-  make clean
-  
-  # If you use "Ninja" for building
-  ninja -t clean
-
-
-.. _build_for_windows_dpcpp:
-
-Building for Windows
-####################
-
-The Windows build is similar to the Linux build, albeit that `fewer backends are
-supported <https://github.com/oneapi-src/oneMKL?tab=readme-ov-file#windows>`_.
-Additionally, the Ninja build system must be used. For example:
-
-.. code-block:: bash
-
-  # Inside <path to onemkl>
-  md build && cd build
-  cmake .. -G Ninja [-DCMAKE_CXX_COMPILER=<path_to_icx_compiler>\bin\icx] # required only if icx is not found in environment variable PATH
-                    [-DCMAKE_C_COMPILER=<path_to_icx_compiler>\bin\icx]   # required only if icx is not found in environment variable PATH
-                    [-DMKL_ROOT=<mkl_install_prefix>]                     # required only if environment variable MKLROOT is not set
-                    [-DREF_BLAS_ROOT=<reference_blas_install_prefix>]     # required only for testing
-                    [-DREF_LAPACK_ROOT=<reference_lapack_install_prefix>] # required only for testing
-  ninja
-  ctest
-  cmake --install . --prefix <path_to_install_dir> # required to have full package structure
-
-.. _build_common_problems_dpcpp:
-
-Build FAQ
-#########
-
-clangrt builtins lib not found
-  Encountered when trying to build oneMKL with some ROCm libraries. There are
-  several possible solutions: * If building Open DPC++ from source, add
-  ``compiler-rt`` to the external projects compile option:
-  ``--llvm-external-projects compiler-rt``. * The *clangrt* from ROCm can be
-  used, depending on ROCm version: ``export
-  LIBRARY_PATH=/path/to/rocm-$rocm-version$/llvm/lib/clang/$clang-version$/lib/linux/:$LIBRARY_PATH``
-
-Could NOT find CBLAS (missing: CBLAS file)
-  Encountered when tests are enabled along with the BLAS domain. The tests
-  require a reference BLAS implementation, but cannot find one. Either install
-  or build a BLAS library and set ``-DREF_BLAS_ROOT`` as described in
-  :ref:`building_and_running_tests`. Alternatively, the tests can be disabled by
-  setting ``-DBUILD_FUNCTIONAL_TESTS=False``.
-
-error: invalid target ID ''; format is a processor name followed by an optional colon-delimited list of features followed by an enable/disable sign (e.g.,'gfx908:sramecc+:xnack-')
-  The ``HIP_TARGETS`` parameter has not been set. Please see `Building for ROCm`_.
-
diff --git a/docs/conf.py.in b/docs/conf.py.in
deleted file mode 100644
index d874dbab7..000000000
--- a/docs/conf.py.in
+++ /dev/null
@@ -1,198 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# Configuration file for the Sphinx documentation builder.
-#
-# This file does only contain a selection of the most common options. For a
-# full list see the documentation:
-# http://www.sphinx-doc.org/en/master/config
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, as shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-
-# -- Project information -----------------------------------------------------
-
-project = 'oneAPI Math Kernel Library Interfaces'
-copyright = '2020-2022, Intel Corporation'
-author = 'Intel Corporation'
-
-# The short X.Y version
-version = ''
-# The full version, including alpha/beta/rc tags
-release = '0.1'
-
-
-# -- General configuration ---------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.doctest',
-    'sphinx.ext.todo',
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
-
-# The master toctree document.
-master_doc = 'index'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = 'en'
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = None
-
-static_dir = '@CMAKE_CURRENT_SOURCE_DIR@/_static'
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'sphinx_book_theme'
-html_logo = f'{static_dir}/oneAPI-rgb-rev-100.png'
-html_favicon = f'{static_dir}/favicons.png'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#
-
-# Theme options
-html_theme_options = {
-'repository_url': 'https://github.com/oneapi-src/oneMKL',
-'path_to_docs': 'docs',
-'use_issues_button': True,
-'use_edit_page_button': True,
-'repository_branch': 'develop',
-'extra_footer': '<p align="right"><a href="https://www.intel.com/content/www/us/en/privacy/intel-cookie-notice.html">Cookies</a></p>',
-'navigation_with_keys': False,
-}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = [static_dir]
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# The default sidebars (for documents that don't match any pattern) are
-# defined by theme itself.  Builtin themes are using these templates by
-# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
-# 'searchbox.html']``.
-#
-# html_sidebars = {}
-
-
-# -- Options for HTMLHelp output ---------------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'reSTTemplatedoc'
-
-
-# -- Options for LaTeX output ------------------------------------------------
-
-latex_elements = {
-    # The paper size ('letterpaper' or 'a4paper').
-    #
-    # 'papersize': 'letterpaper',
-
-    # The font size ('10pt', '11pt' or '12pt').
-    #
-    # 'pointsize': '10pt',
-
-    # Additional stuff for the LaTeX preamble.
-    #
-    # 'preamble': '',
-
-    # Latex figure (float) alignment
-    #
-    # 'figure_align': 'htbp',
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-#  author, documentclass [howto, manual, or own class]).
-latex_documents = [
-    (master_doc, 'reSTTemplate.tex', 'reST Template Documentation',
-     'Ben Fitch', 'manual'),
-]
-
-
-# -- Options for manual page output ------------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'resttemplate', 'reST Template Documentation',
-     [author], 1)
-]
-
-
-# -- Options for Texinfo output ----------------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-#  dir menu entry, description, category)
-texinfo_documents = [
-    (master_doc, 'reSTTemplate', 'reST Template Documentation',
-     author, 'reSTTemplate', 'One line description of project.',
-     'Miscellaneous'),
-]
-
-
-# -- Options for Epub output -------------------------------------------------
-
-# Bibliographic Dublin Core info.
-epub_title = project
-
-# The unique identifier of the text. This can be a ISBN number
-# or the project homepage.
-#
-# epub_identifier = ''
-
-# A unique identification for the text.
-#
-# epub_uid = ''
-
-# A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
-
-
-# -- Extension configuration -------------------------------------------------
-
-# -- Options for todo extension ----------------------------------------------
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = True
diff --git a/docs/create_new_backend.rst b/docs/create_new_backend.rst
deleted file mode 100644
index 8f25bda33..000000000
--- a/docs/create_new_backend.rst
+++ /dev/null
@@ -1,513 +0,0 @@
-..
-  Copyright 2020 Intel Corporation
-
-.. _create_backend_wrappers:
-
-Integrating a Third-Party Library to oneAPI Math Kernel Library (oneMKL) Interfaces
-====================================================================================
-
-This step-by-step tutorial provides examples for enabling new third-party libraries in oneMKL.
-
-oneMKL has a header-based implementation of the interface layer (``include`` directory) and a source-based implementation of the backend layer for each third-party library (``src`` directory). To enable a third-party library, you must update both parts of oneMKL and integrate the new third-party library to the oneMKL build and test systems.
-
-For the new backend library and header naming please use the following template:
-
-.. code-block::
-
-    onemkl_<domain>_<3rd-party library short name>[<wrapper for specific target>]
-
-Where ``<wrapper for specific target>`` is required only if multiple wrappers are provided from the same 3rd-party library, e.g., wrappers with Intel oneMKL C API for CPU target ``onemkl_blas_mklcpu.so`` and wrappers with Intel oneMKL DPC++ API for GPU target ``onemkl_blas_mklgpu.so``.
-
-If there is no need for multiple wrappers, only ``<domain>`` and ``<3rd-party library short name>`` are required, e.g., ``onemkl_rng_curand.so``.
-
-`1. Create Header Files`_
-
-`2. Integrate Header Files`_
-
-`3. Create Wrappers`_
-
-`4. Integrate Wrappers to the Build System`_
-
-`5. Update the Test System`_
-
-.. _generate_header_files:
-
-1. Create Header Files
-----------------------
-
-For each new backend library, you should create the following two header files:
-
-* Header file with a declaration of entry points to the new third-party library wrappers
-* Compile-time dispatching interface (see `oneMKL Usage Models <../README.md#supported-usage-models>`_) for new third-party libraries
-
-**Header File Example**: command to generate the header file with a declaration of BLAS entry points in the oneapi::mkl::newlib namespace 
-
-.. code-block:: bash
-
-    python scripts/generate_backend_api.py include/oneapi/mkl/blas.hpp \                                  # Base header file
-                                           include/oneapi/mkl/blas/detail/newlib/onemkl_blas_newlib.hpp \ # Output header file
-                                           oneapi::mkl::newlib                                            # Wrappers namespace
-
-Code snippet of the generated header file ``include/oneapi/mkl/blas/detail/newlib/onemkl_blas_newlib.hpp``
-
-.. code-block:: cpp
-
-    namespace oneapi {
-    namespace mkl {
-    namespace newlib {
-    
-    void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-              sycl::buffer<float, 1> &result);
-
-
-
-**Compile-time Dispatching Interface Example**: command to generate the compile-time dispatching interface template instantiations for ``newlib`` and supported device ``newdevice``
-
-.. code-block:: bash
-
-    python scripts/generate_ct_instant.py   include/oneapi/mkl/blas/detail/blas_ct_templates.hpp \         # Base header file
-                                            include/oneapi/mkl/blas/detail/newlib/blas_ct.hpp \            # Output header file
-                                            include/oneapi/mkl/blas/detail/newlib/onemkl_blas_newlib.hpp \ # Header file with declaration of entry points to wrappers
-                                            newlib \                                                       # Library name
-                                            newdevice \                                                    # Backend name
-                                            oneapi::mkl::newlib                                            # Wrappers namespace
-
-Code snippet of the generated header file ``include/oneapi/mkl/blas/detail/newlib/blas_ct.hpp``
-
-.. code-block:: cpp
-
-    namespace oneapi {
-    namespace mkl {
-    namespace blas {
-    
-    template <>
-    void asum<library::newlib, backend::newdevice>(sycl::queue &queue, std::int64_t n,
-                                                   sycl::buffer<float, 1> &x, std::int64_t incx,
-                                                   sycl::buffer<float, 1> &result) {
-        asum_precondition(queue, n, x, incx, result);
-        oneapi::mkl::newlib::asum(queue, n, x, incx, result);
-        asum_postcondition(queue, n, x, incx, result);
-    }
-
-
-.. _integrate_header_files:
-
-2. Integrate Header Files
--------------------------
-
-Below you can see the structure of the oneMKL top-level include directory:
-
-::
-
-    include/
-        oneapi/
-            mkl/
-                mkl.hpp -> oneMKL spec APIs
-                types.hpp  -> oneMKL spec types
-                blas.hpp   -> oneMKL BLAS APIs w/ pre-check/dispatching/post-check
-                detail/    -> implementation specific header files
-                    exceptions.hpp        -> oneMKL exception classes
-                    backends.hpp          -> list of oneMKL backends
-                    backends_table.hpp    -> table of backend libraries for each domain and device
-                    get_device_id.hpp     -> function to query device information from queue for Run-time dispatching
-                blas/
-                    predicates.hpp -> oneMKL BLAS pre-check post-check
-                    detail/        -> BLAS domain specific implementation details
-                        blas_loader.hpp       -> oneMKL Run-time BLAS API
-                        blas_ct_templates.hpp -> oneMKL Compile-time BLAS API general templates
-                        cublas/
-                            blas_ct.hpp            -> oneMKL Compile-time BLAS API template instantiations for <cublas>
-                            onemkl_blas_cublas.hpp -> backend wrappers library API
-                        mklcpu/
-                            blas_ct.hpp            -> oneMKL Compile-time BLAS API template instantiations for <mklcpu>
-                            onemkl_blas_mklcpu.hpp -> backend wrappers library API
-                        <other backends>/
-                <other domains>/
-
-
-To integrate the new third-party library into the oneMKL header-based part, the following files from this structure should be updated:
-
-* ``include/oneapi/mkl/detail/backends.hpp``: add the new backend
-
-  **Example**: add the ``newbackend`` backend
-
-  .. code-block:: diff
-
-        enum class backend { mklcpu,
-     +                       newbackend,
-
-
-  .. code-block:: diff
-
-        static backendmap backend_map = { { backend::mklcpu, "mklcpu" },
-     +                                    { backend::newbackend, "newbackend" },
-
-* ``include/oneapi/mkl/detail/backends_table.hpp``: add new backend library for supported domain(s) and device(s)
-
-  **Example**: enable ``newlib`` for ``blas`` domain and ``newdevice`` device
-
-  .. code-block:: diff
-    
-        enum class device : uint16_t { x86cpu,
-                                       ...
-     +                                 newdevice
-                                     };
-        
-        static std::map<domain, std::map<device, std::vector<const char*>>> libraries = {
-            { domain::blas,
-              { { device::x86cpu,
-                  {
-        #ifdef ENABLE_MKLCPU_BACKEND
-                      LIB_NAME("blas_mklcpu")
-        #endif
-                   } },
-     +          { device::newdevice,
-     +            {
-     +  #ifdef ENABLE_NEWLIB_BACKEND
-     +                 LIB_NAME("blas_newlib")
-     +  #endif
-     +             } },
-
-* ``include/oneapi/mkl/detail/get_device_id.hpp``: add new device detection mechanism for Run-time dispatching
-
-  **Example**: enable ``newdevice`` if the queue is targeted for the Host
-
-  .. code-block:: diff
-    
-        inline oneapi::mkl::device get_device_id(sycl::queue &queue) {
-            oneapi::mkl::device device_id;
-     +      if (queue.is_host())
-     +          device_id=device::newdevice;
-
-* ``include/oneapi/mkl/blas.hpp``: include the generated header file for the compile-time dispatching interface (see `oneMKL Usage Models <../README.md#supported-usage-models>`_)
-
-  **Example**: add ``include/oneapi/mkl/blas/detail/newlib/blas_ct.hpp`` generated at the `1. Create Header Files`_ step
-    
-  .. code-block:: diff
-    
-        #include "oneapi/mkl/blas/detail/mklcpu/blas_ct.hpp"
-        #include "oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp"
-     +  #include "oneapi/mkl/blas/detail/newlib/blas_ct.hpp"
-
-
-The new files generated at the `1. Create Header Files`_ step result in the following updated structure of the BLAS domain header files.
-
-.. code-block:: diff
-
-    include/
-        oneapi/
-            mkl/
-                blas.hpp -> oneMKL BLAS APIs w/ pre-check/dispatching/post-check
-                blas/
-                    predicates.hpp -> oneMKL BLAS pre-check post-check
-                    detail/        -> BLAS domain specific implementation details
-                        blas_loader.hpp       -> oneMKL Run-time BLAS API
-                        blas_ct_templates.hpp -> oneMKL Compile-time BLAS API general templates
-                        cublas/
-                            blas_ct.hpp            -> oneMKL Compile-time BLAS API template instantiations for <cublas>
-                            onemkl_blas_cublas.hpp -> backend wrappers library API
-                        mklcpu/
-                            blas_ct.hpp            -> oneMKL Compile-time BLAS API template instantiations for <mklcpu>
-                            onemkl_blas_mklcpu.hpp -> backend wrappers library API
-        +              newlib/
-        +                  blas_ct.hpp            -> oneMKL Compile-time BLAS API template instantiations for <newlib>
-        +                  onemkl_blas_newlib.hpp -> backend wrappers library API
-                        <other backends>/
-                <other domains>/
-
-.. _generate_wrappers_and_cmake:
-
-3. Create Wrappers
-------------------
-Wrappers convert Data Parallel C++ (DPC++) input data types to third-party library data types and call corresponding implementation from the third-party library. Wrappers for each third-party library are built to separate oneMKL backend libraries. The ``libonemkl.so`` dispatcher library loads the wrappers at run-time if you are using the interface for run-time dispatching, or you will link with them directly in case you are using the interface for compile-time dispatching (for more information see `oneMKL Usage Models <../README.md#supported-usage-models>`_).
-
-All wrappers and dispatcher library implementations are in the ``src`` directory:
-
-::
-
-    src/
-        include/
-            function_table_initializer.hpp -> general loader implementation w/ global libraries table
-        blas/
-            function_table.hpp -> loaded BLAS functions declaration
-            blas_loader.cpp -> BLAS wrappers for loader
-            backends/
-                cublas/ -> cuBLAS wrappers
-                mklcpu/ -> Intel oneMKL CPU wrappers
-                mklgpu/ -> Intel oneMKL GPU wrappers
-                <other backend libraries>/
-        <other domains>/
-
-Each backend library should contain a table of all functions from the chosen domain.
-
-``scripts/generate_wrappers.py`` can help to generate wrappers with the "Not implemented" exception for all functions based on the provided header file.
-
-You can modify wrappers generated with this script to enable third-party library functionality.
-
-**Example**: generate wrappers for ``newlib`` based on the header files generated and integrated previously, and enable only one ``asum`` function
-
-The command below generates two new files:
-
-* ``src/blas/backends/newlib/newlib_wrappers.cpp`` - DPC++ wrappers for all functions from ``include/oneapi/mkl/blas/detail/newlib/onemkl_blas_newlib.hpp``
-* ``src/blas/backends/newlib/newlib_wrappers_table_dyn.cpp`` - structure of symbols for run-time dispatcher (in the same location as wrappers), suffix ``_dyn`` indicates that this file is required for dynamic library only.
-
-.. code-block:: bash
-
-    python scripts/generate_wrappers.py include/oneapi/mkl/blas/detail/newlib/onemkl_blas_newlib.hpp \ # Base header file
-                                        src/blas/function_table.hpp \                                  # Declaration for structure of symbols
-                                        src/blas/backends/newlib/newlib_wrappers.cpp \                 # Output wrappers
-                                        newlib                                                         # Library name
-
-You can then modify ``src/blas/backends/newlib/newlib_wrappers.cpp`` to enable the C function ``newlib_sasum`` from the third-party library ``libnewlib.so``.
-
-To enable this function:
-
-* Include the header file ``newlib.h`` with the ``newlib_sasum`` function declaration
-* Convert all DPC++ parameters to proper C types: use the ``get_access`` method for input and output DPC++ buffers to get raw pointers
-* Submit the DPC++ kernel with a C function call to ``newlib`` as ``single_task``
-
-The following code snippet is updated for ``src/blas/backends/newlib/newlib_wrappers.cpp``:
-
-.. code-block:: diff
-
-        #if __has_include(<sycl/sycl.hpp>)
-        #include <sycl/sycl.hpp>
-        #else
-        #include <CL/sycl.hpp>
-        #endif
-        
-        #include "oneapi/mkl/types.hpp"
-        
-        #include "oneapi/mkl/blas/detail/newlib/onemkl_blas_newlib.hpp"
-    +    
-    +    #include "newlib.h"
-        
-        namespace oneapi {
-        namespace mkl {
-        namespace newlib {
-        
-        void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-                   sycl::buffer<float, 1> &result) {
-    -       throw std::runtime_error("Not implemented for newlib");
-    +       queue.submit([&](sycl::handler &cgh) {
-    +           auto accessor_x      = x.get_access<sycl::access::mode::read>(cgh);
-    +           auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-    +           cgh.single_task<class newlib_sasum>([=]() {
-    +               accessor_result[0] = ::newlib_sasum((const int)n, accessor_x.get_pointer(), (const int)incx);
-    +           });
-    +       });
-        }
-        
-        void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-                  sycl::buffer<double, 1> &result) {
-            throw std::runtime_error("Not implemented for newlib");
-        }
-
-Updated structure of the ``src`` folder with the ``newlib`` wrappers:
-
-.. code-block:: diff
-
-    src/
-        blas/
-            loader.hpp -> general loader implementation w/ global libraries table
-            function_table.hpp -> loaded BLAS functions declaration
-            blas_loader.cpp -> BLAS wrappers for loader
-            backends/
-                cublas/ -> cuBLAS wrappers
-                mklcpu/ -> Intel oneMKL CPU wrappers
-                mklgpu/ -> Intel oneMKL GPU wrappers
-     +          newlib/
-     +              newlib.h
-     +              newlib_wrappers.cpp
-     +              newlib_wrappers_table_dyn.cpp
-                <other backend libraries>/
-        <other domains>/
-
-.. _integrate_backend_to_build_system:
-
-4. Integrate Wrappers to the Build System
------------------------------------------
-Here is the list of files that should be created/updated to integrate the new wrappers for the third-party library to the oneMKL build system:
-
-* Add the new option ``ENABLE_XXX_BACKEND`` for the new third-party library to the top of the ``CMakeLists.txt`` file.
-
-  **Example**: changes for ``newlib`` in the top of the ``CMakeLists.txt`` file
-
-  .. code-block:: diff
-
-            option(ENABLE_MKLCPU_BACKEND "" ON)
-            option(ENABLE_MKLGPU_BACKEND "" ON)
-        +   option(ENABLE_NEWLIB_BACKEND "" ON)
-
-* Add the new directory (``src/<domain>/backends/<new_directory>``) with the wrappers for the new third-party library under the ``ENABLE_XXX_BACKEND`` condition to the ``src/<domain>/backends/CMakeLists.txt`` file.
-
-  **Example**: changes for ``newlib`` in ``src/blas/backends/CMakeLists.txt``
-
-  .. code-block:: diff
-    
-            if(ENABLE_MKLCPU_BACKEND)
-                add_subdirectory(mklcpu)
-            endif()
-        +    
-        +   if(ENABLE_NEWLIB_BACKEND)
-        +       add_subdirectory(newlib)
-        +   endif()
-
-* Create the ``cmake/FindXXX.cmake`` cmake config file to find the new third-party library and its dependencies.
-
-  **Example**: new config file ``cmake/FindNEWLIB.cmake`` for ``newlib``
-    
-  .. code-block:: cmake
-    
-        include_guard()
-        # Find library by name in NEWLIB_ROOT cmake variable or environment variable NEWLIBROOT
-        find_library(NEWLIB_LIBRARY NAMES newlib
-            HINTS ${NEWLIB_ROOT} $ENV{NEWLIBROOT}
-            PATH_SUFFIXES "lib")
-        # Make sure that the library was found
-        include(FindPackageHandleStandardArgs)
-        find_package_handle_standard_args(NEWLIB REQUIRED_VARS NEWLIB_LIBRARY)
-        # Set cmake target for the library
-        add_library(ONEMKL::NEWLIB::NEWLIB UNKNOWN IMPORTED)
-        set_target_properties(ONEMKL::NEWLIB::NEWLIB PROPERTIES
-            IMPORTED_LOCATION ${NEWLIB_LIBRARY})
-
-* Create the ``src/<domain>/backends/<new_directory>/CMakeLists.txt`` cmake config file to specify how to build the backend layer for the new third-party library.
-
-  ``scripts/generate_cmake.py`` can help to generate the initial ``src/<domain>/backends/<new_directory>/CMakeLists.txt`` config file automatically for all files in the directory.
-  Note: all source files with the ``_dyn`` suffix are added to build if the target is a dynamic library only.
-  
-  **Example**: command to generate the cmake config file for the ``src/blas/backends/newlib`` directory
-
-  .. code-block:: bash
-
-    python scripts/generate_cmake.py src/blas/backends/newlib \ # Full path to the directory
-                                     newlib                     # Library name
-
-  You should manually update the generated config file with information about the new ``cmake/FindXXX.cmake`` file and instructions about how to link with the third-party library.
-  
-  **Example**: update the generated ``src/blas/backends/newlib/CMakeLists.txt`` file
-
-  .. code-block:: diff
-
-            # Add third-party library
-        -   # find_package(XXX REQUIRED)
-        +   find_package(NEWLIB REQUIRED)
-    
-  .. code-block:: diff
-
-            target_link_libraries(${LIB_OBJ}
-                PUBLIC ONEMKL::SYCL::SYCL
-        -       # Add third-party library to link with here
-        +       PUBLIC ONEMKL::NEWLIB::NEWLIB
-            )
-
-Now you can build the backend library for ``newlib`` to make sure the third-party library integration was completed successfully (for more information, see `Build with cmake <../README.md#building-with-cmake>`_)
-
-.. code-block:: bash
-
-    cd build/
-    cmake .. -DNEWLIB_ROOT=<path/to/newlib> \
-        -DENABLE_MKLCPU_BACKEND=OFF \
-        -DENABLE_MKLGPU_BACKEND=OFF \
-        -DENABLE_NEWLIB_BACKEND=ON \           # Enable new third-party library backend
-        -DBUILD_FUNCTIONAL_TESTS=OFF           # At this step we only want to build
-    cmake --build . -j4
-
-.. _integrate_backend_to_test_system:
-
-5. Update the Test System
--------------------------
-
-Update the following files to enable the new third-party library for unit tests:
-
-* ``src/config.hpp.in``: add a cmake option for the new third-party library so this macro can be propagated to unit tests
-    
-  **Example**: add ``ENABLE_NEWLIB_BACKEND``
-
-  .. code-block:: diff
-    
-        #cmakedefine ENABLE_MKLCPU_BACKEND
-     +  #cmakedefine ENABLE_NEWLIB_BACKEND
-
-* ``tests/unit_tests/CMakeLists.txt``: add instructions about how to link tests with the new backend library
-
-  **Example**: add the ``newlib`` backend library
-
-  .. code-block:: diff
-    
-        if(ENABLE_MKLCPU_BACKEND)
-            add_dependencies(test_main_ct onemkl_blas_mklcpu)
-            if(BUILD_SHARED_LIBS)
-                list(APPEND ONEMKL_LIBRARIES onemkl_blas_mklcpu)
-            else()
-                list(APPEND ONEMKL_LIBRARIES -foffload-static-lib=${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libonemkl_blas_mklcpu.a)
-                find_package(MKL REQUIRED)
-                list(APPEND ONEMKL_LIBRARIES ${MKL_LINK_C})
-            endif()
-        endif()
-     +
-     +    if(ENABLE_NEWLIB_BACKEND)
-     +       add_dependencies(test_main_ct onemkl_blas_newlib)
-     +       if(BUILD_SHARED_LIBS)
-     +           list(APPEND ONEMKL_LIBRARIES onemkl_blas_newlib)
-     +       else()
-     +           list(APPEND ONEMKL_LIBRARIES -foffload-static-lib=${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libonemkl_blas_newlib.a)
-     +           find_package(NEWLIB REQUIRED)
-     +           list(APPEND ONEMKL_LIBRARIES ONEMKL::NEWLIB::NEWLIB)
-     +       endif()
-     +   endif()
-
-* ``tests/unit_tests/include/test_helper.hpp``: add the helper function for the compile-time dispatching interface with the new backend, and specify the device for which it should be called
-
-  **Example**: add the helper function for the ``newlib`` compile-time dispatching interface with ``newdevice`` if it is the Host
-
-  .. code-block:: diff
-    
-        #ifdef ENABLE_MKLGPU_BACKEND
-            #define TEST_RUN_INTELGPU(q, func, args) \
-                func<oneapi::mkl::backend::mklgpu> args
-        #else
-            #define TEST_RUN_INTELGPU(q, func, args)
-        #endif
-     +    
-     +  #ifdef ENABLE_NEWLIB_BACKEND
-     +     #define TEST_RUN_NEWDEVICE(q, func, args) \
-     +         func<oneapi::mkl::backend::newbackend> args
-     +  #else
-     +      #define TEST_RUN_NEWDEVICE(q, func, args)
-     +  #endif
- 
-  .. code-block:: diff
- 
-        #define TEST_RUN_CT(q, func, args)               \
-            do {                                         \
-     +          if (q.is_host())                         \
-     +              TEST_RUN_NEWDEVICE(q, func, args);   \ 
-
-
-* ``tests/unit_tests/main_test.cpp``: add the targeted device to the vector of devices to test
-
-  **Example**: add the targeted device CPU for ``newlib``
-
-  .. code-block:: diff
-    
-                }
-            }
-     +           
-     +  #ifdef ENABLE_NEWLIB_BACKEND
-     +      devices.push_back(sycl::device(sycl::host_selector()));
-     +  #endif
-
-Now you can build and run functional testing for enabled third-party libraries (for more information see `Build with cmake <../README.md#building-with-cmake>`_).
-
-.. code-block:: bash
-
-    cd build/
-    cmake .. -DNEWLIB_ROOT=<path/to/newlib> \
-        -DENABLE_MKLCPU_BACKEND=OFF \
-        -DENABLE_MKLGPU_BACKEND=OFF \
-        -DENABLE_NEWLIB_BACKEND=ON  \
-        -DBUILD_FUNCTIONAL_TESTS=ON
-    cmake --build . -j4
-    ctest
diff --git a/docs/domains/blas/asum.rst b/docs/domains/blas/asum.rst
deleted file mode 100644
index 1fc02c84c..000000000
--- a/docs/domains/blas/asum.rst
+++ /dev/null
@@ -1,158 +0,0 @@
-.. _onemkl_blas_asum:
-
-asum
-====
-
-Computes the sum of magnitudes of the vector elements.
-
-.. _onemkl_blas_asum_description:
-
-.. rubric:: Description
-
-The ``asum`` routine computes the sum of the magnitudes of elements of a
-real vector, or the sum of magnitudes of the real and imaginary parts
-of elements of a complex vector:
-
-.. math::
-
-   result = \sum_{i=1}^{n}(|Re(x_i)| + |Im(x_i)|) 
-   
-where ``x`` is a vector with ``n`` elements.
-
-``asum`` supports the following precisions for data:
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_res 
-      * -  ``float`` 
-        -  ``float`` 
-      * -  ``double`` 
-        -  ``double`` 
-      * -  ``std::complex<float>`` 
-        -  ``float`` 
-      * -  ``std::complex<double>`` 
-        -  ``double`` 
-
-.. _onemkl_blas_asum_buffer:
-
-asum (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void asum(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T_res,1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void asum(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T_res,1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-   
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Buffer where the scalar result is stored (the sum of magnitudes of
-      the real and imaginary parts of all elements of the vector).
-
-
-.. _onemkl_blas_asum_usm:
-
-asum (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event asum(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        T_res *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event asum(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        T_res *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Pointer to input vector ``x``. The array holding the vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Pointer to the memory where the scalar result is stored
-      (the sum of magnitudes of the real and imaginary parts of all
-      elements of the vector).
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/axpby.rst b/docs/domains/blas/axpby.rst
deleted file mode 100644
index f95247fd6..000000000
--- a/docs/domains/blas/axpby.rst
+++ /dev/null
@@ -1,180 +0,0 @@
-.. _onemkl_blas_axpby:
-
-axpby
-=====
-
-Computes a vector-scalar product added to a scaled-vector.
-
-.. _onemkl_blas_axpby_description:
-
-.. rubric:: Description
-
-The ``axpby`` routines compute two scalar-vector products and add them:
-
-.. math::
-
-      y \leftarrow beta * y + alpha * x
-
-where ``x`` and ``y`` are vectors of ``n`` elements and ``alpha`` and ``beta`` are scalars.
-
-``axpby`` supports the following precisions.
-
-   .. list-table::
-      :header-rows: 1
-
-      * -  T
-      * -  ``float``
-      * -  ``double``
-      * -  ``std::complex<float>``
-      * -  ``std::complex<double>``
-
-.. _onemkl_blas_axpby_buffer:
-
-axpby (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void axpby(sycl::queue &queue,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x, std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y, std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void axpby(sycl::queue &queue,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x, std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y, std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   alpha
-      Specifies the scalar ``alpha``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at least
-      (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride between two consecutive elements of the ``x`` vector.
-
-   beta
-      Specifies the scalar ``beta``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at least
-      (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride between two consecutive elements of the ``y`` vector.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_axpby_usm:
-
-axpby (USM Version)
--------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event axpby(sycl::queue &queue,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x, std::int64_t incx,
-                        const T beta,
-                        T *y, std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event axpby(sycl::queue &queue,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x, std::int64_t incx,
-                        const T beta,
-                        T *y, std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   alpha
-      Specifies the scalar alpha.
-
-   beta
-      Specifies the scalar beta.
-
-   x
-      Pointer to the input vector ``x``. The allocated memory must be
-      of size at least (1 + (``n`` - 1)*abs(``incx``)). See
-      :ref:`matrix-storage` for more details.
-
-   incx
-      Stride between consecutive elements of the ``x`` vector.
-
-   y
-      Pointer to the input vector ``y``. The allocated memory must be
-      of size at least (1 + (``n`` - 1)*abs(``incy``)). See
-      :ref:`matrix-storage` for more details.
-
-   incy
-      Stride between consecutive elements of the ``y`` vector.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Array holding the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
-
diff --git a/docs/domains/blas/axpy.rst b/docs/domains/blas/axpy.rst
deleted file mode 100644
index a3f5a69e3..000000000
--- a/docs/domains/blas/axpy.rst
+++ /dev/null
@@ -1,184 +0,0 @@
-.. _onemkl_blas_axpy:
-
-axpy
-====
-
-Computes a vector-scalar product and adds the result to a vector.
-
-.. _onemkl_blas_axpy_description:
-      
-.. rubric:: Description
-
-The ``axpy`` routines compute a scalar-vector product and add the result
-to a vector:
-
-.. math::
-
-      y \leftarrow alpha * x + y
-
-where:
-
-``x`` and ``y`` are vectors of ``n`` elements,
-
-``alpha`` is a scalar.
-
-``axpy`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_axpy_buffer:
-
-axpy (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void axpy(sycl::queue &queue,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void axpy(sycl::queue &queue,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   alpha
-      Specifies the scalar alpha.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at least
-      (1 + (``n`` – 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at least
-      (1 + (``n`` – 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_axpy_usm:
-
-axpy (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event axpy(sycl::queue &queue,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event axpy(sycl::queue &queue,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   alpha
-      Specifies the scalar alpha.
-
-   x
-      Pointer to the input vector ``x``. The array holding the vector
-      ``x`` must be of size at least (1 + (``n`` – 1)*abs(``incx``)). See
-      :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to the input vector ``y``. The array holding the vector
-      ``y`` must be of size at least (1 + (``n`` – 1)*abs(``incy``)). See
-      :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/axpy_batch.rst b/docs/domains/blas/axpy_batch.rst
deleted file mode 100644
index 308ed2b29..000000000
--- a/docs/domains/blas/axpy_batch.rst
+++ /dev/null
@@ -1,350 +0,0 @@
-.. _onemkl_blas_axpy_batch:
-
-axpy_batch
-==========
-
-Computes a group of ``axpy`` operations.
-
-.. _onemkl_blas_axpy_batch_description:
-
-.. rubric:: Description
-
-The ``axpy_batch`` routines are batched versions of :ref:`onemkl_blas_axpy`, performing
-multiple ``axpy`` operations in a single call. Each ``axpy`` 
-operation adds a scalar-vector product to a vector.
-   
-``axpy_batch`` supports the following precisions for data.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_axpy_batch_buffer:
-
-axpy_batch (Buffer Version)
----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``axpy_batch`` supports only the strided API. 
-
-The strided API operation is defined as:
-::
-  
-   for i = 0 … batch_size – 1
-      X and Y are vectors at offset i * stridex, i * stridey in x and y
-      Y := alpha * X + Y
-   end for
-
-where:
-
-``alpha`` is scalar,
-
-``X`` and ``Y`` are vectors.
-   
-**Strided API**
-
-.. rubric:: Syntax
- 
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void axpy_batch(sycl::queue &queue,
-                       std::int64_t n,
-                       T alpha,
-                       sycl::buffer<T,
-                       1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       sycl::buffer<T,
-                       1> &y,
-                       std::int64_t incy,
-                       std::int64_t stridey,
-                       std::int64_t batch_size)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void axpy_batch(sycl::queue &queue,
-                       std::int64_t n,
-                       T alpha,
-                       sycl::buffer<T,
-                       1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       sycl::buffer<T,
-                       1> &y,
-                       std::int64_t incy,
-                       std::int64_t stridey,
-                       std::int64_t batch_size)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in ``X`` and ``Y``.
-
-   alpha
-       Specifies the scalar ``alpha``.
-
-   x
-      Buffer holding input vectors ``X`` with size ``stridex`` * ``batch_size``.
-
-   incx 
-      Stride of vector ``X``.
-
-   stridex 
-      Stride between different ``X`` vectors.
-
-   y
-      Buffer holding input/output vectors ``Y`` with size ``stridey`` * ``batch_size``.
-
-   incy 
-      Stride of vector ``Y``.
-   
-   stridey 
-      Stride between different ``Y`` vectors.
-
-   batch_size 
-      Specifies the number of ``axpy`` operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Output buffer, overwritten by ``batch_size`` ``axpy`` operations of the form 
-      ``alpha`` * ``X`` + ``Y``.
-
-
-.. _onemkl_blas_axpy_batch_usm:
-
-axpy_batch (USM Version)
-------------------------
-
-.. rubric:: Description
-
-The USM version of ``axpy_batch`` supports the group API and strided API. 
-
-The group API operation is defined as
-::
-   
-   idx = 0
-   for i = 0 … group_count – 1
-       for j = 0 … group_size – 1
-           X and Y are vectors in x[idx] and y[idx]
-           Y := alpha[i] * X + Y
-           idx := idx + 1
-       end for
-   end for
-
-The strided API operation is defined as
-::
-   
-   for i = 0 … batch_size – 1
-      X and Y are vectors at offset i * stridex, i * stridey in x and y
-      Y := alpha * X + Y
-   end for
-
-where:
-
-``alpha`` is scalar,
-
-``X`` and ``Y`` are vectors.
-
-For group API, ``x`` and ``y`` arrays contain the pointers for all the input vectors. 
-The total number of vectors in ``x`` and ``y`` is given by:
-
-.. math::
-
-      total\_batch\_count = \sum_{i=0}^{group\_count-1}group\_size[i]    
-
-For strided API, ``x`` and ``y`` arrays contain all the input vectors. 
-The total number of vectors in ``x`` and ``y`` is given by the ``batch_size`` parameter.
-
-**Group API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event axpy_batch(sycl::queue &queue,
-                              std::int64_t *n,
-                              T *alpha,
-                              const T **x,
-                              std::int64_t *incx,
-                              T **y,
-                              std::int64_t *incy,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event axpy_batch(sycl::queue &queue,
-                              std::int64_t *n,
-                              T *alpha,
-                              const T **x,
-                              std::int64_t *incx,
-                              T **y,
-                              std::int64_t *incy,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Array of ``group_count`` integers. ``n[i]`` specifies the number of elements in vectors ``X`` and ``Y`` for every vector in group ``i``.
-
-   alpha
-       Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor for vector ``X`` in group ``i``.
-
-   x
-      Array of pointers to input vectors ``X`` with size ``total_batch_count``.
-      The size of array allocated for the ``X`` vector of the group ``i`` must be at least (1 + (``n[i]`` – 1)*abs(``incx[i]``)). 
-      See :ref:`matrix-storage` for more details.
-
-   incx
-      Array of ``group_count`` integers. ``incx[i]`` specifies the stride of vector ``X`` in group ``i``.
- 
-   y
-      Array of pointers to input/output vectors ``Y`` with size ``total_batch_count``.
-      The size of array allocated for the ``Y`` vector of the group ``i`` must be at least (1 + (``n[i]`` – 1)*abs(``incy[i]``)). 
-      See :ref:`matrix-storage` for more details.
-
-   incy
-      Array of ``group_count`` integers. ``incy[i]`` specifies the stride of vector ``Y`` in group ``i``.
-
-   group_count
-      Number of groups. Must be at least 0.
-
-   group_size
-      Array of ``group_count`` integers. ``group_size[i]`` specifies the number of ``axpy`` operations in group ``i``. 
-      Each element in ``group_size`` must be at least 0.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Array of pointers holding the ``Y`` vectors, overwritten by ``total_batch_count`` ``axpy`` operations of the form 
-      ``alpha`` * ``X`` + ``Y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event axpy_batch(sycl::queue &queue,
-                              std::int64_t n,
-                              T alpha,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T *y,
-                              std::int64_t incy,
-                              std::int64_t stridey,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event axpy_batch(sycl::queue &queue,
-                              std::int64_t n,
-                              T alpha,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T *y,
-                              std::int64_t incy,
-                              std::int64_t stridey,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in ``X`` and ``Y``.
-
-   alpha
-       Specifies the scalar ``alpha``.
-
-   x
-      Pointer to input vectors ``X`` with size ``stridex`` * ``batch_size``.
-
-   incx 
-      Stride of vector ``X``.
-   
-   stridex 
-      Stride between different ``X`` vectors.
-
-   y
-      Pointer to input/output vectors ``Y`` with size ``stridey`` * ``batch_size``.
-
-   incy 
-      Stride of vector ``Y``.
-   
-   stridey 
-      Stride between different ``Y`` vectors.
-
-   batch_size 
-      Specifies the number of ``axpy`` operations to perform.
-  
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Output vectors, overwritten by ``batch_size`` ``axpy`` operations of the form 
-      ``alpha`` * ``X`` + ``Y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/blas-level-1-routines.rst b/docs/domains/blas/blas-level-1-routines.rst
deleted file mode 100644
index c96c2d54c..000000000
--- a/docs/domains/blas/blas-level-1-routines.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-.. _blas-level-1-routines:
-
-BLAS Level 1 Routines
-=====================
-
-
-.. container::
-
-
-   BLAS Level 1 includes routines which perform
-   vector-vector operations as described in the following table. 
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routines
-           -     Description     
-         * -     :ref:`onemkl_blas_asum`   
-           -     Sum of vector magnitudes      
-         * -     :ref:`onemkl_blas_axpy`   
-           -     Scalar-vector product      
-         * -     :ref:`onemkl_blas_copy`   
-           -     Copy vector      
-         * -     :ref:`onemkl_blas_dot`   
-           -     Dot product      
-         * -     :ref:`onemkl_blas_sdsdot`   
-           -     Dot product with double precision      
-         * -     :ref:`onemkl_blas_dotc`   
-           -     Dot product conjugated      
-         * -     :ref:`onemkl_blas_dotu`
-           -     Dot product unconjugated      
-         * -     :ref:`onemkl_blas_nrm2`   
-           -     Vector 2-norm (Euclidean norm)      
-         * -     :ref:`onemkl_blas_rot`
-           -     Plane rotation of points      
-         * -     :ref:`onemkl_blas_rotg`   
-           -     Generate Givens rotation of points      
-         * -     :ref:`onemkl_blas_rotm`   
-           -     Modified Givens plane rotation of points           
-         * -     :ref:`onemkl_blas_rotmg`  
-           -     Generate modified Givens plane rotation of points           
-         * -     :ref:`onemkl_blas_scal`
-           -     Vector-scalar product      
-         * -     :ref:`onemkl_blas_swap`   
-           -     Vector-vector swap      
-         * -     :ref:`onemkl_blas_iamax`   
-           -     Index of the maximum absolute value element of a vector     
-         * -     :ref:`onemkl_blas_iamin`   
-           -     Index of the minimum absolute value element of a vector     
-
-.. toctree::
-    :hidden:
-
-    asum
-    axpy
-    copy
-    dot
-    sdsdot
-    dotc
-    dotu
-    nrm2
-    rot
-    rotg
-    rotm
-    rotmg
-    scal
-    swap
-    iamax
-    iamin
-
-
-**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas-level-2-routines.rst b/docs/domains/blas/blas-level-2-routines.rst
deleted file mode 100644
index 427acbc9b..000000000
--- a/docs/domains/blas/blas-level-2-routines.rst
+++ /dev/null
@@ -1,105 +0,0 @@
-.. _blas-level-2-routines:
-
-BLAS Level 2 Routines
-=====================
-
-
-.. container::
-
-
-   BLAS Level 2 includes routines which perform
-   matrix-vector operations as described in the following table. 
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routines
-           -     Description  
-         * -     :ref:`onemkl_blas_gbmv`   
-           -     Matrix-vector product using a general band matrix         
-         * -     :ref:`onemkl_blas_gemv`   
-           -     Matrix-vector product using a general matrix     
-         * -     :ref:`onemkl_blas_ger`   
-           -     Rank-1 update of a general matrix     
-         * -     :ref:`onemkl_blas_gerc`   
-           -     Rank-1 update of a conjugated general matrix     
-         * -     :ref:`onemkl_blas_geru`   
-           -     Rank-1 update of a general matrix, unconjugated          
-         * -     :ref:`onemkl_blas_hbmv`   
-           -     Matrix-vector product using a Hermitian band matrix          
-         * -     :ref:`onemkl_blas_hemv`
-           -     Matrix-vector product using a Hermitian matrix          
-         * -     :ref:`onemkl_blas_her`   
-           -     Rank-1 update of a Hermitian matrix     
-         * -     :ref:`onemkl_blas_her2`   
-           -     Rank-2 update of a Hermitian matrix     
-         * -     :ref:`onemkl_blas_hpmv`   
-           -     Matrix-vector product using a Hermitian packed matrix          
-         * -     :ref:`onemkl_blas_hpr`   
-           -     Rank-1 update of a Hermitian packed matrix     
-         * -     :ref:`onemkl_blas_hpr2`   
-           -     Rank-2 update of a Hermitian packed matrix     
-         * -     :ref:`onemkl_blas_sbmv`   
-           -     Matrix-vector product using a symmetric band matrix          
-         * -     :ref:`onemkl_blas_spmv`   
-           -     Matrix-vector product using a symmetric packed matrix          
-         * -     :ref:`onemkl_blas_spr`   
-           -     Rank-1 update of a symmetric packed matrix     
-         * -     :ref:`onemkl_blas_spr2`   
-           -     Rank-2 update of a symmetric packed matrix     
-         * -     :ref:`onemkl_blas_symv`   
-           -     Matrix-vector product using a symmetric matrix          
-         * -     :ref:`onemkl_blas_syr`   
-           -     Rank-1 update of a symmetric matrix     
-         * -     :ref:`onemkl_blas_syr2`   
-           -     Rank-2 update of a symmetric matrix     
-         * -     :ref:`onemkl_blas_tbmv`   
-           -     Matrix-vector product using a triangular band matrix          
-         * -     :ref:`onemkl_blas_tbsv`   
-           -     Solution of a linear system of equations with a triangular band matrix    
-         * -     :ref:`onemkl_blas_tpmv`   
-           -     Matrix-vector product using a triangular packed matrix          
-         * -     :ref:`onemkl_blas_tpsv`   
-           -     Solution of a linear system of equations with a triangular packed matrix    
-         * -     :ref:`onemkl_blas_trmv`   
-           -     Matrix-vector product using a triangular matrix          
-         * -     :ref:`onemkl_blas_trsv`   
-           -     Solution of a linear system of equations with a triangular matrix    
-
-
-
-
-.. toctree::
-    :hidden:
-
-    gbmv
-    gemv
-    ger
-    gerc
-    geru
-    hbmv
-    hemv
-    her
-    her2
-    hpmv
-    hpr
-    hpr2
-    sbmv
-    spmv
-    spr
-    spr2
-    symv
-    syr
-    syr2
-    tbmv
-    tbsv
-    tpmv
-    tpsv
-    trmv
-    trsv
-
-**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas-level-3-routines.rst b/docs/domains/blas/blas-level-3-routines.rst
deleted file mode 100644
index bb7f3f4d6..000000000
--- a/docs/domains/blas/blas-level-3-routines.rst
+++ /dev/null
@@ -1,55 +0,0 @@
-.. _blas-level-3-routines:
-
-BLAS Level 3 Routines
-=====================
-
-
-.. container::
-
-   BLAS Level 3 includes routines which perform
-   matrix-matrix operations as described in the following table. 
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routines
-           -     Description     
-         * -     :ref:`onemkl_blas_gemm`   
-           -     Computes a matrix-matrix product with general matrices.   
-         * -     :ref:`onemkl_blas_hemm`   
-           -     Computes a matrix-matrix product where one input matrix is Hermitian and one is general.   
-         * -     :ref:`onemkl_blas_herk`   
-           -     Performs a Hermitian rank-k update.    
-         * -     :ref:`onemkl_blas_her2k`   
-           -     Performs a Hermitian rank-2k update.    
-         * -     :ref:`onemkl_blas_symm`   
-           -     Computes a matrix-matrix product where one input matrix is symmetric and one matrix is general.   
-         * -     :ref:`onemkl_blas_syrk`   
-           -     Performs a symmetric rank-k update.    
-         * -     :ref:`onemkl_blas_syr2k`   
-           -     Performs a symmetric rank-2k update.    
-         * -     :ref:`onemkl_blas_trmm`   
-           -     Computes a matrix-matrix product where one input matrix is triangular and one input matrix is general.   
-         * -     :ref:`onemkl_blas_trsm`   
-           -     Solves a triangular matrix equation (forward or backward solve).   
-
-
-
-.. toctree::
-    :hidden:
-
-    gemm
-    hemm
-    herk
-    her2k
-    symm
-    syrk
-    syr2k
-    trmm
-    trsm
-
-**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas-like-extensions.rst b/docs/domains/blas/blas-like-extensions.rst
deleted file mode 100644
index f447e2f9e..000000000
--- a/docs/domains/blas/blas-like-extensions.rst
+++ /dev/null
@@ -1,55 +0,0 @@
-.. _blas-like-extensions:
-
-BLAS-like Extensions
-====================
-
-
-.. container::
-
-
-   oneAPI Math Kernel Library DPC++ provides additional routines to
-   extend the functionality of the BLAS routines. These include routines
-   to compute many independent vector-vector and matrix-matrix operations.
-
-   The following table lists the BLAS-like extensions with their descriptions.
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routines
-           -     Description     
-         * -     :ref:`onemkl_blas_axpy_batch`   
-           -     Computes groups of vector-scalar products added to a vector.
-         * -     :ref:`onemkl_blas_gemm_batch`   
-           -     Computes groups of matrix-matrix products with general matrices.   
-         * -     :ref:`onemkl_blas_trsm_batch`   
-           -     Solves a triangular matrix equation for a group of matrices.   
-         * -     :ref:`onemkl_blas_gemmt`   
-           -     Computes a matrix-matrix product with general matrices, but updates
-                 only the upper or lower triangular part of the result matrix.
-         * -     :ref:`onemkl_blas_gemm_bias`   
-           -     Computes a matrix-matrix product using general integer matrices with bias
- 
-
-
-
-
-.. toctree::
-    :hidden:
-
-    axpy_batch
-    axpby
-    copy_batch
-    dgmm_batch
-    gemm_batch
-    gemv_batch
-    syrk_batch
-    trsm_batch
-    gemmt
-    gemm_bias
-
-**Parent topic:** :ref:`onemkl_blas`
diff --git a/docs/domains/blas/blas.rst b/docs/domains/blas/blas.rst
deleted file mode 100644
index 50411efb8..000000000
--- a/docs/domains/blas/blas.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. _onemkl_blas:
-
-BLAS Routines
-+++++++++++++
-
-oneMKL provides DPC++ interfaces to the Basic Linear Algebra Subprograms (BLAS) routines (Level 1, Level 2, Level 3), as well as several BLAS-like extension routines.
-
-.. toctree::
-    :maxdepth: 1
-
-    blas-level-1-routines.rst
-    blas-level-2-routines.rst
-    blas-level-3-routines.rst
-    blas-like-extensions.rst
-
-
-**Parent topic:** :ref:`onemkl_dense_linear_algebra`
diff --git a/docs/domains/blas/copy.rst b/docs/domains/blas/copy.rst
deleted file mode 100644
index e09db618e..000000000
--- a/docs/domains/blas/copy.rst
+++ /dev/null
@@ -1,159 +0,0 @@
-.. _onemkl_blas_copy:
-
-copy
-====
-
-Copies a vector to another vector.
-
-.. _onemkl_blas_copy_description:
-
-.. rubric:: Description
-
-The ``copy`` routines copy one vector to another:
-
-.. math::
-      
-      y \leftarrow  x
-
-where ``x`` and ``y`` are vectors of n elements.
-
-``copy`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-
-.. _onemkl_blas_copy_buffer:
-
-copy (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void copy(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void copy(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at least
-      (1 + (``n`` – 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_copy_usm:
-
-copy (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event copy(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event copy(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-   
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Pointer to the input vector ``x``. The array holding the vector
-      ``x`` must be of size at least (1 + (``n`` – 1)*abs(``incx``)). See
-      :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/copy_batch.rst b/docs/domains/blas/copy_batch.rst
deleted file mode 100644
index b02d71c56..000000000
--- a/docs/domains/blas/copy_batch.rst
+++ /dev/null
@@ -1,328 +0,0 @@
-.. _onemkl_blas_copy_batch:
-
-copy_batch
-==========
-
-Computes a group of ``copy`` operations.
-
-.. _onemkl_blas_copy_batch_description:
-
-.. rubric:: Description
-
-The ``copy_batch`` routines are batched versions of :ref:`onemkl_blas_copy`, performing
-multiple ``copy`` operations in a single call. Each ``copy`` 
-operation copies one vector to another.
-   
-``copy_batch`` supports the following precisions for data.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_copy_batch_buffer:
-
-copy_batch (Buffer Version)
----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``copy_batch`` supports only the strided API. 
-
-The strided API operation is defined as:
-::
-  
-   for i = 0 … batch_size – 1
-      X and Y are vectors at offset i * stridex, i * stridey in x and y
-      Y := X
-   end for
-
-where:
-
-``X`` and ``Y`` are vectors.
-   
-**Strided API**
-
-.. rubric:: Syntax
- 
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void copy_batch(sycl::queue &queue,
-                       std::int64_t n,
-                       sycl::buffer<T,
-                       1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       sycl::buffer<T,
-                       1> &y,
-                       std::int64_t incy,
-                       std::int64_t stridey,
-                       std::int64_t batch_size)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void copy_batch(sycl::queue &queue,
-                       std::int64_t n,
-                       sycl::buffer<T,
-                       1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       sycl::buffer<T,
-                       1> &y,
-                       std::int64_t incy,
-                       std::int64_t stridey,
-                       std::int64_t batch_size)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in ``X`` and ``Y``.
-
-   x
-      Buffer holding input vectors ``X`` with size ``stridex`` * ``batch_size``.
-
-   incx 
-      Stride of vector ``X``.
-
-   stridex 
-      Stride between different ``X`` vectors.
-
-   y
-      Buffer holding input/output vectors ``Y`` with size ``stridey`` * ``batch_size``.
-
-   incy 
-      Stride of vector ``Y``.
-   
-   stridey 
-      Stride between different ``Y`` vectors.
-
-   batch_size 
-      Specifies the number of ``copy`` operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Output buffer, overwritten by ``batch_size`` ``copy`` operations.
-
-
-.. _onemkl_blas_copy_batch_usm:
-
-copy_batch (USM Version)
-------------------------
-
-.. rubric:: Description
-
-The USM version of ``copy_batch`` supports the group API and strided API. 
-
-The group API operation is defined as
-::
-   
-   idx = 0
-   for i = 0 … group_count – 1
-       for j = 0 … group_size – 1
-           X and Y are vectors in x[idx] and y[idx]
-           Y := X
-           idx := idx + 1
-       end for
-   end for
-
-The strided API operation is defined as
-::
-   
-   for i = 0 … batch_size – 1
-      X and Y are vectors at offset i * stridex, i * stridey in x and y
-      Y := X
-   end for
-
-where:
-
-``X`` and ``Y`` are vectors.
-
-For group API, ``x`` and ``y`` arrays contain the pointers for all the input vectors. 
-The total number of vectors in ``x`` and ``y`` is given by:
-
-.. math::
-
-      total\_batch\_count = \sum_{i=0}^{group\_count-1}group\_size[i]    
-
-For strided API, ``x`` and ``y`` arrays contain all the input vectors. 
-The total number of vectors in ``x`` and ``y`` is given by the ``batch_size`` parameter.
-
-**Group API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event copy_batch(sycl::queue &queue,
-                              std::int64_t *n,
-                              const T **x,
-                              std::int64_t *incx,
-                              T **y,
-                              std::int64_t *incy,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event copy_batch(sycl::queue &queue,
-                              std::int64_t *n,
-                              const T **x,
-                              std::int64_t *incx,
-                              T **y,
-                              std::int64_t *incy,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Array of ``group_count`` integers. ``n[i]`` specifies the number of elements in vectors ``X`` and ``Y`` for every vector in group ``i``.
-
-   x
-      Array of pointers to input vectors ``X`` with size ``total_batch_count``.
-      The size of array allocated for the ``X`` vector of the group ``i`` must be at least (1 + (``n[i]`` – 1)*abs(``incx[i]``)). 
-      See :ref:`matrix-storage` for more details.
-
-   incx
-      Array of ``group_count`` integers. ``incx[i]`` specifies the stride of vector ``X`` in group ``i``.
- 
-   y
-      Array of pointers to input/output vectors ``Y`` with size ``total_batch_count``.
-      The size of array allocated for the ``Y`` vector of the group ``i`` must be at least (1 + (``n[i]`` – 1)*abs(``incy[i]``)). 
-      See :ref:`matrix-storage` for more details.
-
-   incy
-      Array of ``group_count`` integers. ``incy[i]`` specifies the stride of vector ``Y`` in group ``i``.
-
-   group_count
-      Number of groups. Must be at least 0.
-
-   group_size
-      Array of ``group_count`` integers. ``group_size[i]`` specifies the number of ``copy`` operations in group ``i``. 
-      Each element in ``group_size`` must be at least 0.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Array of pointers holding the ``Y`` vectors, overwritten by ``total_batch_count`` ``copy`` operations.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event copy_batch(sycl::queue &queue,
-                              std::int64_t n,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T *y,
-                              std::int64_t incy,
-                              std::int64_t stridey,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event copy_batch(sycl::queue &queue,
-                              std::int64_t n,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T *y,
-                              std::int64_t incy,
-                              std::int64_t stridey,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in ``X`` and ``Y``.
-
-   x
-      Pointer to input vectors ``X`` with size ``stridex`` * ``batch_size``.
-
-   incx 
-      Stride of vector ``X``.
-   
-   stridex 
-      Stride between different ``X`` vectors.
-
-   y
-      Pointer to input/output vectors ``Y`` with size ``stridey`` * ``batch_size``.
-
-   incy 
-      Stride of vector ``Y``.
-   
-   stridey 
-      Stride between different ``Y`` vectors.
-
-   batch_size 
-      Specifies the number of ``copy`` operations to perform.
-  
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Output vectors, overwritten by ``batch_size`` ``copy`` operations.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/dgmm_batch.rst b/docs/domains/blas/dgmm_batch.rst
deleted file mode 100644
index 25eaace9f..000000000
--- a/docs/domains/blas/dgmm_batch.rst
+++ /dev/null
@@ -1,462 +0,0 @@
-.. _onemkl_blas_dgmm_batch:
-
-dgmm_batch
-==========
-
-Computes a group of ``dgmm`` operations.
-
-.. _onemkl_blas_dgmm_batch_description:
-
-.. rubric:: Description
-
-The ``dgmm_batch`` routines perform
-multiple diagonal matrix-matrix product operations in a single call.
-   
-``dgmm_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_dgmm_batch_buffer:
-
-dgmm_batch (Buffer Version)
----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``dgmm_batch`` supports only the strided API. 
-
-The strided API operation is defined as:
-::
-
-   for i = 0 … batch_size – 1
-       A and C are matrices at offset i * stridea in a, i * stridec in c.
-       X is a vector at offset i * stridex in x
-       C := diag(X) * A or C := A * diag(X)
-   end for
-
-where:
-
-``A`` is a matrix,
-
-``X`` is a diagonal matrix stored as a vector
-
-The ``a`` and ``x`` buffers contain all the input matrices. The stride 
-between matrices is given by the stride parameter. The total number
-of matrices in ``a`` and ``x`` buffers is given by the ``batch_size`` parameter.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void dgmm_batch(sycl::queue &queue,
-                       oneapi::mkl::side left_right,
-                       std::int64_t m,
-                       std::int64_t n,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       sycl::buffer<T,1> &c,
-                       std::int64_t ldc,
-                       std::int64_t stridec,
-                       std::int64_t batch_size)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void dgmm_batch(sycl::queue &queue,
-                       oneapi::mkl::side left_right,
-                       std::int64_t m,
-                       std::int64_t n,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       sycl::buffer<T,1> &c,
-                       std::int64_t ldc,
-                       std::int64_t stridec,
-                       std::int64_t batch_size)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies the position of the diagonal matrix in the product.
-      See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of matrices ``A`` and ``C``. Must be at least zero.
-
-   n
-      Number of columns of matrices ``A`` and ``C``. Must be at least zero.
-
-   a
-
-      Buffer holding the input matrices ``A`` with size ``stridea`` *
-      ``batch_size``.  Must be of size at least ``lda`` * ``j`` +
-      ``stridea`` * (``batch_size`` - 1) where ``j`` is ``n`` if column major
-      layout is used or ``m`` if row major layout is used.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive
-      and at least ``m`` if column major layout is used or at least
-      ``n`` if row major layout is used.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   x
-      Buffer holding the input matrices ``X`` with size ``stridex`` *
-      ``batch_size``.  Must be of size at least 
-      (1 + (``len`` - 1)*abs(``incx``)) + ``stridex`` * (``batch_size`` - 1) 
-      where ``len`` is ``n`` if the diagonal matrix is on the right 
-      of the product or ``m`` otherwise.
-
-   incx
-      Stride between two consecutive elements of the ``x`` vectors.
-
-   stridex
-      Stride between different ``X`` vectors, must be at least 0.
-
-   c
-      Buffer holding input/output matrices ``C`` with size ``stridec`` * ``batch_size``.
-
-   ldc
-      The leading dimension of the matrices ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   stridec
-      Stride between different ``C`` matrices. Must be at least
-      ``ldc`` * ``n`` if column major layout is used or ``ldc`` * ``m`` if row
-      major layout is used.
-
-   batch_size
-      Specifies the number of diagonal matrix-matrix product operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output overwritten by ``batch_size`` diagonal matrix-matrix product
-      operations.
-
-
-.. _onemkl_blas_dgmm_batch_usm:
-
-dgmm_batch (USM Version)
----------------------------
-
-.. rubric:: Description
-
-The USM version of ``dgmm_batch`` supports the group API and strided API. 
-
-The group API operation is defined as:
-::
-
-   idx = 0
-   for i = 0 … group_count – 1
-       for j = 0 … group_size – 1
-           a and c are matrices of size mxn at position idx in a_array and c_array
-           x is a vector of size m or n depending on left_right, at position idx in x_array
-           if (left_right == oneapi::mkl::side::left)
-               c := diag(x) * a
-           else
-               c := a * diag(x)
-           idx := idx + 1
-       end for
-   end for
-
-The strided API operation is defined as
-::
-
-   for i = 0 … batch_size – 1
-       A and C are matrices at offset i * stridea in a, i * stridec in c.
-       X is a vector at offset i * stridex in x
-       C := diag(X) * A or C := A * diag(X)
-   end for
-
-where:
-
-``A`` is a matrix,
-
-``X`` is a diagonal matrix stored as a vector
-
-The ``a`` and ``x`` buffers contain all the input matrices. The stride 
-between matrices is given by the stride parameter. The total number
-of matrices in ``a`` and ``x`` buffers is given by the ``batch_size`` parameter.
- 
-For group API, ``a`` and ``x`` arrays contain the pointers for all the input matrices. 
-The total number of matrices in ``a`` and ``x`` are given by: 
-
-.. math::
-
-      total\_batch\_count = \sum_{i=0}^{group\_count-1}group\_size[i]    
- 
-For strided API, ``a`` and ``x`` arrays contain all the input matrices. The total number of matrices 
-in ``a`` and ``x`` are given by the ``batch_size`` parameter.  
-   
-**Group API**
-
-.. rubric:: Syntax
-   
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event dgmm_batch(sycl::queue &queue,
-                              oneapi::mkl::side *left_right,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              const T **a,
-                              std::int64_t *lda,
-                              const T **x,
-                              std::int64_t *incx,
-                              T **c,
-                              std::int64_t *ldc,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event dgmm_batch(sycl::queue &queue,
-                              oneapi::mkl::side *left_right,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              const T **a,
-                              std::int64_t *lda,
-                              const T **x,
-                              std::int64_t *incx,
-                              T **c,
-                              std::int64_t *ldc,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies the position of the diagonal matrix in the product.
-      See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Array of ``group_count`` integers. ``m[i]`` specifies the
-      number of rows of ``A`` for every matrix in group ``i``. All entries must be at least zero.
-
-   n
-      Array of ``group_count`` integers. ``n[i]`` specifies the
-      number of columns of ``A`` for every matrix in group ``i``. All entries must be at least zero.
-
-   a
-      Array of pointers to input matrices ``A`` with size
-      ``total_batch_count``.  Must be of size at least ``lda[i]`` * ``n[i]`` if
-      column major layout is used or at least ``lda[i]`` * ``m[i]`` if row major
-      layout is used.
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      Array of ``group_count`` integers. ``lda[i]`` specifies the
-      leading dimension of ``A`` for every matrix in group ``i``. All
-      entries must be positive and at least ``m[i]`` if column major
-      layout is used or at least ``n[i]`` if row major layout is used.
-
-   x
-      Array of pointers to input vectors ``X`` with size
-      ``total_batch_count``.  Must be of size at least (1 + (``len[i]`` –
-      1)*abs(``incx[i]``)) where ``len[i]`` is ``n[i]`` if the diagonal matrix is on the
-      right of the product or ``m[i]`` otherwise.
-      See :ref:`matrix-storage` for more details.
-
-   incx
-      Array of ``group_count`` integers. ``incx[i]`` specifies the
-      stride of ``x`` for every vector in group ``i``. All entries
-      must be positive.
-   c
-      Array of pointers to input/output matrices ``C`` with size ``total_batch_count``. 
-      Must be of size at least
-      ``ldc[i]`` * ``n[i]``
-      if column major layout is used or at least
-      ``ldc[i]`` * ``m[i]``
-      if row major layout is used.
-      See :ref:`matrix-storage` for more details.
-
-   ldc
-      Array of ``group_count`` integers. ``ldc[i]`` specifies the
-      leading dimension of ``C`` for every matrix in group ``i``.  All
-      entries must be positive and ``ldc[i]`` must be at least
-      ``m[i]`` if column major layout is used to store matrices or at
-      least ``n[i]`` if row major layout is used to store matrices.
-
-   group_count
-      Specifies the number of groups. Must be at least 0.
-
-   group_size
-      Array of ``group_count`` integers. ``group_size[i]`` specifies the
-      number of diagonal matrix-matrix product operations in group ``i``.
-      All entries must be at least 0.
-
-   dependencies
-         List of events to wait for before starting computation, if any.
-         If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output overwritten by ``total_batch_count`` diagonal matrix-matrix product
-      operations.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event dgmm_batch(sycl::queue &queue,
-                              oneapi::mkl::side left_right,
-                              std::int64_t m,
-                              std::int64_t n,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T *c,
-                              std::int64_t ldc,
-                              std::int64_t stridec,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event dgmm_batch(sycl::queue &queue,
-                              oneapi::mkl::side left_right,
-                              std::int64_t m,
-                              std::int64_t n,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T *c,
-                              std::int64_t ldc,
-                              std::int64_t stridec,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies the position of the diagonal matrix in the product.
-      See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   a
-      Pointer to input matrices ``A`` with size ``stridea`` *
-      ``batch_size``.  Must be of size at least
-      ``lda`` * ``k`` + ``stridea`` * (``batch_size`` - 1) 
-      where ``k`` is ``n`` if column major layout is used 
-      or ``m`` if row major layout is used.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive
-      and at least ``m`` if column major layout is used or at least
-      ``n`` if row major layout is used.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   x
-      Pointer to input matrices ``X`` with size ``stridex`` * ``batch_size``.
-      Must be of size at least
-      (1 + (``len`` - 1)*abs(``incx``)) + ``stridex`` * (``batch_size`` - 1)
-      where ``len`` is ``n`` if the diagonal matrix is on the right
-      of the product or ``m`` otherwise.
-
-   incx
-      Stride between two consecutive elements of the ``x`` vector.
-
-   stridex
-      Stride between different ``X`` vectors, must be at least 0.
-
-   c
-      Pointer to input/output matrices ``C`` with size ``stridec`` * ``batch_size``.
-
-   ldc
-      The leading dimension of the matrices ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   stridec
-      Stride between different ``C`` matrices. Must be at least
-      ``ldc`` * ``n`` if column major layout is used or 
-      ``ldc`` * ``m`` if row major layout is used.
-
-   batch_size
-      Specifies the number of diagonal matrix-matrix product operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output overwritten by ``batch_size`` diagonal matrix-matrix product
-      operations.
-
-.. container:: section
-      
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/dot.rst b/docs/domains/blas/dot.rst
deleted file mode 100644
index 8ae352889..000000000
--- a/docs/domains/blas/dot.rst
+++ /dev/null
@@ -1,182 +0,0 @@
-.. _onemkl_blas_dot:
-
-dot
-===
-
-Computes the dot product of two real vectors.
-
-.. _onemkl_blas_dot_description:
-
-.. rubric:: Description
-
-The ``dot`` routines perform a dot product between two vectors:
-
-.. math::
-
-   result = \sum_{i=1}^{n}X_iY_i 
-
-``dot`` supports the following precisions for data.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_res 
-      * -  ``float`` 
-        -  ``float`` 
-      * -  ``double`` 
-        -  ``double`` 
-      * -  ``float`` 
-        -  ``double`` 
-
-.. container:: Note
-
-   .. rubric:: Note
-      :class: NoteTipHead
-
-   For the mixed precision version (inputs are float while result is
-   double), the dot product is computed with double precision.
-
-.. _onemkl_blas_dot_buffer:
-
-dot (Buffer Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void dot(sycl::queue &queue,
-                std::int64_t n,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &y,
-                std::int64_t incy,
-                sycl::buffer<T_res,1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void dot(sycl::queue &queue,
-                std::int64_t n,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &y,
-                std::int64_t incy,
-                sycl::buffer<T_res,1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at least
-      (1 + (``n`` – 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at least
-      (1 + (``n`` – 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Buffer where the result (a scalar) will be stored.
-
-
-.. _onemkl_blas_dot_usm:
-
-dot (USM Version)
------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event dot(sycl::queue &queue,
-                       std::int64_t n,
-                       const T *x,
-                       std::int64_t incx,
-                       const T *y,
-                       std::int64_t incy,
-                       T_res *result,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event dot(sycl::queue &queue,
-                       std::int64_t n,
-                       const T *x,
-                       std::int64_t incx,
-                       const T *y,
-                       std::int64_t incy,
-                       T_res *result,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   x
-      Pointer to the input vector ``x``. The array holding the vector ``x``
-      must be of size at least (1 + (``n`` – 1)*abs(``incx``)). See
-      :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to the input vector ``y``. The array holding the vector ``y``
-      must be of size at least (1 + (``n`` – 1)*abs(``incy``)). See
-      :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Pointer to where the result (a scalar) will be stored.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/dotc.rst b/docs/domains/blas/dotc.rst
deleted file mode 100644
index d6746f124..000000000
--- a/docs/domains/blas/dotc.rst
+++ /dev/null
@@ -1,170 +0,0 @@
-.. _onemkl_blas_dotc:
-
-dotc
-====
-
-Computes the dot product of two complex vectors, conjugating the first vector.
-
-.. _onemkl_blas_dotc_description:
-
-.. rubric:: Description
-
-The ``dotc`` routines perform a dot product between two complex
-vectors, conjugating the first of them:
-
-.. math::
-
-   result = \sum_{i=1}^{n}\overline{X_i}Y_i 
-
-``dotc`` supports the following precisions for data.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_dotc_buffer:
-
-dotc (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void dotc(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void dotc(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      The number of elements in vectors ``x`` and ``y``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      The stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      The stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      The buffer where the result (a scalar) is stored.
-
-
-.. _onemkl_blas_dotc_usm:
-
-dotc (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event dotc(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event dotc(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      The number of elements in vectors ``x`` and ``y``.
-
-   x
-      Pointer to input vector ``x``. The array holding the input
-      vector ``x`` must be of size at least (1 + (``n`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      The stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding the input
-      vector ``y`` must be of size at least (1 + (``n`` -
-      1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      The stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      The pointer to where the result (a scalar) is stored.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/dotu.rst b/docs/domains/blas/dotu.rst
deleted file mode 100644
index d936815ae..000000000
--- a/docs/domains/blas/dotu.rst
+++ /dev/null
@@ -1,170 +0,0 @@
-.. _onemkl_blas_dotu:
-
-dotu
-====
-
-Computes the dot product of two complex vectors.
-
-.. _onemkl_blas_dotu_description:
-
-.. rubric:: Description
-
-The ``dotu`` routines perform a dot product between two complex vectors:
-
-.. math::
-
-   result = \sum_{i=1}^{n}X_iY_i 
-
-``dotu`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_dotu_buffer:
-
-dotu (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void dotu(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void dotu(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Buffer where the result (a scalar) is stored.
-
-
-.. _onemkl_blas_dotu_usm:
-
-dotu (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event dotu(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event dotu(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   x
-      Pointer to the input vector ``x``. The array holding input
-      vector ``x`` must be of size at least (1 + (``n`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding input vector
-      ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Pointer to where the result (a scalar) is stored.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/gbmv.rst b/docs/domains/blas/gbmv.rst
deleted file mode 100644
index 366ce69f2..000000000
--- a/docs/domains/blas/gbmv.rst
+++ /dev/null
@@ -1,285 +0,0 @@
-.. _onemkl_blas_gbmv:
-
-gbmv
-====
-
-Computes a matrix-vector product with a general band matrix.
-
-.. _onemkl_blas_gbmv_description:
-
-.. rubric:: Description
-
-The ``gbmv`` routines compute a scalar-matrix-vector product and add
-the result to a scalar-vector product, with a general band matrix.
-The operation is defined as
-
-.. math::
-      
-      y \leftarrow alpha*op(A)*x + beta*y
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``m``-by-``n`` matrix with ``kl`` sub-diagonals and
-``ku`` super-diagonals,
-
-``x`` and ``y`` are vectors.
-
-``gbmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_gbmv_buffer:
-
-gbmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gbmv(sycl::queue &queue,
-                 onemkl::transpose trans,
-                 std::int64_t m,
-                 std::int64_t n,
-                 std::int64_t kl,
-                 std::int64_t ku,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gbmv(sycl::queue &queue,
-                 onemkl::transpose trans,
-                 std::int64_t m,
-                 std::int64_t n,
-                 std::int64_t kl,
-                 std::int64_t ku,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``.
-      See
-      :ref:`onemkl_datatypes` for more
-      details.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   kl
-      Number of sub-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   ku
-      Number of super-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least ``lda``\ \*\ ``n``
-      if column major layout is used or at least ``lda``\ \*\ ``m``
-      if row major layout is used. See :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``kl`` +
-      ``ku`` + 1), and positive.
-
-   x
-      Buffer holding input vector ``x``. The length ``len`` of vector
-      ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A`` is
-      transposed. The buffer must be of size at least (1 + (``len`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Buffer holding input/output vector ``y``. The length ``len`` of
-      vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if
-      ``A`` is transposed. The buffer must be of size at least (1 +
-      (``len`` - 1)*abs(``incy``)) where ``len`` is this length. See
-      :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_gbmv_usm:
-
-gbmv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gbmv(sycl::queue &queue,
-                        onemkl::transpose trans,
-                        std::int64_t m,
-                        std::int64_t n,
-                        std::int64_t kl,
-                        std::int64_t ku,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gbmv(sycl::queue &queue,
-                        onemkl::transpose trans,
-                        std::int64_t m,
-                        std::int64_t n,
-                        std::int64_t kl,
-                        std::int64_t ku,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See
-      :ref:`onemkl_datatypes` for
-      more details.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   kl
-      Number of sub-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   ku
-      Number of super-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n`` if column
-      major layout is used or at least ``lda``\ \*\ ``m`` if row
-      major layout is used. See :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``kl`` +
-      ``ku`` + 1), and positive.
-
-   x
-      Pointer to input vector ``x``. The length ``len`` of vector
-      ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A``
-      is transposed. The array holding input vector ``x`` must be of
-      size at least (1 + (``len`` - 1)*abs(``incx``)). See 
-      :ref:`matrix-storage` for more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Pointer to input/output vector ``y``. The length ``len`` of
-      vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if
-      ``A`` is transposed. The array holding input/output vector
-      ``y`` must be of size at least (1 + (``len`` -
-      1)*abs(``incy``)) where ``len`` is this length. 
-      See :ref:`matrix-storage` for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/gemm.rst b/docs/domains/blas/gemm.rst
deleted file mode 100644
index e9e2c89ea..000000000
--- a/docs/domains/blas/gemm.rst
+++ /dev/null
@@ -1,455 +0,0 @@
-.. _onemkl_blas_gemm:
-
-gemm
-====
-
-Computes a matrix-matrix product with general matrices.
-
-.. _onemkl_blas_gemm_description:
-
-.. rubric:: Description
-
-The ``gemm`` routines compute a scalar-matrix-matrix product and add the
-result to a scalar-matrix product, with general matrices. The
-operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*op(A)*op(B) + beta*C
-
-where:
-
-op(``X``) is one of op(``X``) = ``X``, or op(``X``) = ``X``\ :sup:`T`, or
-op(``X``) = ``X``\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A``, ``B`` and ``C`` are matrices,
-
-``op(A)`` is an ``m``-by-``k`` matrix,
-
-``op(B)`` is a ``k``-by-``n`` matrix,
-
-``C`` is an ``m``-by-``n`` matrix.
-
-``gemm`` supports the following precisions.
-
-   .. list-table:: 
-     :header-rows: 1
-
-     * -  Ts 
-       -  Ta 
-       -  Tb 
-       -  Tc 
-     * -  ``float`` 
-       -  ``half`` 
-       -  ``half`` 
-       -  ``float`` 
-     * -  ``half`` 
-       -  ``half`` 
-       -  ``half`` 
-       -  ``half`` 
-     * -  ``float``
-       -  ``bfloat16``
-       -  ``bfloat16``
-       -  ``float``
-     * -  ``float`` 
-       -  ``float`` 
-       -  ``float`` 
-       -  ``float`` 
-     * -  ``double`` 
-       -  ``double`` 
-       -  ``double`` 
-       -  ``double`` 
-     * -  ``std::complex<float>`` 
-       -  ``std::complex<float>`` 
-       -  ``std::complex<float>`` 
-       -  ``std::complex<float>`` 
-     * -  ``std::complex<double>`` 
-       -  ``std::complex<double>`` 
-       -  ``std::complex<double>`` 
-       -  ``std::complex<double>`` 
-
-.. _onemkl_blas_gemm_buffer:
-
-gemm (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gemm(sycl::queue &queue,
-                 onemkl::transpose transa,
-                 onemkl::transpose transb,
-                 std::int64_t m,
-                 std::int64_t n,
-                 std::int64_t k,
-                 Ts alpha,
-                 sycl::buffer<Ta,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<Tb,1> &b,
-                 std::int64_t ldb,
-                 Ts beta,
-                 sycl::buffer<Tc,1> &c,
-                 std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gemm(sycl::queue &queue,
-                 onemkl::transpose transa,
-                 onemkl::transpose transb,
-                 std::int64_t m,
-                 std::int64_t n,
-                 std::int64_t k,
-                 Ts alpha,
-                 sycl::buffer<Ta,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<Tb,1> &b,
-                 std::int64_t ldb,
-                 Ts beta,
-                 sycl::buffer<Tc,1> &c,
-                 std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   transa
-      Specifies the form of op(``A``), the transposition operation
-      applied to ``A``.
-
-   transb
-      Specifies the form of op(``B``), the transposition operation
-      applied to ``B``.
-
-   m
-      Specifies the number of rows of the matrix op(``A``) and of the
-      matrix ``C``. The value of m must be at least zero.
-
-   n
-      Specifies the number of columns of the matrix op(``B``) and the
-      number of columns of the matrix ``C``. The value of n must be at
-      least zero.
-
-   k
-      Specifies the number of columns of the matrix op(``A``) and the
-      number of rows of the matrix op(``B``). The value of k must be at
-      least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      The buffer holding the input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-         * - Row major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``m``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``m``.
-             
-   b
-      The buffer holding the input matrix ``B``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-         * - Row major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-      
-      See :ref:`matrix-storage` for more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      The buffer holding the input/output matrix ``C``. It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      The buffer, which is overwritten by
-      ``alpha``\ \*\ op(``A``)*op(``B``) + ``beta``\ \*\ ``C``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized before
-   calling ``gemm``.
-
-
-.. _onemkl_blas_gemm_usm:
-
-gemm (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemm(sycl::queue &queue,
-                        onemkl::transpose transa,
-                        onemkl::transpose transb,
-                        std::int64_t m,
-                        std::int64_t n,
-                        std::int64_t k,
-                        Ts alpha,
-                        const Ta *a,
-                        std::int64_t lda,
-                        const Tb *b,
-                        std::int64_t ldb,
-                        Ts beta,
-                        Tc *c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemm(sycl::queue &queue,
-                        onemkl::transpose transa,
-                        onemkl::transpose transb,
-                        std::int64_t m,
-                        std::int64_t n,
-                        std::int64_t k,
-                        Ts alpha,
-                        const Ta *a,
-                        std::int64_t lda,
-                        const Tb *b,
-                        std::int64_t ldb,
-                        Ts beta,
-                        Tc *c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   transa
-      Specifies the form of op(``A``), the transposition operation
-      applied to ``A``.
-
-
-   transb
-      Specifies the form of op(``B``), the transposition operation
-      applied to ``B``.
-
-
-   m
-      Specifies the number of rows of the matrix op(``A``) and of the
-      matrix ``C``. The value of m must be at least zero.
-
-
-   n
-      Specifies the number of columns of the matrix op(``B``) and the
-      number of columns of the matrix ``C``. The value of n must be
-      at least zero.
-
-
-   k
-      Specifies the number of columns of the matrix op(``A``) and the
-      number of rows of the matrix op(``B``). The value of k must be
-      at least zero.
-
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-
-   a
-      Pointer to input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-         * - Row major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-             
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``m``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``m``.
-             
-   b
-      Pointer to input matrix ``B``.
-      
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-         * - Row major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-   
-      See :ref:`matrix-storage` for more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      The pointer to input/output matrix ``C``. It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by
-      ``alpha``\ \*\ op(``A``)*op(``B``) + ``beta``\ \*\ ``C``.
- 
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized
-   before calling ``gemm``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/gemm_batch.rst b/docs/domains/blas/gemm_batch.rst
deleted file mode 100644
index e63129fdd..000000000
--- a/docs/domains/blas/gemm_batch.rst
+++ /dev/null
@@ -1,606 +0,0 @@
-.. _onemkl_blas_gemm_batch:
-
-gemm_batch
-==========
-
-Computes a group of ``gemm`` operations.
-
-.. _onemkl_blas_gemm_batch_description:
-
-.. rubric:: Description
-
-The ``gemm_batch`` routines are batched versions of :ref:`onemkl_blas_gemm`, performing
-multiple ``gemm`` operations in a single call. Each ``gemm`` 
-operation performs a matrix-matrix product with general matrices.
-   
-``gemm_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``half``
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_gemm_batch_buffer:
-
-gemm_batch (Buffer Version)
----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``gemm_batch`` supports only the strided API. 
-
-The strided API operation is defined as:
-::
-
-   for i = 0 … batch_size – 1
-       A, B and C are matrices at offset i * stridea, i * strideb, i * stridec in a, b and c.
-       C := alpha * op(A) * op(B) + beta * C
-   end for
-
-where:
-
-op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A``, ``B``, and ``C`` are matrices,
-
-op(``A``) is ``m`` x ``k``, op(``B``) is 
-``k`` x ``n``, and ``C`` is ``m`` x ``n``.
-
-The ``a``, ``b`` and ``c`` buffers contain all the input matrices. The stride 
-between matrices is given by the stride parameter. The total number
-of matrices in ``a``, ``b`` and ``c`` buffers is given by the ``batch_size`` parameter.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gemm_batch(sycl::queue &queue,
-                       onemkl::transpose transa,
-                       onemkl::transpose transb,
-                       std::int64_t m,
-                       std::int64_t n,
-                       std::int64_t k,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &b,
-                       std::int64_t ldb,
-                       std::int64_t strideb,
-                       T beta,
-                       sycl::buffer<T,1> &c,
-                       std::int64_t ldc,
-                       std::int64_t stridec,
-                       std::int64_t batch_size)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gemm_batch(sycl::queue &queue,
-                       onemkl::transpose transa,
-                       onemkl::transpose transb,
-                       std::int64_t m,
-                       std::int64_t n,
-                       std::int64_t k,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &b,
-                       std::int64_t ldb,
-                       std::int64_t strideb,
-                       T beta,
-                       sycl::buffer<T,1> &c,
-                       std::int64_t ldc,
-                       std::int64_t stridec,
-                       std::int64_t batch_size)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   transa
-      Specifies op(``A``) the transposition operation applied to the
-      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   transb
-      Specifies op(``B``) the transposition operation applied to the
-      matrices ``B``. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of op(``A``) and ``C``. Must be at least zero.
-
-
-   n
-      Number of columns of op(``B``) and ``C``. Must be at least zero.
-
-
-   k
-      Number of columns of op(``A``) and rows of op(``B``). Must be at
-      least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix products.
-
-   a
-      Buffer holding the input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``m``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``m``.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   b
-      Buffer holding the input matrices ``B`` with size ``strideb`` * ``batch_size``.
-
-   ldb
-      The leading dimension of the matrices ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-
-   strideb
-      Stride between different ``B`` matrices.
-
-   beta
-      Scaling factor for the matrices ``C``.
-
-   c
-      Buffer holding input/output matrices ``C`` with size ``stridec`` * ``batch_size``.
-
-   ldc
-      The leading dimension of the matrices ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   stridec
-      Stride between different ``C`` matrices. Must be at least
-      ``ldc`` * ``n``.
-
-   batch_size
-      Specifies the number of matrix multiply operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by ``batch_size`` matrix multiply
-      operations of the form ``alpha`` * op(``A``)*op(``B``) + ``beta`` * ``C``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized before
-   calling ``gemm_batch``.
-
-
-.. _onemkl_blas_gemm_batch_usm:
-
-gemm_batch (USM Version)
----------------------------
-
-.. rubric:: Description
-
-The USM version of ``gemm_batch`` supports the group API and strided API.
-
-The group API operation is defined as:
-::
-
-   idx = 0
-   for i = 0 … group_count - 1
-       for j = 0 … group_size[i] - 1
-           A, B, and C are matrices in a[idx], b[idx] and c[idx]
-           C := alpha[i] * op(A) * op(B) + beta[i] * C
-           idx = idx + 1
-       end for
-   end for
-
-The strided API operation is defined as
-::
-
-   for i = 0 … batch_size - 1
-       A, B and C are matrices at offset i * stridea, i * strideb, i * stridec in a, b and c.
-       C := alpha * op(A) * op(B) + beta * C
-   end for
-
-where:
-
-op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A``, ``B``, and ``C`` are matrices,
-
-op(``A``) is ``m`` x ``k``, op(``B``) is ``k`` x ``n``, and ``C`` is ``m`` x ``n``.
-
- 
-For the group API, the ``a``, ``b`` and ``c`` arrays contain the pointers to all the input matrices.
-The total number of matrices in each of ``a``, ``b`` and ``c`` is given by:
-
-.. math::
-
-      total\_batch\_count = \sum_{i=0}^{group\_count-1}group\_size[i]    
- 
-For the strided API, the ``a``, ``b`` and ``c`` arrays contain all the input matrices. The total number of matrices
-in each of ``a``, ``b`` and ``c`` is given by the ``batch_size`` parameter.
-   
-**Group API**
-
-.. rubric:: Syntax
-   
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemm_batch(sycl::queue &queue,
-                              onemkl::transpose *transa,
-                              onemkl::transpose *transb,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              std::int64_t *k,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              const T **b,
-                              std::int64_t *ldb,
-                              T *beta,
-                              T **c,
-                              std::int64_t *ldc,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemm_batch(sycl::queue &queue,
-                              onemkl::transpose *transa,
-                              onemkl::transpose *transb,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              std::int64_t *k,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              const T **b,
-                              std::int64_t *ldb,
-                              T *beta,
-                              T **c,
-                              std::int64_t *ldc,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   transa
-      Array of ``group_count`` ``onemkl::transpose`` values. ``transa[i]`` specifies the form of op(``A``) used in
-      the matrix multiplication in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   transb
-      Array of ``group_count`` ``onemkl::transpose`` values. ``transb[i]`` specifies the form of op(``B``) used in
-      the matrix multiplication in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Array of ``group_count`` integers. ``m[i]`` specifies the
-      number of rows of op(``A``) and ``C`` for every matrix in group ``i``. All entries must be at least zero.
-
-   n
-      Array of ``group_count`` integers. ``n[i]`` specifies the
-      number of columns of op(``B``) and ``C`` for every matrix in group ``i``. All entries must be at least zero.
-
-   k
-      Array of ``group_count`` integers. ``k[i]`` specifies the
-      number of columns of op(``A``) and rows of op(``B``) for every matrix in group ``i``. All entries must be at
-      least zero.
-
-   alpha
-      Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor for every matrix-matrix
-      product in group ``i``.
-
-   a
-      Array of pointers to input matrices ``A`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      Array of ``group_count`` integers. ``lda[i]`` specifies the
-      leading dimension of ``A`` for every matrix in group ``i``. All
-      entries must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda[i]`` must be at least ``m[i]``.
-           - ``lda[i]`` must be at least ``k[i]``.
-         * - Row major
-           - ``lda[i]`` must be at least ``k[i]``.
-           - ``lda[i]`` must be at least ``m[i]``.
-             
-   b
-      Array of pointers to input matrices ``B`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   ldb
-      Array of ``group_count`` integers. ``ldb[i]`` specifies the
-      leading dimension of ``B`` for every matrix in group ``i``. All
-      entries must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb[i]`` must be at least ``k[i]``.
-           - ``ldb[i]`` must be at least ``n[i]``.
-         * - Row major
-           - ``ldb[i]`` must be at least ``n[i]``.
-           - ``ldb[i]`` must be at least ``k[i]``.
-             
-   beta
-      Array of ``group_count`` scalar elements. ``beta[i]`` specifies the scaling factor for matrix ``C`` 
-      for every matrix in group ``i``.
-
-   c
-      Array of pointers to input/output matrices ``C`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   ldc
-      Array of ``group_count`` integers. ``ldc[i]`` specifies the
-      leading dimension of ``C`` for every matrix in group ``i``.  All
-      entries must be positive and ``ldc[i]`` must be at least
-      ``m[i]`` if column major layout is used to store matrices or at
-      least ``n[i]`` if row major layout is used to store matrices.
-
-   group_count
-      Specifies the number of groups. Must be at least 0.
-
-   group_size
-      Array of ``group_count`` integers. ``group_size[i]`` specifies the
-      number of matrix multiply products in group ``i``. All entries must be at least 0.
-
-   dependencies
-         List of events to wait for before starting computation, if any.
-         If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Overwritten by the ``m[i]``-by-``n[i]`` matrix calculated by 
-      (``alpha[i]`` * op(``A``)*op(``B``) + ``beta[i]`` * ``C``) for group ``i``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta[i]`` = 0, the matrices ``C`` in group ``i`` do not need to be
-   initialized before calling ``gemm_batch``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemm_batch(sycl::queue &queue,
-                              onemkl::transpose transa,
-                              onemkl::transpose transb,
-                              std::int64_t m,
-                              std::int64_t n,
-                              std::int64_t k,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              const T *b,
-                              std::int64_t ldb,
-                              std::int64_t strideb,
-                              T beta,
-                              T *c,
-                              std::int64_t ldc,
-                              std::int64_t stridec,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemm_batch(sycl::queue &queue,
-                              onemkl::transpose transa,
-                              onemkl::transpose transb,
-                              std::int64_t m,
-                              std::int64_t n,
-                              std::int64_t k,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              const T *b,
-                              std::int64_t ldb,
-                              std::int64_t strideb,
-                              T beta,
-                              T *c,
-                              std::int64_t ldc,
-                              std::int64_t stridec,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   transa
-      Specifies op(``A``), the transposition operation applied to the
-      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   transb
-      Specifies op(``B``), the transposition operation applied to the
-      matrices ``B``. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of op(``A``) and ``C``. Must be at least zero.
-
-   n
-      Number of columns of op(``B``) and ``C``. Must be at least zero.
-
-   k
-      Number of columns of op(``A``) and rows of op(``B``). Must be at
-      least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix products.
-
-   a
-      Pointer to input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``m``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``m``.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   b
-      Pointer to input matrices ``B`` with size ``strideb`` * ``batch_size``.
-
-   ldb
-      The leading dimension of the matrices ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-
-   strideb
-      Stride between different ``B`` matrices.
-
-   beta
-      Scaling factor for the matrices ``C``.
-
-   c
-      Pointer to input/output matrices ``C`` with size ``stridec`` * ``batch_size``.
-
-   ldc
-      The leading dimension of the matrices ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   stridec
-      Stride between different ``C`` matrices.
-
-   batch_size
-      Specifies the number of matrix multiply operations to perform.
-
-   dependencies
-         List of events to wait for before starting computation, if any.
-         If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output matrices, overwritten by ``batch_size`` matrix multiply
-      operations of the form ``alpha`` * op(``A``)*op(``B``) + ``beta`` * ``C``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized before
-   calling ``gemm_batch``.
-
-.. container:: section
-      
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/gemm_bias.rst b/docs/domains/blas/gemm_bias.rst
deleted file mode 100644
index dd7ce4ecc..000000000
--- a/docs/domains/blas/gemm_bias.rst
+++ /dev/null
@@ -1,513 +0,0 @@
-.. _onemkl_blas_gemm_bias:
-
-gemm_bias
-=========
-
-Computes a matrix-matrix product using general integer matrices with bias.
-
-.. _onemkl_blas_gemm_bias_description:
-
-.. rubric:: Description
-
-The gemm_bias routines compute a scalar-matrix-matrix product and
-add the result to a scalar-matrix product, using general integer matrices with biases/offsets. 
-The operation is defined as:
-
-.. math::
-      
-      \scriptstyle C \leftarrow alpha*(op(A) - A\_offset)*(op(B) - B\_offset) + beta*C + C\_offset
-
-where:
-
-op(``X``) is one of op(``X``) = ``X``, or op(``X``) = ``X``\ :sup:`T`, or
-op(``X``) = ``X``\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A_offset`` is an ``m``-by-``k`` matrix with every element equal to the value ``ao``,
-
-``B_offset`` is a ``k``-by-``n`` matrix with every element equal to the value ``bo``,
-
-``C_offset`` is an ``m``-by-``n`` matrix defined by the
-``co`` array as described below,
-
-``A``, ``B``, and ``C`` are matrices,
-
-op(``A``) is ``m`` x ``k``, op(``B``) is ``k`` x ``n``, and
-``C`` is ``m`` x ``n``.
-
-``gemm_bias`` supports the following precisions.
-
-  .. list-table:: 
-     :header-rows: 1
-
-     * -  Ts 
-       -  Ta 
-       -  Tb 
-       -  Tc 
-     * -  ``float`` 
-       -  ``std::uint8_t`` 
-       -  ``std::uint8_t`` 
-       -  ``std::int32_t`` 
-     * -  ``float`` 
-       -  ``std::int8_t`` 
-       -  ``std::uint8_t`` 
-       -  ``std::int32_t`` 
-     * -  ``float`` 
-       -  ``std::uint8_t`` 
-       -  ``std::int8_t`` 
-       -  ``std::int32_t`` 
-     * -  ``float`` 
-       -  ``std::int8_t`` 
-       -  ``std::int8_t`` 
-       -  ``std::int32_t`` 
-
-.. _onemkl_blas_gemm_bias_buffer:
-
-gemm_bias (Buffer Version)
---------------------------
-
-.. rubric:: Syntax
-      
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gemm_bias(sycl::queue &queue,
-                      onemkl::transpose transa,
-                      onemkl::transpose transb,
-                      onemkl::offset offset_type,
-                      std::int64_t m,
-                      std::int64_t n,
-                      std::int64_t k,
-                      Ts alpha,
-                      sycl::buffer<Ta,1> &a,
-                      std::int64_t lda,
-                      Ta ao,
-                      sycl::buffer<Tb,1> &b,
-                      std::int64_t ldb,
-                      Tb bo,
-                      Ts beta,
-                      sycl::buffer<Tc,1> &c,
-                      std::int64_t ldc,
-                      sycl::buffer<Tc,1> &co)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gemm_bias(sycl::queue &queue,
-                      onemkl::transpose transa,
-                      onemkl::transpose transb,
-                      onemkl::offset offset_type,
-                      std::int64_t m,
-                      std::int64_t n,
-                      std::int64_t k,
-                      Ts alpha,
-                      sycl::buffer<Ta,1> &a,
-                      std::int64_t lda,
-                      Ta ao,
-                      sycl::buffer<Tb,1> &b,
-                      std::int64_t ldb,
-                      Tb bo,
-                      Ts beta,
-                      sycl::buffer<Tc,1> &c,
-                      std::int64_t ldc,
-                      sycl::buffer<Tc,1> &co)
-   }
-      
-.. container:: section
-   
-   .. rubric:: Input Parameters
- 
-   queue
-      The queue where the routine should be executed.
- 
-   transa
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See
-      :ref:`onemkl_datatypes` for
-      more details.
- 
-   transb
-      Specifies op(``B``), the transposition operation applied to
-      ``B``. See
-      :ref:`onemkl_datatypes` for
-      more details.
- 
-   offset_type
-      Specifies the form of ``C_offset`` used in the matrix
-      multiplication. See
-      :ref:`onemkl_datatypes` for
-      more details.
- 
-   m
-      Number of rows of op(``A``) and ``C``. Must be at least zero.
- 
-   n
-      Number of columns of op(``B``) and ``C``. Must be at least
-      zero.
- 
-   k
-      Number of columns of op(``A``) and rows of op(``B``). Must be
-      at least zero.
- 
-   alpha
-      Scaling factor for the matrix-matrix product.
- 
-   a
-      The buffer holding the input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-         * - Row major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-
-      See :ref:`matrix-storage` for more details.
- 
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``m``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``m``.
- 
-   ao 
-      Specifies the scalar offset value for matrix ``A``.
- 
-   b
-      Buffer holding the input matrix ``B``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-         * - Row major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-   
-      See :ref:`matrix-storage` for more details.
- 
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
- 
-   bo 
-      Specifies the scalar offset value for matrix ``B``.
- 
-   beta
-      Scaling factor for matrix ``C``.
- 
-   c
-      Buffer holding the input/output matrix ``C``.  It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices.
-      See :ref:`matrix-storage` for more details.
- 
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-      
-   co
-      Buffer holding the offset values for matrix ``C``.
- 
-      If ``offset_type`` = ``offset::fix``, the ``co`` array must have
-      size at least 1.
- 
- 
-      If ``offset_type`` = ``offset::col``, the ``co`` array must have
-      size at least ``max(1,m)``.
- 
- 
-      If ``offset_type`` = ``offset::row``, the ``co`` array must have
-      size at least ``max(1,n)``. 
- 
-.. container:: section
- 
-   .. rubric:: Output Parameters
- 
-   c
-      Output buffer, overwritten by ``alpha`` * (op(``A``) -
-      ``A_offset``)*(op(``B``) - ``B_offset``) + ``beta`` * ``C`` + ``C_offset``.
- 
-.. container:: section
- 
-   .. rubric:: Notes
- 
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized
-   before calling ``gemm_bias``.
-
-
-.. _onemkl_blas_gemm_bias_usm:
-
-gemm_bias (USM Version)
------------------------
-
-.. rubric:: Syntax
-      
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemm_bias(sycl::queue &queue,
-                             onemkl::transpose transa,
-                             onemkl::transpose transb,
-                             onemkl::offset offset_type,
-                             std::int64_t m,
-                             std::int64_t n,
-                             std::int64_t k,
-                             Ts alpha,
-                             const Ta *a,
-                             std::int64_t lda,
-                             Ta ao,
-                             const Tb *b,
-                             std::int64_t ldb,
-                             Tb bo,
-                             Ts beta,
-                             Tc *c,
-                             std::int64_t ldc,
-                             const Tc *co,
-                             const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemm_bias(sycl::queue &queue,
-                             onemkl::transpose transa,
-                             onemkl::transpose transb,
-                             onemkl::offset offset_type,
-                             std::int64_t m,
-                             std::int64_t n,
-                             std::int64_t k,
-                             Ts alpha,
-                             const Ta *a,
-                             std::int64_t lda,
-                             Ta ao,
-                             const Tb *b,
-                             std::int64_t ldb,
-                             Tb bo,
-                             Ts beta,
-                             Tc *c,
-                             std::int64_t ldc,
-                             const Tc *co,
-                             const std::vector<sycl::event> &dependencies = {})
-   }
-      
-.. container:: section
-   
-   .. rubric:: Input Parameters
- 
-   queue
-      The queue where the routine should be executed.
- 
-   transa
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See
-      :ref:`onemkl_datatypes` for
-      more details.
- 
-   transb
-      Specifies op(``B``), the transposition operation applied to
-      ``B``. See
-      :ref:`onemkl_datatypes` for
-      more details.
- 
-   offset_type
-      Specifies the form of ``C_offset`` used in the matrix
-      multiplication. See
-      :ref:`onemkl_datatypes` for
-      more details.
- 
-   m
-      Number of rows of op(``A``) and ``C``. Must be at least zero.
- 
-   n
-      Number of columns of op(``B``) and ``C``. Must be at least
-      zero.
- 
-   k
-      Number of columns of op(``A``) and rows of op(``B``). Must be
-      at least zero.
- 
-   alpha
-      Scaling factor for the matrix-matrix product.
- 
-   a
-      Pointer to input matrix ``A``.
- 
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-         * - Row major
-           - ``A`` is an ``m``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``m``.
-           - ``A`` is a ``k``-by-``m`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
- 
-      See :ref:`matrix-storage` for more details.
- 
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``m``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``m``.
- 
-   ao
-      Specifies the scalar offset value for matrix ``A``.
- 
-   b
-      Pointer to input matrix ``B``.
- 
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-         * - Row major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
- 
-      See :ref:`matrix-storage` for more details.
- 
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
- 
-   bo 
-      Specifies the scalar offset value for matrix ``B``.
- 
-   beta
-      Scaling factor for matrix ``C``.
- 
-   c
-      Pointer to input/output matrix ``C``. It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for more details.
- 
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   co
-      Pointer to offset values for matrix ``C``.
- 
- 
-      If ``offset_type`` = ``offset::fix``, the ``co`` array must have
-      size at least 1.
- 
- 
-      If ``offset_type`` = ``offset::col``, the ``co`` array must have
-      size at least ``max(1,m)``.
- 
- 
-      If ``offset_type`` = ``offset::row``, the ``co`` array must have
-      size at least ``max(1,n)``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
- 
-.. container:: section
- 
-   .. rubric:: Output Parameters
- 
-   c
-      Pointer to the output matrix, overwritten by ``alpha`` * (op(``A``) -
-      ``A_offset``)*(op(``B``) - ``B_offset``) + ``beta`` * ``C`` + ``C_offset``.
- 
-.. container:: section
- 
-   .. rubric:: Notes
- 
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized
-   before calling ``gemm_bias``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/gemmt.rst b/docs/domains/blas/gemmt.rst
deleted file mode 100644
index 7f224b2f7..000000000
--- a/docs/domains/blas/gemmt.rst
+++ /dev/null
@@ -1,418 +0,0 @@
-.. _onemkl_blas_gemmt:
-
-gemmt
-=====
-
-Computes a matrix-matrix product with general matrices, but updates
-only the upper or lower triangular part of the result matrix.
-
-.. _onemkl_blas_gemmt_description:
-
-.. rubric:: Description
-
-The gemmt routines compute a scalar-matrix-matrix product and add
-the result to the upper or lower part of a scalar-matrix product,
-with general matrices. The operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*op(A)*op(B) + beta*C 
-
-where:
-
-op(``X``) is one of op(``X``) = ``X``, or op(``X``) = ``X``\ :sup:`T`, or
-op(``X``) = ``X``\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars
-
-``A``, ``B``, and ``C`` are matrices
-
-op(``A``) is ``n`` x ``k``, op(``B``) is ``k`` x ``n``, and
-``C`` is ``n`` x ``n``.
-
-``gemmt`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_gemmt_buffer:
-
-gemmt (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gemmt(sycl::queue &queue,
-                  onemkl::uplo upper_lower,
-                  onemkl::transpose transa,
-                  onemkl::transpose transb,
-                  std::int64_t n,
-                  std::int64_t k,
-                  T alpha,
-                  sycl::buffer<T,1> &a,
-                  std::int64_t lda,
-                  sycl::buffer<T,1> &b,
-                  std::int64_t ldb,
-                  T beta,
-                  sycl::buffer<T,1> &c,
-                  std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gemmt(sycl::queue &queue,
-                  onemkl::uplo upper_lower,
-                  onemkl::transpose transa,
-                  onemkl::transpose transb,
-                  std::int64_t n,
-                  std::int64_t k,
-                  T alpha,
-                  sycl::buffer<T,1> &a,
-                  std::int64_t lda,
-                  sycl::buffer<T,1> &b,
-                  std::int64_t ldb,
-                  T beta,
-                  sycl::buffer<T,1> &c,
-                  std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``\ ’s data is stored in its upper or
-      lower triangle. See :ref:`onemkl_datatypes` for more details.
-   
-   transa
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   transb
-      Specifies op(``B``), the transposition operation applied to
-      ``B``. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows of op(``A``), columns of op(``B``), and
-      columns and rows of ``C``. Must be at least zero.
-
-   k
-      Number of columns of op(``A``) and rows of op(``B``). Must be
-      at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Buffer holding the input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   b
-      Buffer holding the input matrix ``B``.
-      
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-         * - Row major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-   
-      See :ref:`matrix-storage` for more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      Buffer holding the input/output matrix ``C``. Must have size at
-      least ``ldc`` \* ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least
-      ``n``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by the upper or lower triangular
-      part of ``alpha`` * op(``A``)*op(``B``) + ``beta`` * ``C``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized
-   before calling gemmt.
-
-
-.. _onemkl_blas_gemmt_usm:
-
-gemmt (USM Version)
--------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemmt(sycl::queue &queue,
-                         onemkl::uplo upper_lower,
-                         onemkl::transpose transa,
-                         onemkl::transpose transb,
-                         std::int64_t n,
-                         std::int64_t k,
-                         T alpha,
-                         const T* a,
-                         std::int64_t lda,
-                         const T* b,
-                         std::int64_t ldb,
-                         T beta,
-                         T* c,
-                         std::int64_t ldc,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemmt(sycl::queue &queue,
-                         onemkl::uplo upper_lower,
-                         onemkl::transpose transa,
-                         onemkl::transpose transb,
-                         std::int64_t n,
-                         std::int64_t k,
-                         T alpha,
-                         const T* a,
-                         std::int64_t lda,
-                         const T* b,
-                         std::int64_t ldb,
-                         T beta,
-                         T* c,
-                         std::int64_t ldc,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``\ ’s data is stored in its upper or
-      lower triangle. See
-      :ref:`onemkl_datatypes` for
-      more details.
-
-   transa
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See
-      :ref:`onemkl_datatypes` for
-      more details.
-
-   transb
-      Specifies op(``B``), the transposition operation applied to
-      ``B``. See
-      :ref:`onemkl_datatypes` for
-      more details.
-
-   n
-      Number of rows of op(``A``), columns of op(``B``), and
-      columns and rows of ``C``. Must be at least zero.
-
-   k
-      Number of columns of op(``A``) and rows of op(``B``). Must be
-      at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Pointer to input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   b
-      Pointer to input matrix ``B``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-         * - Row major
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-
-      See :ref:`matrix-storage` for more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``B`` not transposed
-           - ``B`` transposed
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-      
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      Pointer to input/output matrix ``C``. Must have size at least
-      ``ldc`` \* ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least
-      ``n``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by the upper or lower
-      triangular part of ``alpha`` * op(``A``)*op(``B``) + ``beta`` * ``C``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized
-   before calling gemmt.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/gemv.rst b/docs/domains/blas/gemv.rst
deleted file mode 100644
index 9577fc111..000000000
--- a/docs/domains/blas/gemv.rst
+++ /dev/null
@@ -1,261 +0,0 @@
-.. _onemkl_blas_gemv:
-
-gemv
-====
-
-Computes a matrix-vector product using a general matrix.
-
-.. _onemkl_blas_gemv_description:
-
-.. rubric:: Description
-
-The ``gemv`` routines compute a scalar-matrix-vector product and add the
-result to a scalar-vector product, with a general matrix. The
-operation is defined as:
-
-.. math::
-      
-      y \leftarrow alpha*op(A)*x + beta*y
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``m``-by-``n`` matrix, and ``x``, ``y`` are vectors.
-
-``gemv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_gemv_buffer:
-
-gemv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gemv(sycl::queue &queue,
-                 onemkl::transpose trans,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gemv(sycl::queue &queue,
-                 onemkl::transpose trans,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   trans
-      Specifies ``op(A)``, the transposition operation applied to ``A``.
-
-   m
-      Specifies the number of rows of the matrix ``A``. The value of
-      ``m`` must be at least zero.
-
-   n
-      Specifies the number of columns of the matrix ``A``. The value of
-      ``n`` must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      The buffer holding the input matrix ``A``. Must have a size of at
-      least ``lda``\ \*\ ``n`` if column major layout is used or at
-      least ``lda``\ \*\ ``m`` if row major layout is used. See
-      :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at least
-      ``m`` if column major layout is used or at least ``n`` if row
-      major layout is used.
-
-   x
-      Buffer holding input vector ``x``. The length ``len`` of vector
-      ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A`` is
-      transposed. The buffer must be of size at least (1 + (``len`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for more details.
-
-   incx
-      The stride of vector ``x``.
-
-   beta
-      The scaling factor for vector ``y``.
-
-   y
-      Buffer holding input/output vector ``y``. The length ``len`` of
-      vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if
-      ``A`` is transposed. The buffer must be of size at least (1 +
-      (``len`` - 1)*abs(``incy``)) where ``len`` is this length. See
-      :ref:`matrix-storage` for more details.
-
-   incy
-      The stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      The buffer holding updated vector ``y``.
-
-
-.. _onemkl_blas_gemv_usm:
-
-gemv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemv(sycl::queue &queue,
-                        onemkl::transpose trans,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemv(sycl::queue &queue,
-                        onemkl::transpose trans,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   trans
-      Specifies ``op(A)``, the transposition operation applied to
-      ``A``. See
-      :ref:`onemkl_datatypes` for
-      more details.
-
-   m
-      Specifies the number of rows of the matrix ``A``. The value of
-      ``m`` must be at least zero.
-
-   n
-      Specifies the number of columns of the matrix ``A``. The value
-      of ``n`` must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to the input matrix ``A``. Must have a size of at
-      least ``lda``\ \*\ ``n`` if column major layout is used or at
-      least ``lda``\ \*\ ``m`` if row major layout is used. See
-      :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at least
-      ``m`` if column major layout is used or at least ``n`` if row
-      major layout is used.
-
-   x
-      Pointer to the input vector ``x``. The length ``len`` of vector
-      ``x`` is ``n`` if ``A`` is not transposed, and ``m`` if ``A``
-      is transposed. The array holding vector ``x`` must be of size
-      at least (1 + (``len`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      The stride of vector ``x``.
-
-   beta
-      The scaling factor for vector ``y``.
-
-   y
-      Pointer to input/output vector ``y``. The length ``len`` of
-      vector ``y`` is ``m``, if ``A`` is not transposed, and ``n`` if
-      ``A`` is transposed. The array holding input/output vector
-      ``y`` must be of size at least (1 + (``len`` -
-      1)*abs(``incy``)) where ``len`` is this length. See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      The stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      The pointer to updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/gemv_batch.rst b/docs/domains/blas/gemv_batch.rst
deleted file mode 100644
index 0a56b73b6..000000000
--- a/docs/domains/blas/gemv_batch.rst
+++ /dev/null
@@ -1,472 +0,0 @@
-.. _onemkl_blas_gemv_batch:
-
-gemv_batch
-==========
-
-Computes a group of ``gemv`` operations.
-
-.. _onemkl_blas_gemv_batch_description:
-
-.. rubric:: Description
-
-The ``gemv_batch`` routines are batched versions of
-:ref:`onemkl_blas_gemv`, performing multiple ``gemv`` operations in a
-single call. Each ``gemv`` operation performs a scalar-matrix-vector
-product and adds the result to a scalar-vector product.
-   
-``gemv_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_gemv_batch_buffer:
-
-gemv_batch (Buffer Version)
----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``gemv_batch`` supports only the strided API. 
-
-The strided API operation is defined as:
-::
-
-   for i = 0 … batch_size – 1
-       A is a matrix at offset i * stridea in a.
-       X and Y are vectors at offset i * stridex, i * stridey, in x and y.
-       Y := alpha * op(A) * X + beta * Y
-   end for
-
-where:
-
-op(A) is one of op(A) = A, or op(A) = A\ :sup:`T`, or op(A) = A\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is a matrix and ``X`` and ``Y`` are vectors,
-
-The ``x`` and ``y`` buffers contain all the input vectors. The stride
-between vectors is given by the stride parameter. The total number of
-vectors in ``x`` and ``y`` buffers is given by the ``batch_size``
-parameter.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gemv_batch(sycl::queue &queue,
-                       onemkl::transpose trans,
-                       std::int64_t m,
-                       std::int64_t n,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       T beta,
-                       sycl::buffer<T,1> &y,
-                       std::int64_t incy,
-                       std::int64_t stridey,
-                       std::int64_t batch_size)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gemv_batch(sycl::queue &queue,
-                       onemkl::transpose trans,
-                       std::int64_t m,
-                       std::int64_t n,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &x,
-                       std::int64_t incx,
-                       std::int64_t stridex,
-                       T beta,
-                       sycl::buffer<T,1> &y,
-                       std::int64_t incy,
-                       std::int64_t stridey,
-                       std::int64_t batch_size)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   trans
-      Specifies op(``A``) the transposition operation applied to the
-      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of op(``A``). Must be at least zero.
-
-   n
-      Number of columns of op(``A``). Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector products.
-
-   a
-      Buffer holding the input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive
-      and at least ``m`` if column major layout is used or at least
-      ``n`` if row major layout is used.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   x
-      Buffer holding the input vectors ``X`` with size ``stridex`` * ``batch_size``.
-
-   incx
-      The stride of the vector ``X``. It must be positive.
-
-   stridex
-      Stride between different consecutive ``X`` vectors, must be at least 0.
-
-   beta
-      Scaling factor for the vector ``Y``.
-
-   y
-      Buffer holding input/output vectors ``Y`` with size ``stridey`` * ``batch_size``.
-
-   incy
-      Stride between two consecutive elements of the ``y`` vectors.
-
-   stridey
-      Stride between two consecutive ``Y`` vectors. Must be at least
-      (1 + (len-1)*abs(incy)) where ``len`` is ``m`` if the matrix ``A``
-      is not transposed or ``n`` otherwise.
-
-   batch_size
-      Specifies the number of matrix-vector operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Output overwritten by ``batch_size`` matrix-vector product
-      operations of the form ``alpha`` * op(``A``) * ``X`` + ``beta`` * ``Y``.
-
-
-.. _onemkl_blas_gemv_batch_usm:
-
-gemv_batch (USM Version)
----------------------------
-
-.. rubric:: Description
-
-The USM version of ``gemv_batch`` supports the group API and strided API. 
-
-The group API operation is defined as:
-::
-
-   idx = 0
-   for i = 0 … group_count – 1
-       for j = 0 … group_size – 1
-           A is an m x n matrix in a[idx]
-           X and Y are vectors in x[idx] and y[idx]
-           Y := alpha[i] * op(A) * X + beta[i] * Y
-           idx = idx + 1
-       end for
-   end for
-
-The strided API operation is defined as
-::
-
-   for i = 0 … batch_size – 1
-       A is a matrix at offset i * stridea in a.
-       X and Y are vectors at offset i * stridex, i * stridey in x and y.
-       Y := alpha * op(A) * X + beta * Y
-   end for
-
-where:
-
-op(A) is one of op(A) = A, or op(A) = A\ :sup:`T`, or op(A) = A\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is a matrix and ``X`` and ``Y`` are vectors,
-
-For group API, ``x`` and ``y`` arrays contain the pointers for all the input vectors. 
-``A`` array contains the pointers to all input matrices.
-The total number of vectors in ``x`` and ``y`` and matrices in ``A`` are given by: 
-
-.. math::
-
-      total\_batch\_count = \sum_{i=0}^{group\_count-1}group\_size[i]    
- 
-For strided API, ``x`` and ``y`` arrays contain all the input
-vectors. ``a`` array contains all the input matrices. The
-total number of vectors in ``x`` and ``y`` and matrices in ``a`` is given by the
-``batch_size`` parameter.
-   
-**Group API**
-
-.. rubric:: Syntax
-   
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemv_batch(sycl::queue &queue,
-                              onemkl::transpose *trans,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              const T **x,
-                              std::int64_t *incx,
-                              T *beta,
-                              T **y,
-                              std::int64_t *incy,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemv_batch(sycl::queue &queue,
-                              onemkl::transpose *trans,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              const T **x,
-                              std::int64_t *incx,
-                              T *beta,
-                              T **y,
-                              std::int64_t *incy,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   trans
-      Array of ``group_count`` ``onemkl::transpose`` values. ``trans[i]`` specifies the form of op(``A``) used in
-      the matrix-vector product in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Array of ``group_count`` integers. ``m[i]`` specifies the
-      number of rows of op(``A``) for every matrix in group ``i``. All entries must be at least zero.
-
-   n
-      Array of ``group_count`` integers. ``n[i]`` specifies the
-      number of columns of op(``A``) for every matrix in group ``i``. All entries must be at least zero.
-
-   alpha
-      Array of ``group_count`` scalar elements. ``alpha[i]`` specifies
-      the scaling factor for every matrix-vector product in group
-      ``i``.
-
-   a
-      Array of pointers to input matrices ``A`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      Array of ``group_count`` integers. ``lda[i]`` specifies the
-      leading dimension of ``A`` for every matrix in group ``i``. All
-      entries must be positive and at least ``m`` if column major
-      layout is used or at least ``n`` if row major layout is used.
-             
-   x
-      Array of pointers to input vectors ``X`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   incx
-      Array of ``group_count`` integers. ``incx[i]`` specifies the
-      stride of ``X`` for every vector in group ``i``. All
-      entries must be positive.
-             
-   beta
-      Array of ``group_count`` scalar elements. ``beta[i]`` specifies
-      the scaling factor for vector ``Y`` for every vector in group
-      ``i``.
-
-   y
-      Array of pointers to input/output vectors ``Y`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   incy
-      Array of ``group_count`` integers. ``incy[i]`` specifies the
-      stride between two consecutive elements of ``Y`` for every
-      vector in group ``i``. All entries must be positive. Unlike
-      ``lda[i]``, ``incy[i]`` is not a leading dimension and is not
-      bounded below by ``m[i]`` or ``n[i]``.
-
-   group_count
-      Specifies the number of groups. Must be at least 0.
-
-   group_size
-      Array of ``group_count`` integers. ``group_size[i]`` specifies the
-      number of matrix-vector products in group ``i``. All entries must be at least 0.
-
-   dependencies
-         List of events to wait for before starting computation, if any.
-         If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Overwritten by vector calculated by 
-      (``alpha[i]`` * op(``A``) * ``X`` + ``beta[i]`` * ``Y``) for group ``i``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gemv_batch(sycl::queue &queue,
-                              onemkl::transpose trans,
-                              std::int64_t m,
-                              std::int64_t n,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T beta,
-                              T *y,
-                              std::int64_t incy,
-                              std::int64_t stridey,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gemv_batch(sycl::queue &queue,
-                              onemkl::transpose trans,
-                              std::int64_t m,
-                              std::int64_t n,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              const T *x,
-                              std::int64_t incx,
-                              std::int64_t stridex,
-                              T beta,
-                              T *y,
-                              std::int64_t incy,
-                              std::int64_t stridey,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   trans
-      Specifies op(``A``) the transposition operation applied to the
-      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of op(``A``). Must be at least zero.
-
-   n
-      Number of columns of op(``A``). Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector products.
-
-   a
-      Pointer to the input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive
-      and at least ``m`` if column major layout is used or at least
-      ``n`` if row major layout is used.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   x
-      Pointer to the input vectors ``X`` with size ``stridex`` * ``batch_size``.
-
-   incx
-      Stride of the vector ``X``. It must be positive.
-
-   stridex
-      Stride between different consecutive ``X`` vectors, must be at least 0.
-
-   beta
-      Scaling factor for the vector ``Y``.
-
-   y
-      Pointer to the input/output vectors ``Y`` with size ``stridey`` * ``batch_size``.
-
-   incy
-      Stride between two consecutive elements of the ``y`` vectors.
-
-   stridey
-      Stride between two consecutive ``Y`` vectors. Must be at least
-      (1 + (len-1)*abs(incy)) where ``len`` is ``m`` if the matrix ``A``
-      is not transposed or ``n`` otherwise.
-
-   batch_size
-      Specifies the number of matrix-vector operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Output overwritten by ``batch_size`` matrix-vector product
-      operations of the form ``alpha`` * op(``A``) * ``X`` + ``beta`` * ``Y``.
-
-.. container:: section
-      
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/ger.rst b/docs/domains/blas/ger.rst
deleted file mode 100644
index ea128414d..000000000
--- a/docs/domains/blas/ger.rst
+++ /dev/null
@@ -1,226 +0,0 @@
-.. _onemkl_blas_ger:
-
-ger
-===
-
-Computes a rank-1 update of a general matrix.
-
-.. _onemkl_blas_ger_description:
-
-.. rubric:: Description
-
-The ``ger`` routines compute a scalar-vector-vector product and add the
-result to a general matrix. The operation is defined as:
-
-.. math::
-
-      A \leftarrow alpha*x*y^T + A
-
-where:
-
-``alpha`` is scalar,
-
-``A`` is an ``m``-by-``n`` matrix,
-
-``x`` is a vector of length ``m``,
-
-``y`` is a vector of length ``n``.
-
-``ger`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_ger_buffer:
-
-ger (Buffer Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void ger(sycl::queue &queue,
-                std::int64_t m,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &y,
-                std::int64_t incy,
-                sycl::buffer<T,1> &a,
-                std::int64_t lda)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void ger(sycl::queue &queue,
-                std::int64_t m,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &y,
-                std::int64_t incy,
-                sycl::buffer<T,1> &a,
-                std::int64_t lda)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``m`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n`` if column major layout is used or at least ``lda``\ \*\ ``m``
-      if row major layout is used. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at least
-      ``m`` if column major layout is used or at least ``n`` if row
-      major layout is used.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated matrix ``A``.
-
-
-.. _onemkl_blas_ger_usm:
-
-ger (USM Version)
------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event ger(sycl::queue &queue,
-                       std::int64_t m,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       const T *y,
-                       std::int64_t incy,
-                       T *a,
-                       std::int64_t lda,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event ger(sycl::queue &queue,
-                       std::int64_t m,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       const T *y,
-                       std::int64_t incy,
-                       T *a,
-                       std::int64_t lda,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``m`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding
-      input vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Pointer to input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n`` if column major layout is used or at least ``lda``\ \*\ ``m``
-      if row major layout is used. See :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at least
-      ``m`` if column major layout is used or at least ``n`` if row
-      major layout is used.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated matrix ``A``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/gerc.rst b/docs/domains/blas/gerc.rst
deleted file mode 100644
index 8a8c28463..000000000
--- a/docs/domains/blas/gerc.rst
+++ /dev/null
@@ -1,227 +0,0 @@
-.. _onemkl_blas_gerc:
-
-gerc
-====
-
-Computes a rank-1 update (conjugated) of a general complex matrix.
-
-.. _onemkl_blas_gerc_description:
-
-.. rubric:: Description
-
-The ``gerc`` routines compute a scalar-vector-vector product and add the
-result to a general matrix. The operation is defined as:
-
-.. math::
-
-      A \leftarrow alpha*x*y^H + A
-
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``m``-by-``n`` matrix,
-
-``x`` is a vector of length ``m``,
-
-``y`` is a vector of length ``n``.
-
-``gerc`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_gerc_buffer:
-
-gerc (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void gerc(sycl::queue &queue,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void gerc(sycl::queue &queue,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``m`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n`` if column major layout is used or at least ``lda``\ \*\ ``m``
-      if row major layout is used. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at least
-      ``m`` if column major layout is used or at least ``n`` if row
-      major layout is used.
-
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated matrix ``A``.
-
-
-.. _onemkl_blas_gerc_usm:
-
-gerc (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event gerc(sycl::queue &queue,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event gerc(sycl::queue &queue,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Pointer to the input vector ``x``. The array holding input
-      vector ``x`` must be of size at least (1 + (``m`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to the input vector ``y``. The array holding the
-      input vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n`` if column
-      major layout is used or at least ``lda``\ \*\ ``m`` if row
-      major layout is used. See :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at least
-      ``m`` if column major layout is used or at least ``n`` if row
-      major layout is used.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated matrix ``A``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/geru.rst b/docs/domains/blas/geru.rst
deleted file mode 100644
index 8e22b3ba9..000000000
--- a/docs/domains/blas/geru.rst
+++ /dev/null
@@ -1,227 +0,0 @@
-.. _onemkl_blas_geru:
-
-geru
-====
-
-Computes a rank-1 update (unconjugated) of a general complex matrix.
-
-.. _onemkl_blas_geru_description:
-
-.. rubric:: Description
-
-The ``geru`` routines compute a scalar-vector-vector product and
-add the result to a general matrix. The operation is defined as
-
-.. math::
-      
-      A \leftarrow alpha*x*y^T + A
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``m``-by-``n`` matrix,
-
-``x`` is a vector of length ``m``,
-
-``y`` is a vector of length ``n``.
-
-``geru`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_geru_buffer:
-
-geru (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void geru(sycl::queue &queue,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void geru(sycl::queue &queue,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``m`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n`` if column major layout is used or at least ``lda``\ \*\ ``m``
-      if row major layout is used. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at least
-      ``m`` if column major layout is used or at least ``n`` if row
-      major layout is used.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated matrix ``A``.
-
-
-.. _onemkl_blas_geru_usm:
-
-geru (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event geru(sycl::queue &queue,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event geru(sycl::queue &queue,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   m
-      Number of rows of ``A``. Must be at least zero.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Pointer to the input vector ``x``. The array holding input
-      vector ``x`` must be of size at least (1 + (``m`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding
-      input vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n`` if column
-      major layout is used or at least ``lda``\ \*\ ``m`` if row
-      major layout is used. See :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be positive and at
-      least ``m`` if column major layout is used or at least ``n``
-      if row major layout is used.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated matrix ``A``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hbmv.rst b/docs/domains/blas/hbmv.rst
deleted file mode 100644
index a86e7fef8..000000000
--- a/docs/domains/blas/hbmv.rst
+++ /dev/null
@@ -1,245 +0,0 @@
-.. _onemkl_blas_hbmv:
-
-hbmv
-====
-
-Computes a matrix-vector product using a Hermitian band matrix.
-
-.. _onemkl_blas_hbmv_description:
-
-.. rubric:: Description
-
-The ``hbmv`` routines compute a scalar-matrix-vector product and add the
-result to a scalar-vector product, with a Hermitian band matrix. The
-operation is defined as
-
-.. math::
-
-      y \leftarrow alpha*A*x + beta*y
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``n``-by-``n`` Hermitian band matrix, with ``k``
-super-diagonals,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``hbmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_hbmv_buffer:
-
-hbmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void hbmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void hbmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of super-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` + 1),
-      and positive.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Buffer holding input/output vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_hbmv_usm:
-
-hbmv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event hbmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event hbmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of super-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to the input matrix ``A``. The array holding input
-      matrix ``A`` must have size at least ``lda``\ \*\ ``n``. See
-      :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` +
-      1), and positive.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Pointer to input/output vector ``y``. The array holding
-      input/output vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hemm.rst b/docs/domains/blas/hemm.rst
deleted file mode 100644
index 4098cd75b..000000000
--- a/docs/domains/blas/hemm.rst
+++ /dev/null
@@ -1,315 +0,0 @@
-.. _onemkl_blas_hemm:
-
-hemm
-====
-
-Computes a matrix-matrix product where one input matrix is Hermitian
-and one is general.
-
-.. _onemkl_blas_hemm_description:
-
-.. rubric:: Description
-
-The ``hemm`` routines compute a scalar-matrix-matrix product and add the
-result to a scalar-matrix product, where one of the matrices in the
-multiplication is Hermitian. The argument ``left_right`` determines
-if the Hermitian matrix, ``A``, is on the left of the multiplication
-(``left_right`` = ``side::left``) or on the right (``left_right`` =
-``side::right``). Depending on ``left_right``, the operation is
-defined as:
-
-.. math::
-
-      C \leftarrow alpha*A*B + beta*C
-
-or
-
-.. math::
-
-      C \leftarrow alpha*B*A + beta*C
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is a Hermitian matrix, either ``m``-by-``m`` or ``n``-by-``n``
-depending on ``left_right``,
-
-``B`` and ``C`` are ``m``-by-``n`` matrices.
-
-``hemm`` supports the following precisions:
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_hemm_buffer:
-
-hemm (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void hemm(sycl::queue &queue,
-                 onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb,
-                 T beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void hemm(sycl::queue &queue,
-                 onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb,
-                 T beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` is on the left side of the multiplication
-      (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Specifies the number of rows of the matrix ``B`` and ``C``.
-
-      The value of ``m`` must be at least zero.
-
-   n
-      Specifies the number of columns of the matrix ``B`` and ``C``.
-
-      The value of ``n`` must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``A`` is on the left of the multiplication,
-      or ``lda``\ \*\ ``n`` if ``A`` is on the right. See :ref:`matrix-storage`
-      for more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if ``A`` is on
-      the left of the multiplication, or at least ``n`` if ``A`` is on
-      the right. Must be positive.
-
-   b
-      Buffer holding input matrix ``B``. Must have size at least
-      ``ldb``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldb``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for
-      more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      The buffer holding the input/output matrix ``C``. It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by ``alpha``\ \*\ ``A``\ \*\ ``B`` +
-      ``beta``\ \*\ ``C`` (``left_right`` = ``side::left``) or
-      ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C``
-      (``left_right`` = ``side::right``).
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized before
-   calling ``hemm``.
-
-      
-
-.. _onemkl_blas_hemm_usm:
-
-hemm (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event hemm(sycl::queue &queue,
-                        onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        const T* b,
-                        std::int64_t ldb,
-                        T beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event hemm(sycl::queue &queue,
-                        onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        const T* b,
-                        std::int64_t ldb,
-                        T beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` is on the left side of the
-      multiplication (``side::left``) or on the right side
-      (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Specifies the number of rows of the matrix ``B`` and ``C``.
-
-      The value of ``m`` must be at least zero.
-
-   n
-      Specifies the number of columns of the matrix ``B`` and ``C``.
-
-      The value of ``n`` must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Pointer to input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``A`` is on the left of the
-      multiplication, or ``lda``\ \*\ ``n`` if ``A`` is on the right.
-      See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if ``A`` is
-      on the left of the multiplication, or at least ``n`` if ``A``
-      is on the right. Must be positive.
-
-   b
-      Pointer to input matrix ``B``. Must have size at least
-      ``ldb``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldb``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for
-      more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      The pointer to input/output matrix ``C``. It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for more details.
-      
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by
-      ``alpha``\ \*\ ``A``\ \*\ ``B`` + ``beta``\ \*\ ``C``
-      (``left_right`` = ``side::left``) or
-      ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C``
-      (``left_right`` = ``side::right``).
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized
-   before calling ``hemm``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/hemv.rst b/docs/domains/blas/hemv.rst
deleted file mode 100644
index cc959ba95..000000000
--- a/docs/domains/blas/hemv.rst
+++ /dev/null
@@ -1,232 +0,0 @@
-.. _onemkl_blas_hemv:
-
-hemv
-====
-
-Computes a matrix-vector product using a Hermitian matrix.
-
-.. _onemkl_blas_hemv_description:
-
-.. rubric:: Description
-
-The ``hemv`` routines compute a scalar-matrix-vector product and add the
-result to a scalar-vector product, with a Hermitian matrix. The
-operation is defined as
-
-.. math::
-
-      y \leftarrow alpha*A*x + beta*y 
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``n``-by-``n`` Hermitian matrix,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``hemv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_hemv_buffer:
-
-hemv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void hemv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void hemv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Buffer holding input/output vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-      
-
-.. _onemkl_blas_hemv_usm:
-
-hemv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event hemv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event hemv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Pointer to input/output vector ``y``. The array holding
-      input/output vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/her.rst b/docs/domains/blas/her.rst
deleted file mode 100644
index ed243f62d..000000000
--- a/docs/domains/blas/her.rst
+++ /dev/null
@@ -1,205 +0,0 @@
-.. _onemkl_blas_her:
-
-her
-===
-
-Computes a rank-1 update of a Hermitian matrix.
-
-.. _onemkl_blas_her_description:
-
-.. rubric:: Description
-
-The ``her`` routines compute a scalar-vector-vector product and add the
-result to a Hermitian matrix. The operation is defined as:
-
-.. math::
-      
-      A \leftarrow alpha*x*x^H + A
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``n``-by-``n`` Hermitian matrix,
-
-``x`` is a vector of length ``n``.
-
-``her`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_her_buffer:
-
-her (Buffer Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void her(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a,
-                std::int64_t lda)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void her(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a,
-                std::int64_t lda)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\  \= ``upper`` or the updated
-      lower triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \ =\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-
-.. _onemkl_blas_her_usm:
-
-her (USM Version)
------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event her(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       std::int64_t lda,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event her(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       std::int64_t lda,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether *A* is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper`` or the updated
-      lower triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/her2.rst b/docs/domains/blas/her2.rst
deleted file mode 100644
index f9272adae..000000000
--- a/docs/domains/blas/her2.rst
+++ /dev/null
@@ -1,231 +0,0 @@
-.. _onemkl_blas_her2:
-
-her2
-====
-
-Computes a rank-2 update of a Hermitian matrix.
-
-.. _onemkl_blas_her2_description:
-
-.. rubric:: Description
-
-The ``her2`` routines compute two scalar-vector-vector products and add
-them to a Hermitian matrix. The operation is defined as:
-
-.. math::
-
-      A \leftarrow alpha*x*y^H + conjg(alpha)*y*x^H + A
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``n``-by-``n`` Hermitian matrix,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``her2`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_her2_buffer:
-
-her2 (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void her2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void her2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector products.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated
-      lower triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-      
-
-.. _onemkl_blas_her2_usm:
-
-her2 (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event her2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event her2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector products.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding input vector
-      ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated
-      lower triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/her2k.rst b/docs/domains/blas/her2k.rst
deleted file mode 100644
index 6f03052da..000000000
--- a/docs/domains/blas/her2k.rst
+++ /dev/null
@@ -1,397 +0,0 @@
-.. _onemkl_blas_her2k:
-
-her2k
-=====
-
-Performs a Hermitian rank-2k update.
-
-.. _onemkl_blas_her2k_description:
-
-.. rubric:: Description
-
-The ``her2k`` routines perform a rank-2k update of an ``n`` x ``n``
-Hermitian matrix ``C`` by general matrices ``A`` and ``B``. 
-
-If ``trans`` = ``transpose::nontrans``, the operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*A*B^H + conjg(alpha)*B*A^H + beta*C
-
-where ``A`` is ``n`` x ``k`` and ``B`` is ``k`` x ``n``.
-
-If ``trans`` = ``transpose::conjtrans``, the operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*B*A^H + conjg(alpha)*A*B^H + beta*C
-
-where ``A`` is ``k`` x ``n`` and ``B`` is ``n`` x ``k``.
-
-In both cases:
-
-``alpha`` is a complex scalar and ``beta`` is a real scalar.
-
-``C`` is a Hermitian matrix and ``A`` , ``B`` are general matrices.
-
-The inner dimension of both matrix multiplications is ``k``.
-
-``her2k`` supports the following precisions:
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_real 
-      * -  ``std::complex<float>`` 
-        -  ``float`` 
-      * -  ``std::complex<double>`` 
-        -  ``double`` 
-
-.. _onemkl_blas_her2k_buffer:
-
-her2k (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void her2k(sycl::queue &queue,
-                  onemkl::uplo upper_lower,
-                  onemkl::transpose trans,
-                  std::int64_t n,
-                  std::int64_t k,
-                  T alpha,
-                  sycl::buffer<T,1> &a,
-                  std::int64_t lda,
-                  sycl::buffer<T,1> &b,
-                  std::int64_t ldb,
-                  T_real beta,
-                  sycl::buffer<T,1> &c,
-                  std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void her2k(sycl::queue &queue,
-                  onemkl::uplo upper_lower,
-                  onemkl::transpose trans,
-                  std::int64_t n,
-                  std::int64_t k,
-                  T alpha,
-                  sycl::buffer<T,1> &a,
-                  std::int64_t lda,
-                  sycl::buffer<T,1> &b,
-                  std::int64_t ldb,
-                  T_real beta,
-                  sycl::buffer<T,1> &c,
-                  std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies the operation to apply, as described above. Supported
-      operations are ``transpose::nontrans`` and
-      ``transpose::conjtrans``.
-
-   n
-      The number of rows and columns in ``C``. The value of ``n`` must
-      be at least zero.
-
-   k
-      The inner dimension of matrix multiplications. The value of ``k``
-      must be at least equal to zero.
-
-   alpha
-      Complex scaling factor for the rank-2k update.
-
-   a
-      Buffer holding input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is an ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is an ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-
-      See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-                
-   b
-      Buffer holding input matrix ``B``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``B`` is an ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``
-         * - Row major
-           - ``B`` is an ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-
-      See :ref:`matrix-storage`
-      for more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-             
-   beta
-      Real scaling factor for matrix ``C``.
-      
-   c
-      Buffer holding input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least ``n``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by the updated ``C`` matrix.
-
-
-.. _onemkl_blas_her2k_usm:
-
-her2k (USM Version)
--------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event her2k(sycl::queue &queue,
-                         onemkl::uplo upper_lower,
-                         onemkl::transpose trans,
-                         std::int64_t n,
-                         std::int64_t k,
-                         T alpha,
-                         const T* a,
-                         std::int64_t lda,
-                         const T* b,
-                         std::int64_t ldb,
-                         T_real beta,
-                         T* c,
-                         std::int64_t ldc,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event her2k(sycl::queue &queue,
-                         onemkl::uplo upper_lower,
-                         onemkl::transpose trans,
-                         std::int64_t n,
-                         std::int64_t k,
-                         T alpha,
-                         const T* a,
-                         std::int64_t lda,
-                         const T* b,
-                         std::int64_t ldb,
-                         T_real beta,
-                         T* c,
-                         std::int64_t ldc,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies the operation to apply, as described above. Supported
-      operations are ``transpose::nontrans`` and
-      ``transpose::conjtrans``.
-
-   n
-      The number of rows and columns in ``C``. The value of ``n``
-      must be at least zero.
-
-   k
-      The inner dimension of matrix multiplications. The value of
-      ``k`` must be at least equal to zero.
-
-   alpha
-      Complex scaling factor for the rank-2k update.
-
-   a
-      Pointer to input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is an ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is an ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-   
-   b
-      Pointer to input matrix ``B``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``B`` is an ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``
-         * - Row major
-           - ``B`` is an ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-   
-      See :ref:`matrix-storage` for
-      more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-         * - Row major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-
-   beta
-      Real scaling factor for matrix ``C``.
-
-   c
-      Pointer to input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least
-      ``n``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by the updated ``C``
-      matrix.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/herk.rst b/docs/domains/blas/herk.rst
deleted file mode 100644
index e4d4d16a9..000000000
--- a/docs/domains/blas/herk.rst
+++ /dev/null
@@ -1,309 +0,0 @@
-.. _onemkl_blas_herk:
-
-herk
-====
-
-Performs a Hermitian rank-k update.
-
-.. _onemkl_blas_herk_description:
-
-.. rubric:: Description
-
-The ``herk`` routines compute a rank-k update of a Hermitian matrix
-``C`` by a general matrix ``A``. The operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*op(A)*op(A)^H + beta*C
-
-where:
-
-op(``X``) is one of op(``X``) = ``X`` or op(``X``) = ``X``\ :sup:`H`,
-
-``alpha`` and ``beta`` are real scalars,
-
-``C`` is a Hermitian matrix and ``A`` is a general matrix.
-
-Here op(``A``) is ``n`` x ``k``, and ``C`` is ``n`` x ``n``.
-
-``herk`` supports the following precisions:
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_real 
-      * -  ``std::complex<float>`` 
-        -  ``float`` 
-      * -  ``std::complex<double>`` 
-        -  ``double`` 
-
-.. _onemkl_blas_herk_buffer:
-
-herk (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void herk(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T_real alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 T_real beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void herk(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T_real alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 T_real beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``. See
-      :ref:`onemkl_datatypes` for more
-      details. Supported operations are ``transpose::nontrans`` and
-      ``transpose::conjtrans``.
-
-   n
-      The number of rows and columns in ``C``. The value of ``n`` must be
-      at least zero.
-
-   k
-      Number of columns in op(``A``).
-
-      The value of ``k`` must be at least zero.
-
-   alpha
-      Real scaling factor for the rank-k update.
-
-   a
-      Buffer holding input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-
-      See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   beta
-      Real scaling factor for matrix ``C``.
-
-   c
-      Buffer holding input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least ``n``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      The output buffer, overwritten by
-      ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`H` + ``beta``\ \*\ ``C``.
-      The imaginary parts of the diagonal elements are set to zero.
-
-      
-
-.. _onemkl_blas_herk_usm:
-
-herk (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event herk(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T_real alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T_real beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event herk(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T_real alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T_real beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details. Supported operations are ``transpose::nontrans``
-      and ``transpose::conjtrans``.
-
-   n
-      The number of rows and columns in ``C``. The value of ``n`` must
-      be at least zero.
-
-   k
-      Number of columns in op(``A``).
-
-      The value of ``k`` must be at least zero.
-
-   alpha
-      Real scaling factor for the rank-k update.
-
-   a
-      Pointer to input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   beta
-      Real scaling factor for matrix ``C``.
-
-   c
-      Pointer to input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least
-      ``n``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by
-      ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`H` +
-      ``beta``\ \*\ ``C``. The imaginary parts of the diagonal
-      elements are set to zero.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/hpmv.rst b/docs/domains/blas/hpmv.rst
deleted file mode 100644
index 17872b2b2..000000000
--- a/docs/domains/blas/hpmv.rst
+++ /dev/null
@@ -1,228 +0,0 @@
-.. _onemkl_blas_hpmv:
-
-hpmv
-====
-
-Computes a matrix-vector product using a Hermitian packed matrix.
-
-.. _onemkl_blas_hpmv_description:
-
-.. rubric:: Description
-
-The ``hpmv`` routines compute a scalar-matrix-vector product and add the
-result to a scalar-vector product, with a Hermitian packed matrix.
-The operation is defined as
-
-.. math::
-
-      y \leftarrow alpha*A*x + beta*y
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``n``-by-``n`` Hermitian matrix supplied in packed form,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``hpmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_hpmv_buffer:
-
-hpmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void hpmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void hpmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n``\ +1))/2. See :ref:`matrix-storage` for
-      more details.
-
-      The imaginary parts of the diagonal elements need not be set and
-      are assumed to be zero.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Buffer holding input/output vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-      
-
-.. _onemkl_blas_hpmv_usm:
-
-hpmv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event hpmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event hpmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-      The imaginary parts of the diagonal elements need not be set
-      and are assumed to be zero.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Pointer to input/output vector ``y``. The array holding
-      input/output vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hpr.rst b/docs/domains/blas/hpr.rst
deleted file mode 100644
index a7fd49cde..000000000
--- a/docs/domains/blas/hpr.rst
+++ /dev/null
@@ -1,201 +0,0 @@
-.. _onemkl_blas_hpr:
-
-hpr
-===
-
-Computes a rank-1 update of a Hermitian packed matrix.
-
-.. _onemkl_blas_hpr_description:
-
-.. rubric:: Description
-
-The ``hpr`` routines compute a scalar-vector-vector product and add the
-result to a Hermitian packed matrix. The operation is defined as
-
-.. math::
-
-      A \leftarrow alpha*x*x^H + A
-
-where:
-
-``alpha`` is scalar,
-
-``A`` is an ``n``-by-``n`` Hermitian matrix, supplied in packed form,
-
-``x`` is a vector of length ``n``.
-
-``hpr`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_hpr_buffer:
-
-hpr (Buffer Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void hpr(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void hpr(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the rank-1 update.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n``\ +1))/2. See :ref:`matrix-storage` for
-      more details.
-
-      The imaginary parts of the diagonal elements need not be set and
-      are assumed to be zero.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-
-.. _onemkl_blas_hpr_usm:
-
-hpr (USM Version)
------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event hpr(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event hpr(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the rank-1 update.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-      The imaginary parts of the diagonal elements need not be set and
-      are assumed to be zero.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/hpr2.rst b/docs/domains/blas/hpr2.rst
deleted file mode 100644
index 3e3fa4a63..000000000
--- a/docs/domains/blas/hpr2.rst
+++ /dev/null
@@ -1,226 +0,0 @@
-.. _onemkl_blas_hpr2:
-
-hpr2
-====
-
-Performs a rank-2 update of a Hermitian packed matrix.
-
-.. _onemkl_blas_hpr2_description:
-
-.. rubric:: Description
-
-The ``hpr2`` routines compute two scalar-vector-vector products and add
-them to a Hermitian packed matrix. The operation is defined as
-
-.. math::
-
-      A \leftarrow alpha*x*y^H + conjg(alpha)*y*x^H + A
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``n``-by-``n`` Hermitian matrix, supplied in packed form,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``hpr2`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_hpr2_buffer:
-
-hpr2 (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void hpr2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void hpr2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the rank-2 update.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n``\ +1))/2. See :ref:`matrix-storage` for
-      more details.
-
-      The imaginary parts of the diagonal elements need not be set and
-      are assumed to be zero.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-      
-
-.. _onemkl_blas_hpr2_usm:
-
-hpr2 (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event hpr2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event hpr2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the rank-2 update.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding
-      input vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-      The imaginary parts of the diagonal elements need not be set
-      and are assumed to be zero.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the Hermitian
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the Hermitian matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      The imaginary parts of the diagonal elements are set to zero.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/iamax.rst b/docs/domains/blas/iamax.rst
deleted file mode 100644
index ce02af8de..000000000
--- a/docs/domains/blas/iamax.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _onemkl_blas_iamax:
-
-iamax
-=====
-
-Finds the index of the element with the largest absolute value in a vector.
-
-.. _onemkl_blas_iamax_description:
-
-.. rubric:: Description
-
-The ``iamax`` routines return an index ``i`` such that ``x[i]``
-has the maximum absolute value of all elements in vector ``x`` (real
-variants), or such that (\|Re(``x[i]``)\| + \|Im(``x[i]``)\|) is maximal
-(complex variants).
-
-If either ``n`` or ``incx`` is not positive, the routine returns
-``0``.
-
-If more than one vector element is found with the same largest
-absolute value, the index of the first one encountered is returned.
-
-If the vector contains ``NaN`` values, then the routine returns the
-index of the first ``NaN``.
-
-``iamax`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. container:: Note
-
-   .. rubric:: Note
-      :class: NoteTipHead
-
-   The index is zero-based.
-
-.. _onemkl_blas_iamax_buffer:
-
-iamax (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void iamax(sycl::queue &queue,
-                  std::int64_t n,
-                  sycl::buffer<T,
-                  1> &x,
-                  std::int64_t incx,
-                  sycl::buffer<std::int64_t,
-                  1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void iamax(sycl::queue &queue,
-                  std::int64_t n,
-                  sycl::buffer<T,
-                  1> &x,
-                  std::int64_t incx,
-                  sycl::buffer<std::int64_t,
-                  1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      The number of elements in vector ``x``.
-
-   x
-      The buffer that holds the input vector ``x``. The buffer must be
-      of size at least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage`
-      for more details.
-
-   incx
-      The stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      The buffer where the zero-based index ``i`` of the maximal element
-      is stored.
-
-
-.. _onemkl_blas_iamax_usm:
-
-iamax (USM Version)
--------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event iamax(sycl::queue &queue,
-                         std::int64_t n,
-                         const T *x,
-                         std::int64_t incx,
-                         T_res *result,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event iamax(sycl::queue &queue,
-                         std::int64_t n,
-                         const T *x,
-                         std::int64_t incx,
-                         T_res *result,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      The number of elements in vector ``x``.
-
-   x
-      The pointer to the input vector ``x``. The array holding the
-      input vector ``x`` must be of size at least (1 + (``n`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      The stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      The pointer to where the zero-based index ``i`` of the maximal
-      element is stored.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/iamin.rst b/docs/domains/blas/iamin.rst
deleted file mode 100644
index fb724cc47..000000000
--- a/docs/domains/blas/iamin.rst
+++ /dev/null
@@ -1,160 +0,0 @@
-.. _onemkl_blas_iamin:
-
-iamin
-=====
-
-Finds the index of the element with the smallest absolute value.
-
-.. _onemkl_blas_iamin_description:
-
-.. rubric:: Description
-
-The ``iamin`` routines return an index ``i`` such that ``x[i]`` has
-the minimum absolute value of all elements in vector ``x`` (real
-variants), or such that (\|Re(``x[i]``)\| + \|Im(``x[i]``)\|) is minimal
-(complex variants).
-
-If either ``n`` or ``incx`` is not positive, the routine returns
-``0``.
-
-If more than one vector element is found with the same smallest
-absolute value, the index of the first one encountered is returned.
-
-If the vector contains ``NaN`` values, then the routine returns the
-index of the first ``NaN``.
-
-``iamin`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. container:: Note
-
-   .. rubric:: Note
-      :class: NoteTipHead
-
-   The index is zero-based.
-
-.. _onemkl_blas_iamin_buffer:
-
-iamin (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void iamin(sycl::queue &queue,
-                  std::int64_t n,
-                  sycl::buffer<T,1> &x,
-                  std::int64_t incx,
-                  sycl::buffer<std::int64_t,1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void iamin(sycl::queue &queue,
-                  std::int64_t n,
-                  sycl::buffer<T,1> &x,
-                  std::int64_t incx,
-                  sycl::buffer<std::int64_t,1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Buffer where the zero-based index ``i`` of the minimum element
-      will be stored.
-
-
-.. _onemkl_blas_iamin_usm:
-
-iamin (USM Version)
--------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event iamin(sycl::queue &queue,
-                         std::int64_t n,
-                         const T *x,
-                         std::int64_t incx,
-                         std::int64_t *result,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event iamin(sycl::queue &queue,
-                         std::int64_t n,
-                         const T *x,
-                         std::int64_t incx,
-                         std::int64_t *result,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      The pointer to input vector ``x``. The array holding input
-      vector ``x`` must be of size at least (1 + (``n`` -
-      1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Pointer to where the zero-based index ``i`` of the minimum
-      element will be stored.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/nrm2.rst b/docs/domains/blas/nrm2.rst
deleted file mode 100644
index 879862c73..000000000
--- a/docs/domains/blas/nrm2.rst
+++ /dev/null
@@ -1,158 +0,0 @@
-.. _onemkl_blas_nrm2:
-
-nrm2
-====
-
-Computes the Euclidean norm of a vector.
-
-.. _onemkl_blas_nrm2_description:
-
-.. rubric:: Description
-
-The ``nrm2`` routines compute the Euclidean norm of a vector:
-
-.. math:: 
-   
-      result = \| x\|   
-
-where:
-
-``x`` is a vector of ``n`` elements.
-
-``nrm2`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_res 
-      * -  ``float`` 
-        -  ``float`` 
-      * -  ``double`` 
-        -  ``double`` 
-      * -  ``std::complex<float>`` 
-        -  ``float`` 
-      * -  ``std::complex<double>`` 
-        -  ``double`` 
-
-.. _onemkl_blas_nrm2_buffer:
-
-nrm2 (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void nrm2(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T_res,1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void nrm2(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T_res,1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Buffer where the Euclidean norm of the vector ``x`` will be
-      stored.
-
-
-.. _onemkl_blas_nrm2_usm:
-
-nrm2 (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event nrm2(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        T_res *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event nrm2(sycl::queue &queue,
-                        std::int64_t n,
-                        const T *x,
-                        std::int64_t incx,
-                        T_res *result,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Pointer to where the Euclidean norm of the vector ``x`` will be
-      stored.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/rot.rst b/docs/domains/blas/rot.rst
deleted file mode 100644
index 736db3b4d..000000000
--- a/docs/domains/blas/rot.rst
+++ /dev/null
@@ -1,208 +0,0 @@
-.. _onemkl_blas_rot:
-
-rot
-===
-
-Performs rotation of points in the plane.
-
-.. _onemkl_blas_rot_description:
-
-.. rubric:: Description
-
-Given two vectors ``x`` and ``y`` of ``n`` elements, the ``rot`` routines
-compute four scalar-vector products and update the input vectors with
-the sum of two of these scalar-vector products as follows:
-
-.. math::
-  
-   \left[\begin{array}{c}
-      x\\y
-   \end{array}\right]
-   \leftarrow
-   \left[\begin{array}{c}
-      \phantom{-}c*x + s*y\\
-      -s*x + c*y
-   \end{array}\right]
-
-``rot`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_scalar 
-      * -  ``float`` 
-        -  ``float`` 
-      * -  ``double`` 
-        -  ``double`` 
-      * -  ``std::complex<float>`` 
-        -  ``float`` 
-      * -  ``std::complex<double>`` 
-        -  ``double`` 
-
-.. _onemkl_blas_rot_buffer:
-
-rot (Buffer Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void rot(sycl::queue &queue,
-                std::int64_t n,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &y,
-                std::int64_t incy,
-                T_scalar c,
-                T_scalar s)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void rot(sycl::queue &queue,
-                std::int64_t n,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &y,
-                std::int64_t incy,
-                T_scalar c,
-                T_scalar s)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   c
-      Scaling factor.
-
-   s
-      Scaling factor.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the updated vector ``x``.
-
-   y
-      Buffer holding the updated vector ``y``.
-
-      
-
-.. _onemkl_blas_rot_usm:
-
-rot (USM Version)
------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event rot(sycl::queue &queue,
-                       std::int64_t n,
-                       T *x,
-                       std::int64_t incx,
-                       T *y,
-                       std::int64_t incy,
-                       T_scalar c,
-                       T_scalar s,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event rot(sycl::queue &queue,
-                       std::int64_t n,
-                       T *x,
-                       std::int64_t incx,
-                       T *y,
-                       std::int64_t incy,
-                       T_scalar c,
-                       T_scalar s,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding input vector
-      ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   c
-      Scaling factor.
-
-   s
-      Scaling factor.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the updated vector ``x``.
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/rotg.rst b/docs/domains/blas/rotg.rst
deleted file mode 100644
index 18a065f0e..000000000
--- a/docs/domains/blas/rotg.rst
+++ /dev/null
@@ -1,175 +0,0 @@
-.. _onemkl_blas_rotg:
-
-rotg
-====
-
-Computes the parameters for a Givens rotation.
-
-.. _onemkl_blas_rotg_description:
-
-.. rubric:: Description
-
-Given the Cartesian coordinates ``(a, b)`` of a point, the ``rotg``
-routines return the parameters ``c``, ``s``, ``r``, and ``z``
-associated with the Givens rotation. The parameters ``c`` and ``s``
-define a unitary matrix such that:
-
-.. math::
-      
-      \begin{bmatrix}c & s \\ -s & c\end{bmatrix} \cdot
-      \begin{bmatrix}a \\ b\end{bmatrix}
-      =\begin{bmatrix}r \\ 0\end{bmatrix} 
-
-The parameter ``z`` is defined such that if \|\ ``a``\ \| >
-\|\ ``b``\ \|, ``z`` is ``s``; otherwise if ``c`` is not 0 ``z`` is
-1/``c``; otherwise ``z`` is 1.
-
-``rotg`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_real 
-      * -  ``float`` 
-        -  ``float`` 
-      * -  ``double`` 
-        -  ``double`` 
-      * -  ``std::complex<float>`` 
-        -  ``float`` 
-      * -  ``std::complex<double>`` 
-        -  ``double`` 
-
-.. _onemkl_blas_rotg_buffer:
-
-rotg (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void rotg(sycl::queue &queue,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &b,
-                 sycl::buffer<T_real,1> &c,
-                 sycl::buffer<T,1> &s)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void rotg(sycl::queue &queue,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &b,
-                 sycl::buffer<T_real,1> &c,
-                 sycl::buffer<T,1> &s)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   a
-      Buffer holding the ``x``-coordinate of the point.
-
-   b
-      Buffer holding the ``y``-coordinate of the point.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the parameter ``r`` associated with the Givens
-      rotation.
-
-   b
-      Buffer holding the parameter ``z`` associated with the Givens
-      rotation.
-
-   c
-      Buffer holding the parameter ``c`` associated with the Givens
-      rotation.
-
-   s
-      Buffer holding the parameter ``s`` associated with the Givens
-      rotation.
-
-
-.. _onemkl_blas_rotg_usm:
-
-rotg (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event rotg(sycl::queue &queue,
-                        T *a,
-                        T *b,
-                        T_real *c,
-                        T *s,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event rotg(sycl::queue &queue,
-                        T *a,
-                        T *b,
-                        T_real *c,
-                        T *s,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   a
-      Pointer to the ``x``-coordinate of the point.
-
-   b
-      Pointer to the ``y``-coordinate of the point.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the parameter ``r`` associated with the Givens
-      rotation.
-
-   b
-      Pointer to the parameter ``z`` associated with the Givens
-      rotation.
-
-   c
-      Pointer to the parameter ``c`` associated with the Givens
-      rotation.
-
-   s
-      Pointer to the parameter ``s`` associated with the Givens
-      rotation.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/rotm.rst b/docs/domains/blas/rotm.rst
deleted file mode 100644
index da9a40c95..000000000
--- a/docs/domains/blas/rotm.rst
+++ /dev/null
@@ -1,266 +0,0 @@
-.. _onemkl_blas_rotm:
-
-rotm
-====
-
-Performs modified Givens rotation of points in the plane.
-
-.. _onemkl_blas_rotm_description:
-
-.. rubric:: Description
-
-Given two vectors ``x`` and ``y``, each vector element of these
-vectors is replaced as follows:
-
-.. math::
-
-      \begin{bmatrix}x_i \\ y_i\end{bmatrix}=
-      H
-      \begin{bmatrix}x_i \\ y_i\end{bmatrix} 
-
-for ``i`` from 1 to ``n``, where ``H`` is a modified Givens
-transformation matrix.
-
-``rotm`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_rotm_buffer:
-
-rotm (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void rotm(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &param)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void rotm(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &param)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   param
-      Buffer holding an array of size 5.
-
-      The elements of the ``param`` array are:
-
-      ``param[0]`` contains a switch, ``flag``. The other array elements
-      ``param[1-4]`` contain the components of the modified Givens 
-      transformation matrix ``H``:
-      h\ :sub:`11`, h\ :sub:`21`, h\ :sub:`12`, and
-      h\ :sub:`22`, respectively.
-
-      Depending on the values of ``flag``, the components of ``H``
-      are set as follows:
-
-      | ``flag = -1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & h_{12} \\ h_{21} & h_{22}\end{bmatrix} 
-
-      | ``flag = 0.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & h_{12} \\ h_{21} & 1.0\end{bmatrix} 
-
-      | ``flag = 1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & 1.0 \\ -1.0 & h_{22}\end{bmatrix} 
-
-      | ``flag = -2.0``:
-      
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & 0.0 \\ 0.0 & 1.0\end{bmatrix} 
-
-      In the last three cases, the matrix entries of 1.0, -1.0, and 0.0
-      are assumed based on the value of ``flag`` and are not required to
-      be set in the ``param`` vector.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the updated vector ``x``.
-
-   y
-      Buffer holding the updated vector ``y``.
-
-      
-
-.. _onemkl_blas_rotm_usm:
-
-rotm (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event rotm(sycl::queue &queue,
-                        std::int64_t n,
-                        T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        T *param,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event rotm(sycl::queue &queue,
-                        std::int64_t n,
-                        T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        T *param,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-   
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Pointer to the input vector ``x``. The array holding the vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to the input vector ``y``. The array holding the vector
-      ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   param
-      Pointer to an array of size 5.
-
-      The elements of the ``param`` array are:
-
-      ``param[0]`` contains a switch, ``flag``. The other array elements
-      ``param[1-4]`` contain the components of the modified Givens 
-      transformation matrix ``H``:
-      h\ :sub:`11`, h\ :sub:`21`, h\ :sub:`12`, and
-      h\ :sub:`22`, respectively.
-
-      Depending on the values of ``flag``, the components of ``H``
-      are set as follows:
-
-      | ``flag = -1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & h_{12} \\ h_{21} & h_{22}\end{bmatrix} 
-
-      | ``flag = 0.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & h_{12} \\ h_{21} & 1.0\end{bmatrix} 
-
-      | ``flag = 1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & 1.0 \\ -1.0 & h_{22}\end{bmatrix} 
-
-      | ``flag = -2.0``:
-      
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & 0.0 \\ 0.0 & 1.0\end{bmatrix} 
-
-      In the last three cases, the matrix entries of 1.0, -1.0, and 0.0
-      are assumed based on the value of ``flag`` and are not required to
-      be set in the ``param`` vector.
-   
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the updated array ``x``.
-
-   y
-      Pointer to the updated array ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/rotmg.rst b/docs/domains/blas/rotmg.rst
deleted file mode 100644
index 49a16ff00..000000000
--- a/docs/domains/blas/rotmg.rst
+++ /dev/null
@@ -1,257 +0,0 @@
-.. _onemkl_blas_rotmg:
-
-rotmg
-=====
-
-Computes the parameters for a modified Givens rotation.
-
-.. _onemkl_blas_rotmg_description:
-
-.. rubric:: Description
-
-Given Cartesian coordinates (``x1``, ``y1``) of an
-input vector, the ``rotmg`` routines compute the components of a modified
-Givens transformation matrix ``H`` that zeros the ``y``-component of
-the resulting vector:
-
-.. math::
-
-      \begin{bmatrix}x1 \\ 0\end{bmatrix}=
-      H
-      \begin{bmatrix}x1\sqrt{d1} \\ y1\sqrt{d2}\end{bmatrix} 
-      
-``rotmg`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_rotmg_buffer:
-
-rotmg (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void rotmg(sycl::queue &queue,
-                  sycl::buffer<T,1> &d1,
-                  sycl::buffer<T,1> &d2,
-                  sycl::buffer<T,1> &x1,
-                  T y1,
-                  sycl::buffer<T,1> &param)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void rotmg(sycl::queue &queue,
-                  sycl::buffer<T,1> &d1,
-                  sycl::buffer<T,1> &d2,
-                  sycl::buffer<T,1> &x1,
-                  T y1,
-                  sycl::buffer<T,1> &param)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   d1
-      Buffer holding the scaling factor for the ``x``-coordinate of the
-      input vector.
-
-   d2
-      Buffer holding the scaling factor for the ``y``-coordinate of the
-      input vector.
-
-   x1
-      Buffer holding the ``x``-coordinate of the input vector.
-
-   y1
-      Scalar specifying the ``y``-coordinate of the input vector.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   d1
-      Buffer holding the first diagonal element of the updated matrix.
-
-   d2
-      Buffer holding the second diagonal element of the updated matrix.
-
-   x1
-      Buffer holding the ``x``-coordinate of the rotated vector before
-      scaling.
-
-   param
-      Buffer holding an array of size 5.
-
-      The elements of the ``param`` array are:
-
-      ``param[0]`` contains a switch, ``flag``. The other array elements
-      ``param[1-4]`` contain the components of the modified Givens 
-      transformation matrix ``H``:
-      h\ :sub:`11`, h\ :sub:`21`, h\ :sub:`12`, and
-      h\ :sub:`22`, respectively.
-
-      Depending on the values of ``flag``, the components of ``H`` are
-      set as follows:
-
-      | ``flag = -1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & h_{12} \\ h_{21} & h_{22}\end{bmatrix} 
-
-      | ``flag = 0.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & h_{12} \\ h_{21} & 1.0\end{bmatrix} 
-
-      | ``flag = 1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & 1.0 \\ -1.0 & h_{22}\end{bmatrix} 
-
-      | ``flag = -2.0``:
-      
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & 0.0 \\ 0.0 & 1.0\end{bmatrix} 
-
-      In the last three cases, the matrix entries of 1.0, -1.0, and 0.0
-      are assumed based on the value of ``flag`` and are not required to
-      be set in the ``param`` vector.
-
-      
-
-.. _onemkl_blas_rotmg_usm:
-
-rotmg (USM Version)
--------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event rotmg(sycl::queue &queue,
-                         T *d1,
-                         T *d2,
-                         T *x1,
-                         T y1,
-                         T *param,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event rotmg(sycl::queue &queue,
-                         T *d1,
-                         T *d2,
-                         T *x1,
-                         T y1,
-                         T *param,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   d1
-      Pointer to the scaling factor for the ``x``-coordinate of the
-      input vector.
-
-   d2
-      Pointer to the scaling factor for the ``y``-coordinate of the
-      input vector.
-
-   x1
-      Pointer to the ``x``-coordinate of the input vector.
-
-   y1
-      Scalar specifying the ``y``-coordinate of the input vector.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   d1
-      Pointer to the first diagonal element of the updated matrix.
-
-   d2
-      Pointer to the second diagonal element of the updated matrix.
-
-   x1
-      Pointer to the ``x``-coordinate of the rotated vector before
-      scaling.
-
-   param
-      Pointer to an array of size 5.
-
-      The elements of the ``param`` array are:
-
-      ``param[0]`` contains a switch, ``flag``. The other array elements
-      ``param[1-4]`` contain the components of the modified Givens 
-      transformation matrix ``H``:
-      h\ :sub:`11`, h\ :sub:`21`, h\ :sub:`12`, and
-      h\ :sub:`22`, respectively.
-
-      Depending on the values of ``flag``, the components of ``H``
-      are set as follows:
-
-      | ``flag = -1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & h_{12} \\ h_{21} & h_{22}\end{bmatrix} 
-
-      | ``flag = 0.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & h_{12} \\ h_{21} & 1.0\end{bmatrix} 
-
-      | ``flag = 1.0``:
-
-      .. math::
-   
-         H=\begin{bmatrix}h_{11} & 1.0 \\ -1.0 & h_{22}\end{bmatrix} 
-
-      | ``flag = -2.0``:
-      
-      .. math::
-   
-         H=\begin{bmatrix}1.0 & 0.0 \\ 0.0 & 1.0\end{bmatrix} 
-
-      In the last three cases, the matrix entries of 1.0, -1.0, and 0.0
-      are assumed based on the value of ``flag`` and are not required to
-      be set in the ``param`` vector.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/sbmv.rst b/docs/domains/blas/sbmv.rst
deleted file mode 100644
index a0c071f3c..000000000
--- a/docs/domains/blas/sbmv.rst
+++ /dev/null
@@ -1,244 +0,0 @@
-.. _onemkl_blas_sbmv:
-
-sbmv
-====
-
-Computes a matrix-vector product with a symmetric band matrix.
-
-.. _onemkl_blas_sbmv_description:
-
-.. rubric:: Description
-
-The ``sbmv`` routines compute a scalar-matrix-vector product and add the
-result to a scalar-vector product, with a symmetric band matrix. The
-operation is defined as:
-
-.. math::
-
-      y \leftarrow alpha*A*x + beta*y
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``n``-by-``n`` symmetric matrix with ``k``
-super-diagonals,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``sbmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_sbmv_buffer:
-
-sbmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void sbmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void sbmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of super-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` + 1),
-      and positive.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Buffer holding input/output vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_sbmv_usm:
-
-sbmv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event sbmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event sbmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of super-diagonals of the matrix ``A``. Must be at least
-      zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` +
-      1), and positive.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Pointer to input/output vector ``y``. The array holding
-      input/output vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/scal.rst b/docs/domains/blas/scal.rst
deleted file mode 100644
index 155b5fc49..000000000
--- a/docs/domains/blas/scal.rst
+++ /dev/null
@@ -1,162 +0,0 @@
-.. _onemkl_blas_scal:
-
-scal
-====
-
-Computes the product of a vector by a scalar.
-
-.. _onemkl_blas_scal_description:
-
-.. rubric:: Description
-
-The ``scal`` routines compute a scalar-vector product:
-
-.. math::
-
-      x \leftarrow alpha*x
-
-where:
-
-``x`` is a vector of ``n`` elements,
-
-``alpha`` is a scalar.
-
-``scal`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-        -  T_scalar 
-      * -  ``float`` 
-        -  ``float`` 
-      * -  ``double`` 
-        -  ``double`` 
-      * -  ``std::complex<float>`` 
-        -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-        -  ``std::complex<double>`` 
-      * -  ``std::complex<float>`` 
-        -  ``float`` 
-      * -  ``std::complex<double>`` 
-        -  ``double`` 
-
-.. _onemkl_blas_scal_buffer:
-
-scal (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void scal(sycl::queue &queue,
-                 std::int64_t n,
-                 T_scalar alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void scal(sycl::queue &queue,
-                 std::int64_t n,
-                 T_scalar alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   alpha
-      Specifies the scalar ``alpha``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding updated vector ``x``.
-
-
-.. _onemkl_blas_scal_usm:
-
-scal (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event scal(sycl::queue &queue,
-                        std::int64_t n,
-                        T_scalar alpha,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event scal(sycl::queue &queue,
-                        std::int64_t n,
-                        T_scalar alpha,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   alpha
-      Specifies the scalar ``alpha``.
-
-   x
-      Pointer to the input vector ``x``. The array must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the updated array ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/sdsdot.rst b/docs/domains/blas/sdsdot.rst
deleted file mode 100644
index 34d939c2f..000000000
--- a/docs/domains/blas/sdsdot.rst
+++ /dev/null
@@ -1,172 +0,0 @@
-.. _onemkl_blas_sdsdot:
-
-sdsdot
-======
-
-Computes a vector-vector dot product with double precision.
-
-.. _onemkl_blas_sdsdot_description:
-
-.. rubric:: Description
-
-The ``sdsdot`` routines perform a dot product between two vectors with
-double precision:
-
-.. math::
-
-   result = sb + \sum_{i=1}^{n}X_iY_i
-
-.. _onemkl_blas_sdsdot_buffer:
-
-sdsdot (Buffer Version)
------------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void sdsdot(sycl::queue &queue,
-                   std::int64_t n,
-                   float sb,
-                   sycl::buffer<float,1> &x,
-                   std::int64_t incx,
-                   sycl::buffer<float,1> &y,
-                   std::int64_t incy,
-                   sycl::buffer<float,1> &result)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void sdsdot(sycl::queue &queue,
-                   std::int64_t n,
-                   float sb,
-                   sycl::buffer<float,1> &x,
-                   std::int64_t incx,
-                   sycl::buffer<float,1> &y,
-                   std::int64_t incy,
-                   sycl::buffer<float,1> &result)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   sb
-      Single precision scalar to be added to the dot product.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size
-      at least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size
-      at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Buffer where the result (a scalar) will be stored. If ``n`` < 0
-      the result is ``sb``.
-
-
-.. _onemkl_blas_sdsdot_usm:
-
-sdsdot (USM Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event sdsdot(sycl::queue &queue,
-                          std::int64_t n,
-                          float sb,
-                          const float *x,
-                          std::int64_t incx,
-                          const float *y,
-                          std::int64_t incy,
-                          float *result,
-                          const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event sdsdot(sycl::queue &queue,
-                          std::int64_t n,
-                          float sb,
-                          const float *x,
-                          std::int64_t incx,
-                          const float *y,
-                          std::int64_t incy,
-                          float *result,
-                          const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vectors ``x`` and ``y``.
-
-   sb
-      Single precision scalar to be added to the dot product.
-
-   x
-      Pointer to the input vector ``x``. The array must be of size
-      at least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage`
-      for more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to the input vector ``y``. The array must be of size
-      at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if
-      any. If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   result
-      Pointer to where the result (a scalar) will be stored. If
-      ``n`` < 0 the result is ``sb``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/spmv.rst b/docs/domains/blas/spmv.rst
deleted file mode 100644
index 3ae196901..000000000
--- a/docs/domains/blas/spmv.rst
+++ /dev/null
@@ -1,220 +0,0 @@
-.. _onemkl_blas_spmv:
-
-spmv
-====
-
-Computes a matrix-vector product with a symmetric packed matrix.
-
-.. _onemkl_blas_spmv_description:
-
-.. rubric:: Description
-
-The ``spmv`` routines compute a scalar-matrix-vector product and add the
-result to a scalar-vector product, with a symmetric packed matrix.
-The operation is defined as:
-
-.. math::
-
-      y \leftarrow alpha*A*x + beta*y
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``n``-by-``n`` symmetric matrix, supplied in packed form,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``spmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_spmv_buffer:
-
-spmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void spmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void spmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n``\ +1))/2. See :ref:`matrix-storage` for
-      more details.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-   
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Buffer holding input/output vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_spmv_usm:
-
-spmv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event spmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event spmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-   
-.. container:: section
-      
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   beta
-      Scaling factor for vector ``y``.
-
-   y
-      Pointer to input/output vector ``y``. The array holding
-      input/output vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/spr.rst b/docs/domains/blas/spr.rst
deleted file mode 100644
index 2ef91546d..000000000
--- a/docs/domains/blas/spr.rst
+++ /dev/null
@@ -1,193 +0,0 @@
-.. _onemkl_blas_spr:
-
-spr
-===
-
-Performs a rank-1 update of a symmetric packed matrix.
-
-.. _onemkl_blas_spr_description:
-
-.. rubric:: Description
-
-The ``spr`` routines compute a scalar-vector-vector product and add the
-result to a symmetric packed matrix. The operation is defined as:
-
-.. math::
-
-      A \leftarrow alpha*x*x^T + A
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``n``-by-``n`` symmetric matrix, supplied in packed form,
-
-``x`` is a vector of length ``n``.
-
-``spr`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_spr_buffer:
-
-spr (Buffer Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void spr(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void spr(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n`` + 1))/2. See :ref:`matrix-storage` for
-      more details.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-      :class: sectiontitle
-
-   a
-      Buffer holding the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      
-
-.. _onemkl_blas_spr_usm:
-
-spr (USM Version)
------------------
-
-.. rubric:: Syntax
-         
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event spr(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event spr(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-   
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n`` + 1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-      
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/spr2.rst b/docs/domains/blas/spr2.rst
deleted file mode 100644
index 6cd195cbb..000000000
--- a/docs/domains/blas/spr2.rst
+++ /dev/null
@@ -1,213 +0,0 @@
-.. _onemkl_blas_spr2:
-
-spr2
-====
-
-Computes a rank-2 update of a symmetric packed matrix.
-
-.. _onemkl_blas_spr2_description:
-
-.. rubric:: Description
-
-The ``spr2`` routines compute two scalar-vector-vector products and add
-them to a symmetric packed matrix. The operation is defined as:
-
-.. math::
-
-      A \leftarrow alpha*x*y^T + alpha*y*x^T + A
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``n``-by-``n`` symmetric matrix, supplied in packed form,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``spr2`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_spr2_buffer:
-
-spr2 (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void spr2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void spr2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector products.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n`` + 1))/2. See :ref:`matrix-storage` for
-      more details.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper`` or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-
-.. _onemkl_blas_spr2_usm:
-
-spr2 (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event spr2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a, const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event spr2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a, const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector products.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding
-      input vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n`` + 1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper`` or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/swap.rst b/docs/domains/blas/swap.rst
deleted file mode 100644
index 79c2d4121..000000000
--- a/docs/domains/blas/swap.rst
+++ /dev/null
@@ -1,184 +0,0 @@
-.. _onemkl_blas_swap:
-
-swap
-====
-
-Swaps a vector with another vector.
-
-.. _onemkl_blas_swap_description:
-
-.. rubric:: Description
-
-Given two vectors of ``n`` elements, ``x`` and ``y``, the ``swap``
-routines return vectors ``y`` and ``x`` swapped, each replacing the
-other.
-
-.. math::
-
-   \left[\begin{array}{c}
-      y\\x
-   \end{array}\right]
-   \leftarrow
-   \left[\begin{array}{c}
-      x\\y
-   \end{array}\right]
-
-``swap`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_swap_buffer:
-
-swap (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void swap(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void swap(sycl::queue &queue,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the updated vector ``x``, that is, the input vector
-      ``y``.
-
-   y
-      Buffer holding the updated vector ``y``, that is, the input vector
-      ``x``.
-
-      
-
-.. _onemkl_blas_swap_usm:
-
-swap (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event swap(sycl::queue &queue,
-                        std::int64_t n,
-                        T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event swap(sycl::queue &queue,
-                        std::int64_t n,
-                        T *x,
-                        std::int64_t incx,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-   
-.. container:: section
-   
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   n
-      Number of elements in vector ``x``.
-
-   x
-      Pointer to the input vector ``x``. The array must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to the input vector ``y``. The array must be of size at
-      least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the updated array ``x``, that is, the input vector
-      ``y``.
-
-   y
-      Pointer to the updated array ``y``, that is, the input vector
-      ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-1-routines`
diff --git a/docs/domains/blas/symm.rst b/docs/domains/blas/symm.rst
deleted file mode 100644
index 26e16e499..000000000
--- a/docs/domains/blas/symm.rst
+++ /dev/null
@@ -1,311 +0,0 @@
-.. _onemkl_blas_symm:
-
-symm
-====
-
-Computes a matrix-matrix product where one input matrix is symmetric
-and one matrix is general.
-
-.. _onemkl_blas_symm_description:
-
-.. rubric:: Description
-
-The ``symm`` routines compute a scalar-matrix-matrix product and add the
-result to a scalar-matrix product, where one of the matrices in the
-multiplication is symmetric. The argument ``left_right`` determines
-if the symmetric matrix, ``A``, is on the left of the multiplication
-(``left_right`` = ``side::left``) or on the right (``left_right`` =
-``side::right``). Depending on ``left_right``, the operation is
-defined as:
-
-.. math::
-
-      C \leftarrow alpha*A*B + beta*C
-
-or
-
-.. math::
-
-      C \leftarrow alpha*B*A + beta*C
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is a symmetric matrix, either ``m``-by-``m`` or ``n``-by-``n``,
-
-``B`` and ``C`` are ``m``-by-``n`` matrices.
-
-``symm`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_symm_buffer:
-
-symm (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void symm(sycl::queue &queue,
-                 onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb,
-                 T beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void symm(sycl::queue &queue,
-                 onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb,
-                 T beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` is on the left side of the multiplication
-      (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of ``B`` and ``C``. The value of ``m`` must be at
-      least zero.
-
-   n
-      Number of columns of ``B`` and ``C``. The value of ``n`` must be
-      at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``A`` is on the left of the multiplication,
-      or ``lda``\ \*\ ``n`` if ``A`` is on the right. See :ref:`matrix-storage`
-      for more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if ``A`` is on
-      the left of the multiplication, or at least ``n`` if ``A`` is on
-      the right. Must be positive.
-
-   b
-      Buffer holding input matrix ``B``. Must have size at least
-      ``ldb``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldb``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for
-      more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      The buffer holding the input/output matrix ``C``. It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by ``alpha``\ \*\ ``A``\ \*\ ``B`` +
-      ``beta``\ \*\ ``C`` (``left_right`` = ``side::left``) or
-      ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C``
-      (``left_right`` = ``side::right``).
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized before
-   calling ``symm``.
-
-
-.. _onemkl_blas_symm_usm:
-
-symm (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event symm(sycl::queue &queue,
-                        onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        const T* b,
-                        std::int64_t ldb,
-                        T beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event symm(sycl::queue &queue,
-                        onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        const T* b,
-                        std::int64_t ldb,
-                        T beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` is on the left side of the
-      multiplication (``side::left``) or on the right side
-      (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether ``A``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of ``B`` and ``C``. The value of ``m`` must be
-      at least zero.
-
-   n
-      Number of columns of ``B`` and ``C``. The value of ``n`` must
-      be at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Pointer to input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``A`` is on the left of the
-      multiplication, or ``lda``\ \*\ ``n`` if ``A`` is on the right.
-      See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if ``A`` is
-      on the left of the multiplication, or at least ``n`` if ``A``
-      is on the right. Must be positive.
-
-   b
-      Pointer to input matrix ``B``. Must have size at least
-      ``ldb``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldb``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for
-      more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-      
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      The pointer to input/output matrix ``C``. It must have a
-      size of at least ``ldc``\ \*\ ``n`` if column major layout is
-      used to store matrices or at least ``ldc``\ \*\ ``m`` if row
-      major layout is used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldc
-      The leading dimension of ``C``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by
-      ``alpha``\ \*\ ``A``\ \*\ ``B`` + ``beta``\ \*\ ``C``
-      (``left_right`` = ``side::left``) or
-      ``alpha``\ \*\ ``B``\ \*\ ``A`` + ``beta``\ \*\ ``C``
-      (``left_right`` = ``side::right``).
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``beta`` = 0, matrix ``C`` does not need to be initialized
-   before calling ``symm``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/symv.rst b/docs/domains/blas/symv.rst
deleted file mode 100644
index d3750ec58..000000000
--- a/docs/domains/blas/symv.rst
+++ /dev/null
@@ -1,226 +0,0 @@
-.. _onemkl_blas_symv:
-
-symv
-====
-
-Computes a matrix-vector product for a symmetric matrix.
-
-.. _onemkl_blas_symv_description:
-
-.. rubric:: Description
-
-The ``symv`` routines compute a scalar-matrix-vector product and
-add the result to a scalar-vector product, with a symmetric matrix.
-The operation is defined as:
-
-.. math::
-
-      y \leftarrow alpha*A*x + beta*y
-
-where:
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` is an ``n``-by-``n`` symmetric matrix,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``symv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_symv_buffer:
-
-symv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void symv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void symv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 T beta,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input/output vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Buffer holding the updated vector ``y``.
-
-
-.. _onemkl_blas_symv_usm:
-
-symv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event symv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event symv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *a,
-                        std::int64_t lda,
-                        const T *x,
-                        std::int64_t incx,
-                        T beta,
-                        T *y,
-                        std::int64_t incy,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-vector product.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input/output vector ``y``. The array holding
-      input/output vector ``y`` must be of size at least (1 + (``n``
-      - 1)*abs(``incy``)). See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   y
-      Pointer to the updated vector ``y``.
-
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/syr.rst b/docs/domains/blas/syr.rst
deleted file mode 100644
index 74f692a70..000000000
--- a/docs/domains/blas/syr.rst
+++ /dev/null
@@ -1,202 +0,0 @@
-.. _onemkl_blas_syr:
-
-syr
-===
-
-Computes a rank-1 update of a symmetric matrix.
-
-.. _onemkl_blas_syr_description:
-
-.. rubric:: Description
-
-The ``syr`` routines compute a scalar-vector-vector product and add
-the result to a symmetric matrix. The operation is
-defined as:
-
-.. math::
-
-      A \leftarrow alpha*x*x^T + A
-
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``n``-by-``n`` symmetric matrix,
-
-``x`` is a vector of length ``n``.
-
-``syr`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_syr_buffer:
-
-syr (Buffer Version)
---------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void syr(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a,
-                std::int64_t lda)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void syr(sycl::queue &queue,
-                onemkl::uplo upper_lower,
-                std::int64_t n,
-                T alpha,
-                sycl::buffer<T,1> &x,
-                std::int64_t incx,
-                sycl::buffer<T,1> &a,
-                std::int64_t lda)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper`` or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-
-.. _onemkl_blas_syr_usm:
-
-syr (USM Version)
------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event syr(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       std::int64_t lda,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event syr(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       std::int64_t n,
-                       T alpha,
-                       const T *x,
-                       std::int64_t incx,
-                       T *a,
-                       std::int64_t lda,
-                       const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector product.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper`` or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/syr2.rst b/docs/domains/blas/syr2.rst
deleted file mode 100644
index b7628e3f1..000000000
--- a/docs/domains/blas/syr2.rst
+++ /dev/null
@@ -1,228 +0,0 @@
-.. _onemkl_blas_syr2:
-
-syr2
-====
-
-Computes a rank-2 update of a symmetric matrix.
-
-.. _onemkl_blas_syr2_description:
-
-.. rubric:: Description
-
-The ``syr2`` routines compute two scalar-vector-vector products, add
-them, and add the result to a symmetric matrix. The
-operation is defined as:
-
-.. math::
-
-      A \leftarrow alpha*x*y^T + alpha*y*x^T + A
-      
-where:
-
-``alpha`` is a scalar,
-
-``A`` is an ``n``-by-``n`` symmetric matrix,
-
-``x`` and ``y`` are vectors of length ``n``.
-
-``syr2`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_blas_syr2_buffer:
-
-syr2 (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void syr2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void syr2(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx,
-                 sycl::buffer<T,1> &y,
-                 std::int64_t incy,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector products.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Buffer holding input vector ``y``. The buffer must be of
-      size at least (1 + (``n`` - 1)*abs(``incy``)). See :ref:`matrix-storage`
-      for more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Buffer holding the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-      
-
-.. _onemkl_blas_syr2_usm:
-
-syr2 (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event syr2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event syr2(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        std::int64_t n,
-                        T alpha,
-                        const T *x,
-                        std::int64_t incx,
-                        const T *y,
-                        std::int64_t incy,
-                        T *a,
-                        std::int64_t lda,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of columns of ``A``. Must be at least zero.
-
-   alpha
-      Scaling factor for the vector-vector products.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   y
-      Pointer to input vector ``y``. The array holding input vector
-      ``y`` must be of size at least (1 + (``n`` - 1)*abs(``incy``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incy
-      Stride of vector ``y``.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   a
-      Pointer to the updated upper triangular part of the symmetric
-      matrix ``A`` if ``upper_lower``\ \=\ ``upper``, or the updated lower
-      triangular part of the symmetric matrix ``A`` if
-      ``upper_lower``\ \=\ ``lower``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/syr2k.rst b/docs/domains/blas/syr2k.rst
deleted file mode 100644
index 8605779c6..000000000
--- a/docs/domains/blas/syr2k.rst
+++ /dev/null
@@ -1,397 +0,0 @@
-.. _onemkl_blas_syr2k:
-
-syr2k
-=====
-
-Performs a symmetric rank-2k update.
-
-.. _onemkl_blas_syr2k_description:
-
-.. rubric:: Description
-
-The ``syr2k`` routines perform a rank-2k update of an ``n`` x ``n``
-symmetric matrix ``C`` by general matrices ``A`` and ``B``. 
-
-If ``trans`` = ``transpose::nontrans``, the operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*(A*B^T + B*A^T) + beta*C
-
-where ``A`` and ``B`` are ``n`` x ``k`` matrices.
-
-If ``trans`` = ``transpose::trans``, the operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*(A^T*B + B^T*A) + beta * C
-
-
-where ``A`` and ``B`` are ``k`` x ``n`` matrices.
-
-
-In both cases:
-
-``alpha`` and ``beta`` are scalars,
-
-``C`` is a symmetric matrix, and ``A`` and ``B`` are general matrices,
-
-The inner dimension of both matrix multiplications is ``k``.
-
-``syr2k`` supports the following precisions:
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_syr2k_buffer:
-
-syr2k (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void syr2k(sycl::queue &queue,
-                  onemkl::uplo upper_lower,
-                  onemkl::transpose trans,
-                  std::int64_t n,
-                  std::int64_t k,
-                  T alpha,
-                  sycl::buffer<T,1> &a,
-                  std::int64_t lda,
-                  sycl::buffer<T,1> &b,
-                  std::int64_t ldb,
-                  T beta,
-                  sycl::buffer<T,1> &c,
-                  std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void syr2k(sycl::queue &queue,
-                  onemkl::uplo upper_lower,
-                  onemkl::transpose trans,
-                  std::int64_t n,
-                  std::int64_t k,
-                  T alpha,
-                  sycl::buffer<T,1> &a,
-                  std::int64_t lda,
-                  sycl::buffer<T,1> &b,
-                  std::int64_t ldb,
-                  T beta,
-                  sycl::buffer<T,1> &c,
-                  std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies the operation to apply, as described above. Conjugation
-      is never performed, even if ``trans`` = ``transpose::conjtrans``.
-
-   n
-      Number of rows and columns in ``C``. The value of ``n`` must be at
-      least zero.
-
-   k
-      Inner dimension of matrix multiplications. The value of ``k`` must
-      be at least zero.
-
-   alpha
-      Scaling factor for the rank-2k update.
-
-   a
-      Buffer holding input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-
-      See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   b
-      Buffer holding input matrix ``B``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-         * - Row major
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-
-      See :ref:`matrix-storage`
-      for more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-         * - Row major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      Buffer holding input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least ``n``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by the updated ``C`` matrix.
-
-      
-
-.. _onemkl_blas_syr2k_usm:
-
-syr2k (USM Version)
--------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event syr2k(sycl::queue &queue,
-                         onemkl::uplo upper_lower,
-                         onemkl::transpose trans,
-                         std::int64_t n,
-                         std::int64_t k,
-                         T alpha,
-                         const T* a,
-                         std::int64_t lda,
-                         const T* b,
-                         std::int64_t ldb,
-                         T beta,
-                         T* c,
-                         std::int64_t ldc,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event syr2k(sycl::queue &queue,
-                         onemkl::uplo upper_lower,
-                         onemkl::transpose trans,
-                         std::int64_t n,
-                         std::int64_t k,
-                         T alpha,
-                         const T* a,
-                         std::int64_t lda,
-                         const T* b,
-                         std::int64_t ldb,
-                         T beta,
-                         T* c,
-                         std::int64_t ldc,
-                         const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies the operation to apply, as described above.
-      Conjugation is never performed, even if ``trans`` =
-      ``transpose::conjtrans``.
-
-   n
-      Number of rows and columns in ``C``. The value of ``n`` must be
-      at least zero.
-
-   k
-      Inner dimension of matrix multiplications. The value of ``k``
-      must be at least zero.
-
-   alpha
-      Scaling factor for the rank-2k update.
-
-   a
-      Pointer to input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   b
-      Pointer to input matrix ``B``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-         * - Row major
-           - ``B`` is an ``n``-by-``k`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``n``.
-           - ``B`` is a ``k``-by-``n`` matrix so the array ``b``
-             must have size at least ``ldb``\ \*\ ``k``.
-   
-      See :ref:`matrix-storage` for
-      more details.
-
-   ldb
-      The leading dimension of ``B``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``ldb`` must be at least ``n``.
-           - ``ldb`` must be at least ``k``.
-         * - Row major
-           - ``ldb`` must be at least ``k``.
-           - ``ldb`` must be at least ``n``.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      Pointer to input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least
-      ``n``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by the updated ``C``
-      matrix.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/syrk.rst b/docs/domains/blas/syrk.rst
deleted file mode 100644
index 74cf63af0..000000000
--- a/docs/domains/blas/syrk.rst
+++ /dev/null
@@ -1,296 +0,0 @@
-.. _onemkl_blas_syrk:
-
-syrk
-====
-
-Performs a symmetric rank-k update.
-
-.. _onemkl_blas_syrk_description:
-
-.. rubric:: Description
-
-The ``syrk`` routines perform a rank-k update of a symmetric matrix ``C``
-by a general matrix ``A``. The operation is defined as:
-
-.. math::
-
-      C \leftarrow alpha*op(A)*op(A)^T + beta*C
-
-where:
-
-op(``X``) is one of op(``X``) = ``X`` or
-op(``X``) = ``X``\ :sup:`T`,
-
-``alpha`` and ``beta`` are scalars,
-
-``C`` is a symmetric matrix and ``A`` is a general matrix.
-
-Here op(``A``) is ``n``-by-``k``, and ``C`` is ``n``-by-``n``.
-
-``syrk`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_syrk_buffer:
-
-syrk (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void syrk(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 T beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void syrk(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 std::int64_t n,
-                 std::int64_t k,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 T beta,
-                 sycl::buffer<T,1> &c,
-                 std::int64_t ldc)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A`` (See :ref:`onemkl_datatypes` for more details). Conjugation is never performed, even if ``trans`` = ``transpose::conjtrans``.
-
-   n
-      Number of rows and columns in ``C``. The value of ``n`` must be at
-      least zero.
-
-   k
-      Number of columns in op(``A``). The value of ``k`` must be at least
-      zero.
-
-   alpha
-      Scaling factor for the rank-k update.
-
-   a
-      Buffer holding input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-
-      See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-      
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      Buffer holding input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least ``n``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by
-      ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`T` + ``beta``\ \*\ ``C``.
-
-
-.. _onemkl_blas_syrk_usm:
-
-syrk (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event syrk(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event syrk(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        std::int64_t n,
-                        std::int64_t k,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T beta,
-                        T* c,
-                        std::int64_t ldc,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``C``'s data is stored in its upper or lower
-      triangle. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A`` (See :ref:`onemkl_datatypes` for more details). Conjugation is never performed, even if
-      ``trans`` = ``transpose::conjtrans``.
-
-   n
-      Number of rows and columns in ``C``. The value of ``n`` must be
-      at least zero.
-
-   k
-      Number of columns in op(``A``). The value of ``k`` must be at
-      least zero.
-
-   alpha
-      Scaling factor for the rank-k update.
-
-   a
-      Pointer to input matrix ``A``.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-         * - Row major
-           - ``A`` is an ``n``-by-``k`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``n``.
-           - ``A`` is a ``k``-by-``n`` matrix so the array ``a``
-             must have size at least ``lda``\ \*\ ``k``.
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      The leading dimension of ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``trans`` = ``transpose::nontrans``
-           - ``trans`` = ``transpose::trans`` or ``transpose::conjtrans``
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   beta
-      Scaling factor for matrix ``C``.
-
-   c
-      Pointer to input/output matrix ``C``. Must have size at least
-      ``ldc``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   ldc
-      Leading dimension of ``C``. Must be positive and at least
-      ``n``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Pointer to the output matrix, overwritten by
-      ``alpha``\ \*op(``A``)*op(``A``)\ :sup:`T` +
-      ``beta``\ \*\ ``C``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/syrk_batch.rst b/docs/domains/blas/syrk_batch.rst
deleted file mode 100644
index b9782041e..000000000
--- a/docs/domains/blas/syrk_batch.rst
+++ /dev/null
@@ -1,484 +0,0 @@
-.. _onemkl_blas_syrk_batch:
-
-syrk_batch
-==========
-
-Computes a group of ``syrk`` operations.
-
-.. _onemkl_blas_syrk_batch_description:
-
-.. rubric:: Description
-
-The ``syrk_batch`` routines are batched versions of :ref:`onemkl_blas_syrk`, performing
-multiple ``syrk`` operations in a single call. Each ``syrk`` 
-operation performs a rank-k update with general matrices.
-   
-``syrk_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_syrk_batch_buffer:
-
-syrk_batch (Buffer Version)
----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``syrk_batch`` supports only the strided API. 
-
-The strided API operation is defined as:
-::
-
-   for i = 0 … batch_size – 1
-       A and C are matrices at offset i * stridea, i * stridec in a and c.
-       C := alpha * op(A) * op(A)^T + beta * C
-   end for
-
-where:
-
-op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` and ``C`` are matrices,
-
-op(``A``) is ``n`` x ``k`` and ``C`` is ``n`` x ``n``.
-
-The ``a`` and ``c`` buffers contain all the input matrices. The stride 
-between matrices is given by the stride parameter. The total number
-of matrices in ``a`` and ``c`` buffers is given by the ``batch_size`` parameter.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void syrk_batch(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       onemkl::transpose trans,
-                       std::int64_t n,
-                       std::int64_t k,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       T beta,
-                       sycl::buffer<T,1> &c,
-                       std::int64_t ldc,
-                       std::int64_t stridec,
-                       std::int64_t batch_size)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void syrk_batch(sycl::queue &queue,
-                       onemkl::uplo upper_lower,
-                       onemkl::transpose trans,
-                       std::int64_t n,
-                       std::int64_t k,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       T beta,
-                       sycl::buffer<T,1> &c,
-                       std::int64_t ldc,
-                       std::int64_t stridec,
-                       std::int64_t batch_size)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether data in ``C`` is stored in its upper or lower triangle.
-      For more details, see :ref:`onemkl_datatypes`.
-
-   trans
-      Specifies op(``A``) the transposition operation applied to the
-      matrix ``A``. Conjugation is never performed, even if trans =
-      transpose::conjtrans. See :ref:`onemkl_datatypes` for more
-      details.
-
-   n
-      Number of rows and columns of ``C``.
-      Must be at least zero.
-
-   k
-      Number of columns of op(``A``).
-      Must be at least zero.
-
-   alpha
-      Scaling factor for the rank-k update.
-
-   a
-      Buffer holding the input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   beta
-      Scaling factor for the matrices ``C``.
-
-   c
-      Buffer holding input/output matrices ``C`` with size ``stridec`` * ``batch_size``.
-
-   ldc
-      The leading dimension of the matrices ``C``. It must be positive
-      and at least ``n``.
-
-   stridec
-      Stride between different ``C`` matrices. Must be at least
-      ``ldc`` * ``n``.
-
-   batch_size
-      Specifies the number of rank-k update operations to perform.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output buffer, overwritten by ``batch_size`` rank-k update
-      operations of the form ``alpha`` * op(``A``)*op(``A``)^T + ``beta`` * ``C``.
-
-
-.. _onemkl_blas_syrk_batch_usm:
-
-syrk_batch (USM Version)
----------------------------
-
-.. rubric:: Description
-
-The USM version of ``syrk_batch`` supports the group API and strided API. 
-
-The group API operation is defined as:
-::
-
-   idx = 0
-   for i = 0 … group_count – 1
-       for j = 0 … group_size – 1
-           A and C are matrices in a[idx] and c[idx]
-           C := alpha[i] * op(A) * op(A)^T + beta[i] * C
-           idx = idx + 1
-       end for
-   end for
-
-The strided API operation is defined as
-::
-
-   for i = 0 … batch_size – 1
-       A and C are matrices at offset i * stridea, i * stridec in a and c.
-       C := alpha * op(A) * op(A)^T + beta * C
-   end for
-
-where:
-
-op(X) is one of op(X) = X, or op(X) = X\ :sup:`T`, or op(X) = X\ :sup:`H`,
-
-``alpha`` and ``beta`` are scalars,
-
-``A`` and ``C`` are matrices,
-
-op(``A``) is ``n`` x ``k`` and ``C`` is ``n`` x ``n``.
-
- 
-For group API, ``a`` and ``c`` arrays contain the pointers for all the input matrices. 
-The total number of matrices in ``a`` and ``c`` are given by: 
-
-.. math::
-
-      total\_batch\_count = \sum_{i=0}^{group\_count-1}group\_size[i]    
- 
-For strided API, ``a`` and ``c`` arrays contain all the input matrices. The total number of matrices 
-in ``a`` and ``c`` are given by the ``batch_size`` parameter.  
-   
-**Group API**
-
-.. rubric:: Syntax
-   
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event syrk_batch(sycl::queue &queue,
-                              uplo *upper_lower,
-                              transpose *trans,
-                              std::int64_t *n,
-                              std::int64_t *k,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              T *beta,
-                              T **c,
-                              std::int64_t *ldc,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event syrk_batch(sycl::queue &queue,
-                              uplo *upper_lower,
-                              transpose *trans,
-                              std::int64_t *n,
-                              std::int64_t *k,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              T *beta,
-                              T **c,
-                              std::int64_t *ldc,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Array of ``group_count`` ``onemkl::uplo``
-      values. ``upper_lower[i]`` specifies whether data in ``C`` for every
-      matrix in group ``i`` is stored in the upper or lower triangle.
-
-   trans
-      Array of ``group_count`` ``onemkl::transpose`` values. ``trans[i]`` specifies the form of op(``A``) used in
-      the rank-k update in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Array of ``group_count`` integers. ``n[i]`` specifies the
-      number of rows and columns of ``C`` for every matrix in group ``i``. All entries must be at least zero.
-
-   k
-      Array of ``group_count`` integers. ``k[i]`` specifies the
-      number of columns of op(``A``) for every matrix in group ``i``. All entries must be at
-      least zero.
-
-   alpha
-      Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor for every rank-k update in group ``i``.
-
-   a
-      Array of pointers to input matrices ``A`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   lda
-      Array of ``group_count`` integers. ``lda[i]`` specifies the
-      leading dimension of ``A`` for every matrix in group ``i``. All
-      entries must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda[i]`` must be at least ``n[i]``.
-           - ``lda[i]`` must be at least ``k[i]``.
-         * - Row major
-           - ``lda[i]`` must be at least ``k[i]``.
-           - ``lda[i]`` must be at least ``n[i]``.
-             
-   beta
-      Array of ``group_count`` scalar elements. ``beta[i]`` specifies the scaling factor for matrix ``C`` 
-      for every matrix in group ``i``.
-
-   c
-      Array of pointers to input/output matrices ``C`` with size ``total_batch_count``. 
-      
-      See :ref:`matrix-storage` for more details.
-
-   ldc
-      Array of ``group_count`` integers. ``ldc[i]`` specifies the
-      leading dimension of ``C`` for every matrix in group ``i``.  All
-      entries must be positive and ``ldc[i]`` must be at least ``n[i]``.
-
-   group_count
-      Specifies the number of groups. Must be at least 0.
-
-   group_size
-      Array of ``group_count`` integers. ``group_size[i]`` specifies the
-      number of rank-k update operations in group ``i``. All entries must be at least 0.
-
-   dependencies
-         List of events to wait for before starting computation, if any.
-         If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Overwritten by the ``n[i]``-by-``n[i]`` matrix calculated by 
-      (``alpha[i]`` * op(``A``)*op(``A``)^T + ``beta[i]`` * ``C``) for group ``i``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event syrk_batch(sycl::queue &queue,
-                              uplo upper_lower,
-                              transpose trans,
-                              std::int64_t n,
-                              std::int64_t k,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stride_a,
-                              T beta,
-                              T *c,
-                              std::int64_t ldc,
-                              std::int64_t stride_c,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event syrk_batch(sycl::queue &queue,
-                              uplo upper_lower,
-                              transpose trans,
-                              std::int64_t n,
-                              std::int64_t k,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stride_a,
-                              T beta,
-                              T *c,
-                              std::int64_t ldc,
-                              std::int64_t stride_c,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether data in ``C`` is stored in its upper or lower triangle.
-      For more details, see :ref:`onemkl_datatypes`.
-
-   trans
-      Specifies op(``A``) the transposition operation applied to the
-      matrices ``A``. Conjugation is never performed, even if trans =
-      transpose::conjtrans. See :ref:`onemkl_datatypes` for more
-      details.
-
-   n
-      Number of rows and columns of ``C``.
-      Must be at least zero.
-
-   k
-      Number of columns of op(``A``).
-      Must be at least zero.
-
-   alpha
-      Scaling factor for the rank-k updates.
-
-   a
-      Pointer to input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      The leading dimension of the matrices ``A``. It must be positive.
-
-      .. list-table::
-         :header-rows: 1
-
-         * -
-           - ``A`` not transposed
-           - ``A`` transposed
-         * - Column major
-           - ``lda`` must be at least ``n``.
-           - ``lda`` must be at least ``k``.
-         * - Row major
-           - ``lda`` must be at least ``k``.
-           - ``lda`` must be at least ``n``.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   beta
-      Scaling factor for the matrices ``C``.
-
-   c
-      Pointer to input/output matrices ``C`` with size ``stridec`` * ``batch_size``.
-
-   ldc
-      The leading dimension of the matrices ``C``. It must be positive
-      and at least ``n``.
-
-   stridec
-      Stride between different ``C`` matrices.
-
-   batch_size
-      Specifies the number of rank-k update operations to perform.
-
-   dependencies
-         List of events to wait for before starting computation, if any.
-         If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   c
-      Output matrices, overwritten by ``batch_size`` rank-k update
-      operations of the form ``alpha`` * op(``A``)*op(``A``)^T + ``beta`` * ``C``.
-
-.. container:: section
-      
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/tbmv.rst b/docs/domains/blas/tbmv.rst
deleted file mode 100644
index 716013cb3..000000000
--- a/docs/domains/blas/tbmv.rst
+++ /dev/null
@@ -1,223 +0,0 @@
-.. _onemkl_blas_tbmv:
-
-tbmv
-====
-
-Computes a matrix-vector product using a triangular band matrix.
-
-.. _onemkl_blas_tbmv_description:
-
-.. rubric:: Description
-
-The ``tbmv`` routines compute a matrix-vector product with a triangular
-band matrix. The operation is defined as:
-
-.. math::
-
-      x \leftarrow op(A)*x
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``A`` is an ``n``-by-``n`` unit or non-unit, upper or lower
-triangular band matrix, with (``k`` + 1) diagonals,
-
-``x`` is a vector of length ``n``.
-
-``tbmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_tbmv_buffer:
-
-tbmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void tbmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 std::int64_t k,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void tbmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 std::int64_t k,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of sub/super-diagonals of the matrix ``A``. Must be at
-      least zero.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` + 1),
-      and positive.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the updated vector ``x``.
-
-      
-
-.. _onemkl_blas_tbmv_usm:
-
-tbmv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event tbmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        std::int64_t k,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event tbmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        std::int64_t k,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of sub/super-diagonals of the matrix ``A``. Must be at
-      least zero.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` +
-      1), and positive.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the updated vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/tbsv.rst b/docs/domains/blas/tbsv.rst
deleted file mode 100644
index bce876115..000000000
--- a/docs/domains/blas/tbsv.rst
+++ /dev/null
@@ -1,225 +0,0 @@
-.. _onemkl_blas_tbsv:
-
-tbsv
-====
-
-Solves a system of linear equations whose coefficients are in a
-triangular band matrix.
-
-.. _onemkl_blas_tbsv_description:
-
-.. rubric:: Description
-
-The ``tbsv`` routines solve a system of linear equations whose
-coefficients are in a triangular band matrix. The operation is
-defined as:
-
-.. math::
-
-      op(A)*x = b
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``A`` is an ``n``-by-``n`` unit or non-unit, upper or lower
-triangular band matrix, with (``k`` + 1) diagonals,
-
-``b`` and ``x`` are vectors of length ``n``.
-
-``tbsv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_tbsv_buffer:
-
-tbsv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void tbsv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 std::int64_t k,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void tbsv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 std::int64_t k,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of sub/super-diagonals of the matrix ``A``. Must be at
-      least zero.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` + 1),
-      and positive.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the solution vector ``x``.
-
-      
-
-.. _onemkl_blas_tbsv_usm:
-
-tbsv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event tbsv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        std::int64_t k,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event tbsv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        std::int64_t k,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   k
-      Number of sub/super-diagonals of the matrix ``A``. Must be at
-      least zero.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least (``k`` +
-      1), and positive.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the solution vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/tpmv.rst b/docs/domains/blas/tpmv.rst
deleted file mode 100644
index 736fbcf21..000000000
--- a/docs/domains/blas/tpmv.rst
+++ /dev/null
@@ -1,199 +0,0 @@
-.. _onemkl_blas_tpmv:
-
-tpmv
-====
-
-Computes a matrix-vector product using a triangular packed matrix.
-
-.. _onemkl_blas_tpmv_description:
-
-.. rubric:: Description
-
-The ``tpmv`` routines compute a matrix-vector product with a triangular
-packed matrix. The operation is defined as:
-
-.. math::
-
-      x \leftarrow op(A)*x
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``A`` is an ``n``-by-``n`` unit or non-unit, upper or lower
-triangular matrix, supplied in packed form,
-
-``x`` is a vector of length ``n``.
-
-``tpmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_tpmv_buffer:
-
-tpmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void tpmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void tpmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n``\ +1))/2. See :ref:`matrix-storage` for
-      more details.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the updated vector ``x``.
-
-
-.. _onemkl_blas_tpmv_usm:
-
-tpmv (USM Version)
-------------------
-      
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event tpmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        const T *a,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event tpmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        const T *a,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the updated vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/tpsv.rst b/docs/domains/blas/tpsv.rst
deleted file mode 100644
index 14082a077..000000000
--- a/docs/domains/blas/tpsv.rst
+++ /dev/null
@@ -1,207 +0,0 @@
-.. _onemkl_blas_tpsv:
-
-tpsv
-====
-
-Solves a system of linear equations whose coefficients are in a
-triangular packed matrix.
-
-.. _onemkl_blas_tpsv_description:
-
-.. rubric:: Description
-
-The ``tpsv`` routines solve a system of linear equations whose
-coefficients are in a triangular packed matrix. The operation is
-defined as:
-
-.. math::
-
-      op(A)*x = b
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``A`` is an ``n``-by-``n`` unit or non-unit, upper or lower
-triangular matrix, supplied in packed form,
-
-``b`` and ``x`` are vectors of length ``n``.
-
-``tpsv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_tpsv_buffer:
-
-tpsv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-      
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void tpsv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 std::int64_t k,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void tpsv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 std::int64_t k,
-                 sycl::buffer<T,1> &a,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      (``n``\ \*(``n``\ +1))/2. See :ref:`matrix-storage` for
-      more details.
-
-   x
-      Buffer holding the ``n``-element right-hand side vector ``b``. The
-      buffer must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the solution vector ``x``.
-
-
-.. _onemkl_blas_tpsv_usm:
-
-tpsv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event tpsv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        std::int64_t k,
-                        const T *a,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event tpsv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        std::int64_t k,
-                        const T *a,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least (``n``\ \*(``n``\ +1))/2. See
-      :ref:`matrix-storage` for
-      more details.
-
-   x
-      Pointer to the ``n``-element right-hand side vector ``b``. The
-      array holding the ``n``-element right-hand side vector ``b``
-      must be of size at least (1 + (``n`` - 1)*abs(``incx``)). See
-      :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the solution vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/trmm.rst b/docs/domains/blas/trmm.rst
deleted file mode 100644
index 1a812d56c..000000000
--- a/docs/domains/blas/trmm.rst
+++ /dev/null
@@ -1,288 +0,0 @@
-.. _onemkl_blas_trmm:
-
-trmm
-====
-
-Computes a matrix-matrix product where one input matrix is triangular
-and one input matrix is general.
-
-.. _onemkl_blas_trmm_description:
-
-.. rubric:: Description
-
-The ``trmm`` routines compute a scalar-matrix-matrix product where one of
-the matrices in the multiplication is triangular. The argument
-``left_right`` determines if the triangular matrix, ``A``, is on the
-left of the multiplication (``left_right`` = ``side::left``) or on
-the right (``left_right`` = ``side::right``). Depending on
-``left_right``, the operation is defined as:
-
-.. math::
-
-      B \leftarrow alpha*op(A)*B
-
-or
-
-.. math::
-
-      B \leftarrow alpha*B*op(A)
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) = ``A``\ :sup:`T`,
-or op(``A``) = ``A``\ :sup:`H`,
-
-``alpha`` is a scalar,
-
-``A`` is a triangular matrix, and ``B`` is a general matrix.
-
-Here ``B`` is ``m`` x ``n`` and ``A`` is either ``m`` x ``m`` or
-``n`` x ``n``, depending on ``left_right``.
-
-``trmm`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_trmm_buffer:
-
-trmm (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void trmm(sycl::queue &queue, onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose transa,
-                 onemkl::diag unit_diag,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void trmm(sycl::queue &queue, onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose transa,
-                 onemkl::diag unit_diag,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` is on the left side of the multiplication
-      (``side::left``) or on the right side (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether the matrix ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   transa
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_diag
-      Specifies whether ``A`` is assumed to be unit triangular (all
-      diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Specifies the number of rows of ``B``. The value of ``m`` must be
-      at least zero.
-
-   n
-      Specifies the number of columns of ``B``. The value of ``n`` must
-      be at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or
-      ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See
-      :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if
-      ``left_right`` = ``side::left``, and at least ``n`` if
-      ``left_right`` = ``side::right``. Must be positive.
-
-   b
-      Buffer holding input/output matrix ``B``. Must have size at
-      least ``ldb``\ \*\ ``n`` if column major layout is used to store
-      matrices or at least ``ldb``\ \*\ ``m`` if row major layout is
-      used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   b
-      Output buffer, overwritten by ``alpha``\ \*op(``A``)\*\ ``B`` or
-      ``alpha``\ \*\ ``B``\ \*op(``A``).
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B`` do
-   not need to be initialized at entry.
-
-      
-
-.. _onemkl_blas_trmm_usm:
-
-trmm (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event trmm(sycl::queue &queue, onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose transa,
-                        onemkl::diag unit_diag,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T* b,
-                        std::int64_t ldb,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event trmm(sycl::queue &queue, onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose transa,
-                        onemkl::diag unit_diag,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T* b,
-                        std::int64_t ldb,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-   
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` is on the left side of the
-      multiplication (``side::left``) or on the right side
-      (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether the matrix ``A`` is upper or lower
-      triangular. See :ref:`onemkl_datatypes` for more details.
-
-   transa
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_diag
-      Specifies whether ``A`` is assumed to be unit triangular (all
-      diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Specifies the number of rows of ``B``. The value of ``m`` must
-      be at least zero.
-
-   n
-      Specifies the number of columns of ``B``. The value of ``n``
-      must be at least zero.
-
-   alpha
-      Scaling factor for the matrix-matrix product.
-
-   a
-      Pointer to input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or
-      ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See
-      :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if
-      ``left_right`` = ``side::left``, and at least ``n`` if
-      ``left_right`` = ``side::right``. Must be positive.
-
-   b
-      Pointer to input/output matrix ``B``. Must have size at
-      least ``ldb``\ \*\ ``n`` if column major layout is used to store
-      matrices or at least ``ldb``\ \*\ ``m`` if row major layout is
-      used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   b
-      Pointer to the output matrix, overwritten by
-      ``alpha``\ \*op(``A``)\*\ ``B`` or
-      ``alpha``\ \*\ ``B``\ \*op(``A``).
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B``
-   do not need to be initialized at entry.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/trmv.rst b/docs/domains/blas/trmv.rst
deleted file mode 100644
index d779c12a9..000000000
--- a/docs/domains/blas/trmv.rst
+++ /dev/null
@@ -1,210 +0,0 @@
-.. _onemkl_blas_trmv:
-
-trmv
-====
-
-Computes a matrix-vector product using a triangular matrix.
-
-.. _onemkl_blas_trmv_description:
-
-.. rubric:: Description
-
-The ``trmv`` routines compute a matrix-vector product with a triangular
-matrix. The operation is defined as:
-
-.. math::
-
-      x \leftarrow op(A)*x
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``A`` is an ``n``-by-``n`` unit or non-unit, upper or lower
-triangular matrix,
-
-``x`` is a vector of length ``n``.
-
-``trmv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_trmv_buffer:
-
-trmv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void trmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void trmv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Buffer holding input vector ``x``. The buffer must be of size at
-      least (1 + (``n`` - 1)*abs(``incx``)). See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the updated vector ``x``.
-
-
-.. _onemkl_blas_trmv_usm:
-
-trmv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event trmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event trmv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Pointer to input vector ``x``. The array holding input vector
-      ``x`` must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for
-      more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the updated vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/blas/trsm.rst b/docs/domains/blas/trsm.rst
deleted file mode 100644
index 0185e69d7..000000000
--- a/docs/domains/blas/trsm.rst
+++ /dev/null
@@ -1,286 +0,0 @@
-.. _onemkl_blas_trsm:
-
-trsm
-====
-
-Solves a triangular matrix equation (forward or backward solve).
-
-.. _onemkl_blas_trsm_description:
-
-.. rubric:: Description
-
-The ``trsm`` routines solve one of the following matrix equations:
-
-.. math::
-
-      op(A)*X = alpha*B
-
-or
-
-.. math::
-
-      X*op(A) = alpha*B
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``alpha`` is a scalar,
-
-``A`` is a triangular matrix, and
-
-``B`` and ``X`` are ``m`` x ``n`` general matrices.
-
-``A`` is either ``m`` x ``m`` or ``n`` x ``n``, depending on whether
-it multiplies ``X`` on the left or right. On return, the matrix ``B``
-is overwritten by the solution matrix ``X``.
-
-``trsm`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_trsm_buffer:
-
-trsm (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void trsm(sycl::queue &queue,
-                 onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose transa,
-                 onemkl::diag unit_diag,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void trsm(sycl::queue &queue,
-                 onemkl::side left_right,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose transa,
-                 onemkl::diag unit_diag,
-                 std::int64_t m,
-                 std::int64_t n,
-                 T alpha,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &b,
-                 std::int64_t ldb)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` multiplies ``X`` on the left
-      (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether the matrix ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   transa
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_diag
-      Specifies whether ``A`` is assumed to be unit triangular (all
-      diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Specifies the number of rows of ``B``. The value of ``m`` must be
-      at least zero.
-
-   n
-      Specifies the number of columns of ``B``. The value of ``n`` must
-      be at least zero.
-
-   alpha
-      Scaling factor for the solution.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or
-      ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See
-      :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if
-      ``left_right`` = ``side::left``, and at least ``n`` if
-      ``left_right`` = ``side::right``. Must be positive.
-
-   b
-      Buffer holding input/output matrix ``B``. Must have size at
-      least ``ldb``\ \*\ ``n`` if column major layout is used to store
-      matrices or at least ``ldb``\ \*\ ``m`` if row major layout is
-      used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   b
-      Output buffer. Overwritten by the solution matrix ``X``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B`` do
-   not need to be initialized at entry.
-
-      
-
-.. _onemkl_blas_trsm_usm:
-
-trsm (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event trsm(sycl::queue &queue,
-                        onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose transa,
-                        onemkl::diag unit_diag,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T* b,
-                        std::int64_t ldb,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event trsm(sycl::queue &queue,
-                        onemkl::side left_right,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose transa,
-                        onemkl::diag unit_diag,
-                        std::int64_t m,
-                        std::int64_t n,
-                        T alpha,
-                        const T* a,
-                        std::int64_t lda,
-                        T* b,
-                        std::int64_t ldb,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether ``A`` multiplies ``X`` on the left
-      (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether the matrix ``A`` is upper or lower
-      triangular. See :ref:`onemkl_datatypes` for more details.
-
-   transa
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_diag
-      Specifies whether ``A`` is assumed to be unit triangular (all
-      diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Specifies the number of rows of ``B``. The value of ``m`` must
-      be at least zero.
-
-   n
-      Specifies the number of columns of ``B``. The value of ``n``
-      must be at least zero.
-
-   alpha
-      Scaling factor for the solution.
-
-   a
-      Pointer to input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``m`` if ``left_right`` = ``side::left``, or
-      ``lda``\ \*\ ``n`` if ``left_right`` = ``side::right``. See
-      :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of ``A``. Must be at least ``m`` if
-      ``left_right`` = ``side::left``, and at least ``n`` if
-      ``left_right`` = ``side::right``. Must be positive.
-
-   b
-      Pointer to input/output matrix ``B``. Must have size at
-      least ``ldb``\ \*\ ``n`` if column major layout is used to store
-      matrices or at least ``ldb``\ \*\ ``m`` if row major layout is
-      used to store matrices. See :ref:`matrix-storage` for more details.
-
-   ldb
-      Leading dimension of ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   b
-      Pointer to the output matrix. Overwritten by the solution
-      matrix ``X``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``alpha`` = 0, matrix ``B`` is set to zero, and ``A`` and ``B``
-   do not need to be initialized at entry.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-3-routines`
diff --git a/docs/domains/blas/trsm_batch.rst b/docs/domains/blas/trsm_batch.rst
deleted file mode 100644
index e68b68aa2..000000000
--- a/docs/domains/blas/trsm_batch.rst
+++ /dev/null
@@ -1,497 +0,0 @@
-.. _onemkl_blas_trsm_batch:
-
-trsm_batch
-==========
-
-Computes a group of ``trsm`` operations.
-
-.. _onemkl_blas_trsm_batch_description:
-
-.. rubric:: Description
-
-The ``trsm_batch`` routines are batched versions of :ref:`onemkl_blas_trsm`, performing
-multiple ``trsm`` operations in a single call. Each ``trsm`` 
-solves an equation of the form op(A) \* X = alpha \* B or X \* op(A) = alpha \* B. 
-   
-``trsm_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_trsm_batch_buffer:
-
-trsm_batch (Buffer Version)
----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``trsm_batch`` supports only the strided API. 
-   
-The strided API operation is defined as:
-::
-
-   for i = 0 … batch_size – 1
-       A and B are matrices at offset i * stridea and i * strideb in a and b.
-       if (left_right == onemkl::side::left) then
-           compute X such that op(A) * X = alpha * B
-       else
-           compute X such that X * op(A) = alpha * B
-       end if
-       B := X
-   end for
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) = ``A``\ :sup:`T`,
-or op(``A``) = ``A``\ :sup:`H`,
-
-``alpha`` is a scalar,
-
-``A`` is a triangular matrix,
-
-``B`` and ``X`` are ``m`` x ``n`` general matrices,
-
-``A`` is either ``m`` x ``m`` or ``n`` x ``n``, depending on whether
-it multiplies ``X`` on the left or right. On return, the matrix ``B``
-is overwritten by the solution matrix ``X``.
-
-The ``a`` and ``b`` buffers contain all the input matrices. The stride
-between matrices is given by the ``stridea`` and ``strideb`` parameters. The
-total number of matrices in the ``a`` and ``b`` buffers is given by the ``batch_size`` parameter.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void trsm_batch(sycl::queue &queue,
-                       onemkl::side left_right,
-                       onemkl::uplo upper_lower,
-                       onemkl::transpose trans,
-                       onemkl::diag unit_diag,
-                       std::int64_t m,
-                       std::int64_t n,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &b,
-                       std::int64_t ldb,
-                       std::int64_t strideb,
-                       std::int64_t batch_size)
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void trsm_batch(sycl::queue &queue,
-                       onemkl::side left_right,
-                       onemkl::uplo upper_lower,
-                       onemkl::transpose trans,
-                       onemkl::diag unit_diag,
-                       std::int64_t m,
-                       std::int64_t n,
-                       T alpha,
-                       sycl::buffer<T,1> &a,
-                       std::int64_t lda,
-                       std::int64_t stridea,
-                       sycl::buffer<T,1> &b,
-                       std::int64_t ldb,
-                       std::int64_t strideb,
-                       std::int64_t batch_size)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether the matrices ``A`` multiply ``X`` on the left
-      (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether the matrices ``A`` are upper or lower
-      triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to the
-      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_diag
-      Specifies whether the matrices ``A`` are assumed to be unit
-      triangular (all diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of the ``B`` matrices. Must be at least zero.
-
-   n
-      Number of columns of the ``B`` matrices. Must be at least zero.
-
-   alpha
-      Scaling factor for the solutions.
-
-   a
-      Buffer holding the input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      Leading dimension of the matrices ``A``. Must be at least ``m`` if
-      ``left_right`` = ``side::left``, and at least ``n`` if ``left_right`` =
-      ``side::right``. Must be positive.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   b
-      Buffer holding the input matrices ``B`` with size ``strideb`` * ``batch_size``.
-
-   ldb
-      Leading dimension of the matrices ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   strideb
-      Stride between different ``B`` matrices.
-
-   batch_size
-      Specifies the number of triangular linear systems to solve.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   b
-      Output buffer, overwritten by ``batch_size`` solution matrices
-      ``X``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``alpha`` = 0, matrix ``B`` is set to zero and the matrices ``A``
-   and ``B`` do not need to be initialized before calling ``trsm_batch``.
-
-
-.. rubric:: Description
-
-The USM version of ``trsm_batch`` supports the group API and strided API. 
-
-The group API operation is defined as:
-::
-
-   idx = 0
-   for i = 0 … group_count – 1
-       for j = 0 … group_size – 1
-           A and B are matrices in a[idx] and b[idx]
-           if (left_right == onemkl::side::left) then
-               compute X such that op(A) * X = alpha[i] * B
-           else
-               compute X such that X * op(A) = alpha[i] * B
-           end if
-           B := X
-           idx = idx + 1
-       end for
-   end for     
-
-
-The strided API operation is defined as:
-::
-
-   for i = 0 … batch_size – 1
-       A and B are matrices at offset i * stridea and i * strideb in a and b.
-       if (left_right == onemkl::side::left) then
-           compute X such that op(A) * X = alpha * B
-       else
-           compute X such that X * op(A) = alpha * B
-       end if
-       B := X
-   end for
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) = ``A``\ :sup:`T`,
-or op(``A``) = ``A``\ :sup:`H`,
-
-``alpha`` is a scalar,
-
-``A`` is a triangular matrix,
-
-``B`` and ``X`` are ``m`` x ``n`` general matrices,
-
-``A`` is either ``m`` x ``m`` or ``n`` x ``n``, depending on whether
-it multiplies ``X`` on the left or right. On return, the matrix ``B``
-is overwritten by the solution matrix ``X``.
-
-For the group API, the ``a`` and ``b`` arrays contain the pointers to all the input matrices.
-The total number of matrices in ``a`` and ``b`` is given by:
- 
-.. math::
-      
-      total\_batch\_count = \sum_{i=0}^{group\_count-1}group\_size[i]
-
-For the strided API, the ``a`` and ``b`` arrays contain all the input matrices. The total number of matrices
-in ``a`` and ``b`` is given by the ``batch_size`` parameter.
-
-**Group API**
-
-.. rubric:: Syntax
-      
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event trsm_batch(sycl::queue &queue,
-                              onemkl::side *left_right,
-                              onemkl::uplo *upper_lower,
-                              onemkl::transpose *trans,
-                              onemkl::diag *unit_diag,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              T **b,
-                              std::int64_t *ldb,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event trsm_batch(sycl::queue &queue,
-                              onemkl::side *left_right,
-                              onemkl::uplo *upper_lower,
-                              onemkl::transpose *trans,
-                              onemkl::diag *unit_diag,
-                              std::int64_t *m,
-                              std::int64_t *n,
-                              T *alpha,
-                              const T **a,
-                              std::int64_t *lda,
-                              T **b,
-                              std::int64_t *ldb,
-                              std::int64_t group_count,
-                              std::int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Array of ``group_count`` ``onemkl::side`` values. ``left_right[i]`` specifies whether ``A`` multiplies
-      ``X`` on the left (``side::left``) or on the right
-      (``side::right``) for every ``trsm`` operation in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Array of ``group_count`` ``onemkl::uplo`` values. ``upper_lower[i]`` specifies whether ``A`` is upper or lower
-      triangular for every matrix in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Array of ``group_count`` ``onemkl::transpose`` values. ``trans[i]`` specifies the form of op(``A``) used
-      for every ``trsm`` operation in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_diag
-      Array of ``group_count`` ``onemkl::diag`` values. ``unit_diag[i]`` specifies whether ``A`` is assumed to
-      be unit triangular (all diagonal elements are 1) for every matrix in group ``i``. See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Array of ``group_count`` integers. ``m[i]`` specifies the
-      number of rows of ``B`` for every matrix in group ``i``. All entries must be at least zero.
-
-   n
-      Array of ``group_count`` integers. ``n[i]`` specifies the
-      number of columns of ``B`` for every matrix in group ``i``. All entries must be at least zero.
-
-   alpha
-      Array of ``group_count`` scalar elements. ``alpha[i]`` specifies the scaling factor in group ``i``.
-
-   a
-      Array of pointers to input matrices ``A`` with size ``total_batch_count``. See :ref:`matrix-storage` for more details.
-
-   lda
-      Array of ``group_count`` integers. ``lda[i]`` specifies the leading dimension of ``A`` for every matrix in group ``i``. 
-      All entries must be at least ``m``
-      if ``left_right`` is ``side::left``, and at least 
-      ``n`` if ``left_right`` is ``side::right``. All entries must be positive.
-
-   b
-      Array of pointers to input matrices ``B`` with size ``total_batch_count``. See :ref:`matrix-storage` for more details.
-
-   ldb
-      Array of ``group_count`` integers. ``ldb[i]`` specifies the
-      leading dimension of ``B`` for every matrix in group ``i``. All
-      entries must be positive and at least ``m`` if
-      column major layout is used to store matrices or at least ``n``
-      if row major layout is used to store matrices.
-
-   group_count
-      Specifies the number of groups. Must be at least 0.
-
-   group_size
-      Array of ``group_count`` integers. ``group_size[i]`` specifies the
-      number of ``trsm`` operations in group ``i``. All entries must be at least 0.
-
-   dependencies
-         List of events to wait for before starting computation, if any.
-         If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   b
-      Output matrices, overwritten by the ``total_batch_count`` solution
-      matrices ``X``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``alpha`` = 0, matrix ``B`` is set to zero and the matrices ``A``
-   and ``B`` do not need to be initialized before calling ``trsm_batch``.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event trsm_batch(sycl::queue &queue,
-                              onemkl::side left_right,
-                              onemkl::uplo upper_lower,
-                              onemkl::transpose trans,
-                              onemkl::diag unit_diag,
-                              std::int64_t m,
-                              std::int64_t n,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              T *b,
-                              std::int64_t ldb,
-                              std::int64_t strideb,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event trsm_batch(sycl::queue &queue,
-                              onemkl::side left_right,
-                              onemkl::uplo upper_lower,
-                              onemkl::transpose trans,
-                              onemkl::diag unit_diag,
-                              std::int64_t m,
-                              std::int64_t n,
-                              T alpha,
-                              const T *a,
-                              std::int64_t lda,
-                              std::int64_t stridea,
-                              T *b,
-                              std::int64_t ldb,
-                              std::int64_t strideb,
-                              std::int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   left_right
-      Specifies whether the matrices ``A`` multiply ``X`` on the left
-      (``side::left``) or on the right (``side::right``). See :ref:`onemkl_datatypes` for more details.
-
-   upper_lower
-      Specifies whether the matrices ``A`` are upper or lower
-      triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to the
-      matrices ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_diag
-      Specifies whether the matrices ``A`` are assumed to be unit
-      triangular (all diagonal elements are 1). See :ref:`onemkl_datatypes` for more details.
-
-   m
-      Number of rows of the ``B`` matrices. Must be at least zero.
-
-   n
-      Number of columns of the ``B`` matrices. Must be at least zero.
-
-   alpha
-      Scaling factor for the solutions.
-
-   a
-      Pointer to input matrices ``A`` with size ``stridea`` * ``batch_size``.
-
-   lda
-      Leading dimension of the matrices ``A``. Must be at least ``m`` if
-      ``left_right`` = ``side::left``, and at least ``n`` if ``left_right`` =
-      ``side::right``. Must be positive.
-
-   stridea
-      Stride between different ``A`` matrices.
-
-   b
-      Pointer to input matrices ``B`` with size ``strideb`` * ``batch_size``.
-
-   ldb
-      Leading dimension of the matrices ``B``. It must be positive and at least
-      ``m`` if column major layout is used to store matrices or at
-      least ``n`` if row major layout is used to store matrices.
-
-   strideb
-      Stride between different ``B`` matrices. 
-
-   batch_size
-      Specifies the number of triangular linear systems to solve.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   b
-      Output matrices, overwritten by ``batch_size`` solution matrices
-      ``X``.
-
-.. container:: section
-
-   .. rubric:: Notes
-
-   If ``alpha`` = 0, matrix ``B`` is set to zero and the matrices ``A``
-   and ``B`` do not need to be initialized before calling ``trsm_batch``.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-like-extensions`
diff --git a/docs/domains/blas/trsv.rst b/docs/domains/blas/trsv.rst
deleted file mode 100644
index 1a30dad8d..000000000
--- a/docs/domains/blas/trsv.rst
+++ /dev/null
@@ -1,215 +0,0 @@
-.. _onemkl_blas_trsv:
-
-trsv
-====
-
-Solves a system of linear equations whose coefficients are in a
-triangular matrix.
-
-.. _onemkl_blas_trsv_description:
-
-.. rubric:: Description
-
-The ``trsv`` routines solve a system of linear equations whose coefficients
-are in a triangular matrix. The operation is defined as:
-
-.. math::
-
-      op(A)*x = b
-
-where:
-
-op(``A``) is one of op(``A``) = ``A``, or op(``A``) =
-``A``\ :sup:`T`, or op(``A``) = ``A``\ :sup:`H`,
-
-``A`` is an ``n``-by-``n`` unit or non-unit, upper or lower
-triangular matrix,
-
-``b`` and ``x`` are vectors of length ``n``.
-
-``trsv`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_blas_trsv_buffer:
-
-trsv (Buffer Version)
----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       void trsv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       void trsv(sycl::queue &queue,
-                 onemkl::uplo upper_lower,
-                 onemkl::transpose trans,
-                 onemkl::diag unit_nonunit,
-                 std::int64_t n,
-                 std::int64_t k,
-                 sycl::buffer<T,1> &a,
-                 std::int64_t lda,
-                 sycl::buffer<T,1> &x,
-                 std::int64_t incx)
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Buffer holding input matrix ``A``. Must have size at least
-      ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Buffer holding the ``n``-element right-hand side vector ``b``. The
-      buffer must be of size at least (1 + (``n`` - 1)*abs(``incx``)).
-      See :ref:`matrix-storage` for more details.
-
-   incx
-      Stride of vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Buffer holding the solution vector ``x``.
-
-      
-
-.. _onemkl_blas_trsv_usm:
-
-trsv (USM Version)
-------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::column_major {
-       sycl::event trsv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. code-block:: cpp
-
-   namespace oneapi::mkl::blas::row_major {
-       sycl::event trsv(sycl::queue &queue,
-                        onemkl::uplo upper_lower,
-                        onemkl::transpose trans,
-                        onemkl::diag unit_nonunit,
-                        std::int64_t n,
-                        const T *a,
-                        std::int64_t lda,
-                        T *x,
-                        std::int64_t incx,
-                        const std::vector<sycl::event> &dependencies = {})
-   }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-   queue
-      The queue where the routine should be executed.
-
-   upper_lower
-      Specifies whether ``A`` is upper or lower triangular. See :ref:`onemkl_datatypes` for more details.
-
-   trans
-      Specifies op(``A``), the transposition operation applied to
-      ``A``. See :ref:`onemkl_datatypes` for more details.
-
-   unit_nonunit
-      Specifies whether the matrix ``A`` is unit triangular or not. See :ref:`onemkl_datatypes` for more details.
-
-   n
-      Number of rows and columns of ``A``. Must be at least zero.
-
-   a
-      Pointer to input matrix ``A``. The array holding input matrix
-      ``A`` must have size at least ``lda``\ \*\ ``n``. See :ref:`matrix-storage` for
-      more details.
-
-   lda
-      Leading dimension of matrix ``A``. Must be at least ``n``, and
-      positive.
-
-   x
-      Pointer to the ``n``-element right-hand side vector ``b``. The
-      array holding the ``n``-element right-hand side vector ``b``
-      must be of size at least (1 + (``n`` - 1)*abs(``incx``)). See
-      :ref:`matrix-storage` for more details.
-
-   incx
-      Stride of vector ``x``.
-
-   dependencies
-      List of events to wait for before starting computation, if any.
-      If omitted, defaults to no dependencies.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-   x
-      Pointer to the solution vector ``x``.
-
-.. container:: section
-
-   .. rubric:: Return Values
-
-   Output event to wait on to ensure computation is complete.
-
-
-   **Parent topic:** :ref:`blas-level-2-routines`
diff --git a/docs/domains/dense_linear_algebra.rst b/docs/domains/dense_linear_algebra.rst
deleted file mode 100644
index 6544b9074..000000000
--- a/docs/domains/dense_linear_algebra.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-.. _onemkl_dense_linear_algebra:
-
-Dense Linear Algebra
----------------------
-
-This section contains information about dense linear algebra routines:
-
-:ref:`matrix-storage` provides information about dense matrix and vector storage formats that are used by oneMKL :ref:`onemkl_blas` and :ref:`onemkl_lapack`.
-
-:ref:`onemkl_blas` provides vector, matrix-vector, and matrix-matrix routines for dense matrices and vector operations.
-
-:ref:`onemkl_lapack` provides more complex dense linear algebra routines, e.g., matrix factorization, solving dense systems of linear equations, least square problems, eigenvalue and singular value problems, and performing a number of related computational tasks.
-
-.. toctree::
-    :hidden:
-
-    matrix-storage.rst
-    blas/blas.rst
-    lapack/lapack.rst
diff --git a/docs/domains/lapack/gebrd.rst b/docs/domains/lapack/gebrd.rst
deleted file mode 100644
index 7e014f97f..000000000
--- a/docs/domains/lapack/gebrd.rst
+++ /dev/null
@@ -1,230 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_gebrd:
-
-gebrd
-=====
-
-Reduces a general matrix to bidiagonal form.
-
-.. container:: section
-
-    .. rubric:: Description
-
-``gebrd`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine reduces a general :math:`m \times n` matrix :math:`A` to a 
-bidiagonal matrix :math:`B` by an orthogonal (unitary) transformation.
-
-
-If :math:`m \ge n`, the reduction is given by :math:`A=QBP^H=Q\begin{pmatrix}B_1\\0\end{pmatrix}P^H=Q_1B_1P^H`
-
-where :math:`B_{1}` is an :math:`n \times n` upper bidiagonal matrix,
-:math:`Q` and :math:`P` are orthogonal or, for a complex :math:`A`, unitary
-matrices; :math:`Q_{1}` consists of the first :math:`n` columns of
-:math:`Q`.
-
-If :math:`m < n`, the reduction is given by
-
-:math:`A = QBP^H = Q\begin{pmatrix}B_1 & 0\end{pmatrix}P^H = QB_1P_1^H`,
-
-where :math:`B_{1}` is an :math:`m \times m` lower bidiagonal matrix,
-:math:`Q` and :math:`P` are orthogonal or, for a complex :math:`A`, unitary
-matrices; :math:`P_{1}` consists of the first :math:`m` columns of
-:math:`P`.
-
-The routine does not form the matrices :math:`Q` and :math:`P` explicitly,
-but represents them as products of elementary reflectors. Routines
-are provided to work with the matrices :math:`Q` and :math:`P` in this
-representation:
-
-If the matrix :math:`A` is real,
-
--  to compute :math:`Q` and :math:`P` explicitly, call
-   :ref:`onemkl_lapack_orgbr`.
-
-If the matrix :math:`A` is complex,
-
--  to compute :math:`Q` and :math:`P` explicitly, call
-   :ref:`onemkl_lapack_ungbr`
-
-gebrd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<realT,1> &d, sycl::buffer<realT,1> &e, sycl::buffer<T,1> &tauq, sycl::buffer<T,1> &taup, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   The buffer :math:`a`, size (``lda,*``). The buffer ``a`` contains the
-   matrix :math:`A`. The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of :math:`a`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_gebrd_scratchpad_size` function.
-
-.. container:: section
-
-    .. rubric:: Output Parameters
-
-a
-   If :math:`m \ge n`, the diagonal and first super-diagonal of a are
-   overwritten by the upper bidiagonal matrix :math:`B`. The elements
-   below the diagonal, with the buffer tauq, represent the orthogonal
-   matrix :math:`Q` as a product of elementary reflectors, and the
-   elements above the first superdiagonal, with the buffer ``taup``,
-   represent the orthogonal matrix :math:`P` as a product of elementary
-   reflectors.
-
-   If :math:`m<n`, the diagonal and first sub-diagonal of a are
-   overwritten by the lower bidiagonal matrix :math:`B`. The elements
-   below the first subdiagonal, with the buffer tauq, represent the
-   orthogonal matrix :math:`Q` as a product of elementary reflectors, and
-   the elements above the diagonal, with the buffer ``taup``, represent
-   the orthogonal matrix :math:`P` as a product of elementary reflectors.
-
-d
-   Buffer, size at least :math:`\max(1, \min(m,n))`. Contains the diagonal
-   elements of :math:`B`.
-
-e
-   Buffer, size at least :math:`\max(1, \min(m,n) - 1)`. Contains the
-   off-diagonal elements of :math:`B`.
-
-tauq
-   Buffer, size at least :math:`\max(1, \min(m, n))`. The scalar factors of
-   the elementary reflectors which represent the orthogonal or
-   unitary matrix :math:`Q`.
-
-taup
-   Buffer, size at least :math:`\max(1, \min(m, n))`. The scalar factors of
-   the elementary reflectors which represent the orthogonal or
-   unitary matrix :math:`P`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-gebrd (USM Version)
--------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, RealT *d, RealT *e, T *tauq, T *taup, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-    .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   Pointer to matrix :math:`A`. The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type T.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_gebrd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-    .. rubric:: Output Parameters
-
-a
-   If :math:`m \ge n`, the diagonal and first super-diagonal of a are
-   overwritten by the upper bidiagonal matrix :math:`B`. The elements
-   below the diagonal, with the array tauq, represent the orthogonal
-   matrix :math:`Q` as a product of elementary reflectors, and the
-   elements above the first superdiagonal, with the array ``taup``,
-   represent the orthogonal matrix :math:`P` as a product of elementary
-   reflectors.
-
-   If :math:`m<n`, the diagonal and first sub-diagonal of a are
-   overwritten by the lower bidiagonal matrix :math:`B`. The elements
-   below the first subdiagonal, with the array tauq, represent the
-   orthogonal matrix :math:`Q` as a product of elementary reflectors, and
-   the elements above the diagonal, with the array ``taup``, represent
-   the orthogonal matrix :math:`P` as a product of elementary reflectors.
-
-d
-   Pointer to memory of size at least :math:`\max(1, \min(m,n))`. Contains the diagonal
-   elements of :math:`B`.
-
-e
-   Pointer to memory of size at least :math:`\max(1, \min(m,n) - 1)`. Contains the
-   off-diagonal elements of :math:`B`.
-
-tauq
-   Pointer to memory of size at least :math:`\max(1, \min(m, n))`. The scalar factors of
-   the elementary reflectors which represent the orthogonal or
-   unitary matrix :math:`Q`.
-
-taup
-   Pointer to memory of size at least :math:`\max(1, \min(m, n))`. The scalar factors of
-   the elementary reflectors which represent the orthogonal or
-   unitary matrix :math:`P`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-    .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/gebrd_scratchpad_size.rst b/docs/domains/lapack/gebrd_scratchpad_size.rst
deleted file mode 100644
index 954c8b032..000000000
--- a/docs/domains/lapack/gebrd_scratchpad_size.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_gebrd_scratchpad_size:
-
-gebrd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_gebrd` function.
-
-.. rubric:: Description
-
-``gebrd_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>``
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_gebrd` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_gebrd` function will be performed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-   .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_gebrd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/geqrf.rst b/docs/domains/lapack/geqrf.rst
deleted file mode 100644
index 31a2c97ce..000000000
--- a/docs/domains/lapack/geqrf.rst
+++ /dev/null
@@ -1,157 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_geqrf:
-
-geqrf
-=====
-
-Computes the QR factorization of a general :math:`m \times n` matrix.
-
-.. rubric:: Description
-
-``geqrf`` supports the following precisions:
-
-.. list-table:: 
-   :header-rows: 1
-
-   * -  T 
-   * -  ``float`` 
-   * -  ``double`` 
-   * -  ``std::complex<float>`` 
-   * -  ``std::complex<double>`` 
-
-The routine forms the QR factorization of a general
-:math:`m \times n` matrix :math:`A`. No pivoting is performed.
-
-The routine does not form the matrix :math:`Q` explicitly. Instead, :math:`Q`
-is represented as a product of :math:`\min(m, n)` elementary
-reflectors. Routines are provided to work with :math:`Q` in this
-representation.
-
-geqrf (Buffer Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-    .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in :math:`A` (:math:`0 \le n`).
-
-a
-   Buffer holding input matrix :math:`A`. Must have size at least
-   :math:`\text{lda} \cdot n`.
-
-lda
-   The leading dimension of :math:`A`; at least :math:`\max(1, m)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_geqrf_scratchpad_size` function.
-
-.. container:: section
-
-    .. rubric:: Output Parameters
-
-a
-   Output buffer, overwritten by the factorization data as follows:
-
-   The elements on and above the diagonal of the array contain the
-   :math:`\min(m,n) \times n` upper trapezoidal matrix :math:`R` (:math:`R` is upper
-   triangular if :math:`m \ge n`); the elements below the diagonal, with the
-   array tau, represent the orthogonal matrix :math:`Q` as a product of
-   :math:`\min(m,n)` elementary reflectors.
-
-tau
-   Output buffer, size at least :math:`\max(1, \min(m, n))`. Contains scalars
-   that define elementary reflectors for the matrix :math:`Q` in its
-   decomposition in a product of elementary reflectors.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-geqrf (USM Version)
-----------------------
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-    .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in :math:`A` (:math:`0 \le n`).
-
-a
-   Pointer to memory holding input matrix :math:`A`. Must have size at least
-   :math:`\text{lda} \cdot n`.
-
-lda
-   The leading dimension of :math:`A`; at least :math:`\max(1, m)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_geqrf_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-
-.. container:: section
-
-    .. rubric:: Output Parameters
-
-a
-   Overwritten by the factorization data as follows:
-
-   The elements on and above the diagonal of the array contain the
-   :math:`\min(m,n) \times n` upper trapezoidal matrix :math:`R` (:math:`R` is upper
-   triangular if :math:`m \ge n`); the elements below the diagonal, with the
-   array tau, represent the orthogonal matrix :math:`Q` as a product of
-   :math:`\min(m,n)` elementary reflectors.
-
-tau
-   Array, size at least :math:`\max(1, \min(m, n))`. Contains scalars
-   that define elementary reflectors for the matrix :math:`Q` in its
-   decomposition in a product of elementary reflectors.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-    .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/geqrf_batch.rst b/docs/domains/lapack/geqrf_batch.rst
deleted file mode 100644
index 12581248c..000000000
--- a/docs/domains/lapack/geqrf_batch.rst
+++ /dev/null
@@ -1,239 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_geqrf_batch:
-
-geqrf_batch
-===========
-
-Computes the QR factorizations of a batch of general matrices.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``geqrf_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_lapack_geqrf_batch_buffer:
-
-geqrf_batch (Buffer Version)
-----------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The buffer version of ``geqrf_batch`` supports only the strided API. 
- 
-**Strided API**
-
-.. container:: section
-
-   .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, sycl::buffer<T> &tau, std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue  
-   Device queue where calculations will be performed.
- 
-m
-   Number of rows in matrices :math:`A_i` (:math:`0 \le m`).
-
-n  
-   Number of columns in matrices :math:`A_i` (:math:`0 \le n`).
-
-a
-   Array holding input matrices :math:`A_i`. 
-
-lda
-   Leading dimension of matrices :math:`A_i`.
-
-stride_a
-   Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_tau
-   Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-   Number of problems in a batch.
-
-scratchpad
-   Scratchpad memory to be used by routine for storing intermediate results.
-         
-scratchpad_size
-   Size of scratchpad memory as the number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_geqrf_batch_scratchpad_size` function.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
- 
-a
-  Factorization data as follows: The elements on and above the diagonal of :math:`A_i` contain the :math:`\min(m,n) \times n` upper trapezoidal matrices :math:`R_i` (:math:`R_i` is upper triangular if :math:`m \ge n`); the elements below the diagonal, with the array :math:`\tau_i`, contain the orthogonal matrix :math:`Q_i` as a product of :math:`\min(m,n)` elementary reflectors.
-
-tau 
-    Array to store batch of :math:`\tau_i`, each of size :math:`\min(m,n)`, containing scalars that define elementary reflectors for the matrices :math:`Q_i` in its decomposition in a product of elementary reflectors.
-
-geqrf_batch (USM Version)
--------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The USM version of ``geqrf_batch`` supports the group API and strided API. 
-
-**Group API**
-
-The routine forms the :math:`Q_iR_i` factorizations of general :math:`m \times n` matrices :math:`A_i`, :math:`i \in \{1...batch\_size\}`, where ``batch_size`` is the sum of all parameter group sizes as provided with ``group_sizes`` array.
-No pivoting is performed during factorization.
-The routine does not form the matrices :math:`Q_i` explicitly. Instead, :math:`Q_i` is represented as a product of :math:`\min(m,n)` elementary reflectors. Routines are provided to work with :math:`Q_i` in this representation.
-The total number of problems to solve, ``batch_size``, is a sum of sizes of all of the groups of parameters as provided by ``group_sizes`` array.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, T **a, std::int64_t *lda, T **tau, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue 
-  Device queue where calculations will be performed.
-
-m
-  Array of ``group_count`` :math:`m_g` parameters. Each :math:`m_g` specifies the number of rows in matrices :math:`A_i` from array ``a``, belonging to group :math:`g`.
-
-n 
-  Array of ``group_count`` :math:`n_g` parameters.
-  Each :math:`n_g` specifies the number of columns in matrices :math:`A_i` from array ``a``, belonging to group :math:`g`.
-
-a  
-  Array of ``batch_size`` pointers to input matrices :math:`A_i`, each of size :math:`\text{lda}_g\cdot n_g` (:math:`g` is an index of group to which :math:`A_i` belongs)
-
-lda
-  Array of ``group_count`` :math:`\text{lda}_g` parameters, each representing the leading dimensions of input matrices :math:`A_i` from array ``a``, belonging to group :math:`g`.
-
-group_count
-  Specifies the number of groups of parameters. Must be at least 0.
-
-group_sizes 
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as the number of floating point elements of type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_geqrf_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  Factorization data as follows: The elements on and above the diagonal of :math:`A_i` contain the :math:`\min(m_g,n_g) \times n_g` upper trapezoidal matrices :math:`R_i` (:math:`R_i` is upper triangular if :math:`m_g \ge n_g`); the elements below the diagonal, with the array :math:`\tau_i`, contain the orthogonal matrix :math:`Q_i` as a product of :math:`\min(m_g,n_g)` elementary reflectors. Here :math:`g` is the index of the parameters group corresponding to the :math:`i`-th decomposition.
-
-tau
-  Array of pointers to store arrays :math:`\tau_i`, each of size :math:`\min(m_g,n_g)`, containing scalars that define elementary reflectors for the matrices :math:`Q_i` in its decomposition in a product of elementary reflectors. Here :math:`g` is the index of the parameters group corresponding to the :math:`i`-th decomposition.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-The routine forms the :math:`Q_iR_i` factorizations of general :math:`m \times n` matrices :math:`A_i`. No pivoting is performed.
-The routine does not form the matrices :math:`Q_i` explicitly. Instead, :math:`Q_i` is represented as a product of :math:`\min(m,n)` elementary reflectors. Routines are provided to work with :math:`Q_i` in this representation.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a, T *tau, std::int64_t stride_tau, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m 
-  Number of rows in matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in matrices :math:`A_i` (:math:`0 \le n`).
-
-a
-  Array holding input matrices :math:`A_i`.
-
-lda
-  Leading dimensions of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_tau
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as the number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_geqrf_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  Factorization data as follows: The elements on and above the diagonal of :math:`A_i` contain the :math:`\min(m,n) \times n` upper trapezoidal matrices :math:`R_i` (:math:`R_i` is upper triangular if :math:`m \ge n`); the elements below the diagonal, with the array :math:`\tau_i`, contain the orthogonal matrix :math:`Q_i` as a product of :math:`\min(m,n)` elementary reflectors.
-
-tau
-  Array to store batch of :math:`\tau_i`, each of size :math:`\min(m,n)`, containing scalars that define elementary reflectors for the matrices :math:`Q_i` in its decomposition in a product of elementary reflectors.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
diff --git a/docs/domains/lapack/geqrf_batch_scratchpad_size.rst b/docs/domains/lapack/geqrf_batch_scratchpad_size.rst
deleted file mode 100644
index bea681f3d..000000000
--- a/docs/domains/lapack/geqrf_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,111 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_geqrf_batch_scratchpad_size:
-
-geqrf_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_geqrf_batch` function.
-
-.. rubric:: Description
-
-``geqrf_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` that the scratchpad memory passed to the Group API of the :ref:`onemkl_lapack_geqrf_batch` function should be able to hold.
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-m
- | Array of ``group_count`` :math:`m_g` parameters.
- | Each of :math:`m_g` specifies the number of rows in the matrices :math:`A_i` belonging to group :math:`g`.
-
-n
- | Array of ``group_count`` :math:`n_g` parameters.
- | Each of :math:`n_g` specifies the number of columns in the matrices :math:`A_i` belonging to group :math:`g`.
-
-lda
-  Array of ``group_count`` :math:`lda_g` parameters, each representing the leading dimensions of input matrices belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Number of elements of type ``T`` that the scratchpad memory passed to the Group API of the :ref:`onemkl_lapack_geqrf_batch` function should be able to hold.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` that the scratchpad memory passed to the Strided API of the :ref:`onemkl_lapack_geqrf_batch` function should be able to hold.
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size)
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in :math:`A_i` (:math:`0 \le n`).
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_tau
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Number of problems in a batch.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Number of elements of type ``T`` that the scratchpad memory passed to the Strided API of the :ref:`onemkl_lapack_geqrf_batch` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
diff --git a/docs/domains/lapack/geqrf_scratchpad_size.rst b/docs/domains/lapack/geqrf_scratchpad_size.rst
deleted file mode 100644
index 8541bc724..000000000
--- a/docs/domains/lapack/geqrf_scratchpad_size.rst
+++ /dev/null
@@ -1,64 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_geqrf_scratchpad_size:
-
-geqrf_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_geqrf` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``geqrf_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>``
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_geqrf` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_geqrf` function will be performed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_geqrf` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/gerqf.rst b/docs/domains/lapack/gerqf.rst
deleted file mode 100644
index 7f072eba3..000000000
--- a/docs/domains/lapack/gerqf.rst
+++ /dev/null
@@ -1,148 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_gerqf:
-
-gerqf
-=====
-
-Computes the RQ factorization of a general :math:`m \times n` matrix.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``gerqf`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>``
-
-The routine forms the RQ factorization of a general :math:`m \times n` matrix :math:`A`. No pivoting is performed.
-The routine does not form the matrix :math:`Q` explicitly. Instead, :math:`Q` is represented as a product of :math:`\min(m, n)` elementary reflectors. Routines are provided to work with :math:`Q` in this representation.
-
-gerqf (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations will be performed.
-   
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-   
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-   
-a
-   Buffer holding input matrix :math:`A`. The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-   
-lda
-   The leading dimension of ``a``, at least :math:`\max(1, m)`.
-      
-scratchpad
-   Buffer holding scratchpad memory to be used by the routine for storing intermediate results.
-   
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the :ref:`onemkl_lapack_gerqf_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Output buffer, overwritten by the factorization data as follows:
-
-   If :math:`m \le n`, the upper triangle of the subarray ``a(1:m, n-m+1:n)`` contains the :math:`m \times m` upper triangular matrix :math:`R`; if :math:`m \ge n`, the elements on and above the :math:`(m-n)`-th subdiagonal contain the :math:`m \times n` upper trapezoidal matrix :math:`R`.
-
-   In both cases, the remaining elements, with the array ``tau``, represent the orthogonal/unitary matrix :math:`Q` as a product of :math:`\min(m,n)` elementary reflectors.
-
-tau
-   Array, size at least :math:`\min(m,n)`.
-
-   Contains scalars that define elementary reflectors for the matrix :math:`Q` in its decomposition in a product of elementary reflectors.
-
-gerqf (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations will be performed.
-   
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-   
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-   
-a
-   Pointer to array holding input matrix :math:`A`. The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-   
-lda
-   The leading dimension of ``a``, at least :math:`\max(1, m)`.
-      
-scratchpad
-   Pointer to scratchpad memory to be used by the routine for storing intermediate results.
-   
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the :ref:`onemkl_lapack_gerqf_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Output array, overwritten by the factorization data as follows:
-
-   If :math:`m \le n`, the upper triangle of the subarray ``a(1:m, n-m+1:n)`` contains the :math:`m \times m` upper triangular matrix :math:`R`; if :math:`m \ge n`, the elements on and above the :math:`(m-n)`-th subdiagonal contain the :math:`m \times n` upper trapezoidal matrix :math:`R`.
-
-   In both cases, the remaining elements, with the array ``tau``, represent the orthogonal/unitary matrix :math:`Q` as a product of :math:`\min(m,n)` elementary reflectors.
-
-tau
-   Array, size at least :math:`\min(m,n)`.
-
-   Contains scalars that define elementary reflectors for the matrix :math:`Q` in its decomposition in a product of elementary reflectors.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/gerqf_scratchpad_size.rst b/docs/domains/lapack/gerqf_scratchpad_size.rst
deleted file mode 100644
index f35d02ef6..000000000
--- a/docs/domains/lapack/gerqf_scratchpad_size.rst
+++ /dev/null
@@ -1,68 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_gerqf_scratchpad_size:
-
-gerqf_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_gerqf` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``gerqf_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-  
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_gerqf` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-gerqf_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by the gerqf (buffer or USM version) function will be performed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1,m)`.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_gerqf` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/gesvd.rst b/docs/domains/lapack/gesvd.rst
deleted file mode 100644
index f778d7a73..000000000
--- a/docs/domains/lapack/gesvd.rst
+++ /dev/null
@@ -1,344 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_gesvd:
-
-gesvd
-=====
-
-Computes the singular value decomposition of a general rectangular matrix.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``gesvd`` supports the following precisions.
-
-    .. list-table::
-       :header-rows: 1
-
-       * -  T
-       * -  ``float``
-       * -  ``double``
-       * -  ``std::complex<float>``
-       * -  ``std::complex<double>``
-
-.. _onemkl_lapack_gesvd_batch_buffer:
-
-gesvd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The routine computes the singular value decomposition (SVD) of a
-real/complex :math:`m \times n` matrix :math:`A`, optionally computing the
-left and/or right singular vectors. The SVD is written as
-
-:math:`A = U\Sigma V^T` for real routines
-
-:math:`A = U\Sigma V^H` for complex routines
-
-where :math:`\Sigma` is an :math:`m \times n` diagonal matrix, :math:`U` is an
-:math:`m \times m` orthogonal/unitary matrix, and :math:`V` is an
-:math:`n \times n` orthogonal/unitary matrix. The diagonal elements of :math:`\Sigma`
-are the singular values of :math:`A`; they are real and non-negative, and
-are returned in descending order. The first :math:`\min(m, n)` columns of
-:math:`U` and :math:`V` are the left and right singular vectors of :math:`A`.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void gesvd(sycl::queue &queue, oneapi::mkl::job jobu, oneapi::mkl::job jobvt, std::int64_t m, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<realT,1> &s, sycl::buffer<T,1> &u, std::int64_t ldu, sycl::buffer<T,1> &vt, std::int64_t ldvt, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-jobu
-   Must be ``job::allvec``, ``job::somevec``, ``job::overwritevec``,
-   or ``job::novec``. Specifies options for computing all or part of
-   the matrix :math:`U`.
-
-   If ``jobu = job::allvec``, all :math:`m` columns of :math:`U` are returned
-   in the buffer ``u``;
-
-   if ``jobu = job::somevec``, the first :math:`\min(m, n)` columns of
-   :math:`U` (the left singular vectors) are returned in the buffer ``u``;
-
-   if ``jobu = job::overwritevec``, the first :math:`\min(m, n)` columns
-   of :math:`U` (the left singular vectors) are overwritten on the buffer
-   ``a``;
-
-   if ``jobu = job::novec``, no columns of :math:`U` (no left singular
-   vectors) are computed.
-
-jobvt
-   Must be ``job::allvec``, ``job::somevec``, ``job::overwritevec``, or
-   ``job::novec``. Specifies options for computing all or part of the
-   matrix :math:`V^T/V^H`.
-
-   If ``jobvt = job::allvec``, all :math:`n` rows of :math:`V^T/V^H` are
-   returned in the buffer ``vt``;
-
-   if ``jobvt = job::somevec``, the first :math:`\min(m, n)` rows of
-   :math:`V^T/V^H` (the right singular vectors) are returned in the buffer
-   ``vt``;
-
-   if ``jobvt = job::overwritevec``, the first :math:`\min(m, n)` rows
-   of :math:`V^T/V^H` (the right singular vectors) are overwritten on the
-   buffer ``a``;
-
-   if ``jobvt = job::novec``, no rows of :math:`V^T/V^H` (no right
-   singular vectors) are computed.
-
-   ``jobvt`` and ``jobu`` cannot both be ``job::overwritevec``.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-a
-   The buffer ``a``, size ``(lda,*)``. The buffer ``a`` contains the
-   matrix :math:`A`. The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-ldu
-   The leading dimension of ``u``.
-
-ldvt
-   The leading dimension of ``vt``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_gesvd_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   On exit,
-
-   If ``jobu = job::overwritevec``, ``a`` is overwritten with the first
-   :math:`\min(m,n)` columns of :math:`U` (the left singular vectors stored
-   columnwise);
-
-   If ``jobvt = job::overwritevec``, ``a`` is overwritten with the first
-   :math:`\min(m, n)` rows of :math:`V^{T}`/:math:`V^{H}` (the right
-   singular vectors stored rowwise);
-
-   If ``jobu`` :math:`\ne` ``job::overwritevec`` and ``jobvt`` :math:`\ne` ``job::overwritevec``,
-   the contents of ``a`` are destroyed.
-
-s
-   Buffer containing the singular values, size at least
-   :math:`\max(1, \min(m,n))`. Contains the singular values of :math:`A` sorted
-   so that :math:`s(i) \ge s(i+1)`.
-
-u
-   Buffer containing :math:`U`; the second dimension of ``u`` must be at
-   least :math:`\max(1, m)` if ``jobu = job::allvec``, and at least
-   :math:`\max(1, \min(m, n))` if ``jobu = job::somevec``.
-
-   If ``jobu = job::allvec``, ``u`` contains the :math:`m \times m`
-   orthogonal/unitary matrix :math:`U`.
-
-   If ``jobu = job::somevec``, ``u`` contains the first :math:`\min(m, n)`
-   columns of :math:`U` (the left singular vectors stored column-wise).
-
-   If ``jobu = job::novec`` or ``job::overwritevec``, ``u`` is not
-   referenced.
-
-vt
-   Buffer containing :math:`V^{T}`; the second dimension of ``vt`` must
-   be at least :math:`\max(1, n)`.
-
-   If ``jobvt = job::allvec``, ``vt`` contains the :math:`n \times n`
-   orthogonal/unitary matrix :math:`V^{T}`/:math:`V^{H}`.
-
-   If ``jobvt = job::somevec``, ``vt`` contains the first :math:`\min(m, n)`
-   rows of :math:`V^{T}`/:math:`V^{H}` (the right singular
-   vectors stored row-wise).
-
-   If ``jobvt = job::novec`` or ``job::overwritevec``, ``vt`` is not
-   referenced.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-gesvd (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The routine computes the singular value decomposition (SVD) of a
-real/complex :math:`m \times n` matrix :math:`A`, optionally computing the
-left and/or right singular vectors. The SVD is written as
-
-:math:`A = U\Sigma V^T` for real routines
-
-:math:`A = U\Sigma V^H` for complex routines
-
-where :math:`\Sigma` is an :math:`m \times n` diagonal matrix, :math:`U` is an
-:math:`m \times m` orthogonal/unitary matrix, and :math:`V` is an
-:math:`n \times n` orthogonal/unitary matrix. The diagonal elements of :math:`\Sigma`
-are the singular values of :math:`A`; they are real and non-negative, and
-are returned in descending order. The first :math:`\min(m, n)` columns of
-:math:`U` and :math:`V` are the left and right singular vectors of :math:`A`.
-
-.. container:: section
-  
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event gesvd(sycl::queue &queue, oneapi::mkl::job jobu, oneapi::mkl::job jobvt, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, RealT *s, T *u, std::int64_t ldu, T *vt, std::int64_t ldvt, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-jobu
-   Must be ``job::allvec``, ``job::somevec``, ``job::overwritevec``,
-   or ``job::novec``. Specifies options for computing all or part of
-   the matrix :math:`U`.
-
-   If ``jobu = job::allvec``, all :math:`m` columns of :math:`U` are returned
-   in the array ``u``;
-
-   if ``jobu = job::somevec``, the first :math:`\min(m, n)` columns of
-   :math:`U` (the left singular vectors) are returned in the array ``u``;
-
-   if ``jobu = job::overwritevec``, the first :math:`\min(m, n)` columns
-   of :math:`U` (the left singular vectors) are overwritten on the array
-   ``a``;
-
-   if ``jobu = job::novec``, no columns of :math:`U` (no left singular
-   vectors) are computed.
-
-jobvt
-   Must be ``job::allvec``, ``job::somevec``, ``job::overwritevec``, or
-   ``job::novec``. Specifies options for computing all or part of the
-   matrix :math:`V^T/V^H`.
-
-   If ``jobvt = job::allvec``, all :math:`n` rows of :math:`V^T/V^H` are
-   returned in the array ``vt``;
-
-   if ``jobvt = job::somevec``, the first :math:`\min(m, n)` rows of
-   :math:`V^T/V^H` (the right singular vectors) are returned in the array
-   ``vt``;
-
-   if ``jobvt = job::overwritevec``, the first :math:`\min(m, n)` rows
-   of :math:`V^T/V^H` (the right singular vectors) are overwritten on the
-   array ``a``;
-
-   if ``jobvt = job::novec``, no rows of :math:`V^T/V^H` (no right
-   singular vectors) are computed.
-
-   ``jobvt`` and ``jobu`` cannot both be ``job::overwritevec``.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-a
-   Pointer to array ``a``, size ``(lda,*)``, containing the
-   matrix :math:`A`. The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-ldu
-   The leading dimension of ``u``.
-
-ldvt
-   The leading dimension of ``vt``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_gesvd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   On exit,
-
-   If ``jobu = job::overwritevec``, ``a`` is overwritten with the first
-   :math:`\min(m,n)` columns of :math:`U` (the left singular vectors stored
-   columnwise);
-
-   If ``jobvt = job::overwritevec``, ``a`` is overwritten with the first
-   :math:`\min(m, n)` rows of :math:`V^{T}`/:math:`V^{H}` (the right
-   singular vectors stored rowwise);
-
-   If ``jobu`` :math:`\ne` ``job::overwritevec`` and ``jobvt`` :math:`\ne` ``job::overwritevec``,
-   the contents of ``a`` are destroyed.
-
-s
-   Array containing the singular values, size at least
-   :math:`\max(1, \min(m,n))`. Contains the singular values of :math:`A` sorted
-   so that :math:`s(i) \ge s(i+1)`.
-
-u
-   Array containing :math:`U`; the second dimension of ``u`` must be at
-   least :math:`\max(1, m)` if ``jobu = job::allvec``, and at least
-   :math:`\max(1, \min(m, n))` if ``jobu = job::somevec``.
-
-   If ``jobu = job::allvec``, ``u`` contains the :math:`m \times m`
-   orthogonal/unitary matrix :math:`U`.
-
-   If ``jobu = job::somevec``, ``u`` contains the first :math:`\min(m, n)`
-   columns of :math:`U` (the left singular vectors stored column-wise).
-
-   If ``jobu = job::novec`` or ``job::overwritevec``, ``u`` is not
-   referenced.
-
-vt
-   Array containing :math:`V^{T}`; the second dimension of ``vt`` must
-   be at least :math:`\max(1, n)`.
-
-   If ``jobvt = job::allvec``, ``vt`` contains the :math:`n \times n`
-   orthogonal/unitary matrix :math:`V^{T}`/:math:`V^{H}`.
-
-   If ``jobvt = job::somevec``, ``vt`` contains the first :math:`\min(m, n)`
-   rows of :math:`V^{T}`/:math:`V^{H}` (the right singular
-   vectors stored row-wise).
-
-   If ``jobvt = job::novec`` or ``job::overwritevec``, ``vt`` is not
-   referenced.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
diff --git a/docs/domains/lapack/gesvd_scratchpad_size.rst b/docs/domains/lapack/gesvd_scratchpad_size.rst
deleted file mode 100644
index fab43c61f..000000000
--- a/docs/domains/lapack/gesvd_scratchpad_size.rst
+++ /dev/null
@@ -1,111 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_gesvd_scratchpad_size:
-
-gesvd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_gesvd` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``gesvd_scratchpad_size`` supports the following precisions.
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -  T 
-         * -  ``float`` 
-         * -  ``double`` 
-         * -  ``std::complex<float>`` 
-         * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_gesvd` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-gesvd_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobu, oneapi::mkl::job jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) 
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_gesvd` function will be performed.
-
-jobu
-   Must be ``job::allvec``, ``job::somevec``,
-   ``job::overwritevec``, or ``job::novec``. Specifies options for
-   computing all or part of the matrix :math:`U`.
-
-   If ``jobu = job::allvec``, all :math:`m` columns of :math:`U` are
-   returned in the buffer ``u``;
-
-   if ``jobu = job::somevec``, the first :math:`\min(m, n)` columns of
-   :math:`U` (the left singular vectors) are returned in the buffer ``u``;
-
-   if ``jobu = job::overwritevec``, the first :math:`\min(m, n)`
-   columns of :math:`U` (the left singular vectors) are overwritten on
-   the buffer ``a``;
-
-   if ``jobu = job::novec``, no columns of :math:`U` (no left singular
-   vectors) are computed.
-
-jobvt
-   Must be ``job::allvec``, ``job::somevec``,
-   ``job::overwritevec``, or ``job::novec``. Specifies options for
-   computing all or part of the matrix :math:`V^T/V^H`.
-
-   If ``jobvt = job::allvec``, all :math:`n` rows of :math:`V^T/V^H` are
-   returned in the buffer ``vt``;
-
-   if ``jobvt = job::somevec``, the first :math:`\min(m, n)` rows of
-   :math:`V^T/V^H` (the right singular vectors) are returned in the
-   buffer ``vt``;
-
-   if ``jobvt = job::overwritevec``, the first :math:`\min(m, n)`
-   rows of :math:`V^T/V^H` (the right singular vectors) are
-   overwritten on the buffer ``a``;
-
-   if ``jobvt = job::novec``, no rows of :math:`V^T/V^H` (no right
-   singular vectors) are computed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-ldu
-   The leading dimension of ``u``.
-
-ldvt
-   The leading dimension of ``vt``.
-
-.. container:: section
-
-   .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_gesvd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/getrf.rst b/docs/domains/lapack/getrf.rst
deleted file mode 100644
index d23594843..000000000
--- a/docs/domains/lapack/getrf.rst
+++ /dev/null
@@ -1,144 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrf:
-
-getrf
-=====
-
-Computes the LU factorization of a general :math:`m \times n` matrix.
-
-.. container:: section
-
-   .. rubric:: Description
-
-``getrf`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-The routine computes the LU factorization of a general
-:math:`m \times n` matrix :math:`A` as :math:`A = PLU`,
-
-where :math:`P` is a permutation matrix, :math:`L` is lower triangular with
-unit diagonal elements (lower trapezoidal if :math:`m > n`) and :math:`U` is
-upper triangular (upper trapezoidal if :math:`m < n`). The routine uses
-partial pivoting, with row interchanges.
-
-getrf (Buffer Version)
-----------------------
-
-.. container:: section
-
-   .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<std::int64_t,1> &ipiv, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-m
-    The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-    The number of columns in :math:`A` (:math:`0 \le n`).
-
-a
-   Buffer holding input matrix :math:`A`. The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-      Size of scratchpad memory as a number of floating point elements of type ``T``.
-      Size should not be less than the value returned by :ref:`onemkl_lapack_getrf_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by :math:`L` and :math:`U`. The unit diagonal    elements of :math:`L` are not stored.
-
-ipiv
-   Array, size at least :math:`\max(1,\min(m, n))`. Contains the    pivot indices; for :math:`1 \le i \le \min(m, n)`, row :math:`i` was interchanged with   row :math:`\text{ipiv}(i)`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-getrf (USM Version)
-----------------------
-
-.. container:: section
-
-   .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-m
-    The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-    The number of columns in :math:`A` (:math:`0 \le n`).
-
-a
-   Pointer to array holding input matrix :math:`A`. The second dimension of ``a`` must be at least   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_getrf_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by :math:`L` and :math:`U`. The unit diagonal    elements of :math:`L` are not stored.
-
-ipiv
-   Array, size at least :math:`\max(1,\min(m, n))`. Contains the    pivot indices; for :math:`1 \le i \le \min(m, n)`, row :math:`i` was interchanged with   row :math:`\text{ipiv}(i)`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/getrf_batch.rst b/docs/domains/lapack/getrf_batch.rst
deleted file mode 100644
index 502707ee5..000000000
--- a/docs/domains/lapack/getrf_batch.rst
+++ /dev/null
@@ -1,226 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrf_batch:
-
-getrf_batch
-===========
-
-Computes the LU factorizations of a batch of general matrices.
-
-.. rubric:: Description
-
-``getrf_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_lapack_getrf_batch_buffer:
-
-getrf_batch (Buffer Version)
-----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``getrf_batch`` supports only the strided API. 
-
-**Strided API**
-
-The routine computes the LU factorizations of general :math:`m \times n` matrices :math:`A_i` as :math:`A_i = P_iL_iU_i`, where :math:`P_i` is a permutation matrix, :math:`L_i` is lower triangular with unit diagonal elements (lower trapezoidal if :math:`m > n`) and :math:`U_i` is upper triangular (upper trapezoidal if :math:`m < n`). The routine uses partial pivoting, with row interchanges.
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in matrices :math:`A_i` (:math:`0 \le n`).
-
-a
-  Array holding input matrices :math:`A_i`.
-
-lda
-  Leading dimension of matrices :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_getrf_batch_scratchpad_size` function.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  :math:`L_i` and :math:`U_i`. The unit diagonal elements of :math:`L_i` are not stored.
-
-ipiv
-  Array containing batch of the pivot indices :math:`\text{ipiv}_i` each of size at least :math:`\max(1,\min(m,n))`; for :math:`1 \le k \le \min(m,n)`, where row :math:`k` of :math:`A_i` was interchanged with row :math:`\text{ipiv}_i(k)`.
-
-.. _onemkl_lapack_getrf_batch_usm:
-
-getrf_batch (USM Version)
--------------------------
-
-.. rubric:: Description
-
-The USM version of ``getrf_batch`` supports the group API and strided API. 
-
-**Group API**
-
-The routine computes the batch of LU factorizations of general :math:`m \times n` matrices :math:`A_i` (:math:`i \in \{1...batch\_size\}`) as :math:`A_i = P_iL_iU_i`, where :math:`P_i` is a permutation matrix, :math:`L_i` is lower triangular with unit diagonal elements (lower trapezoidal if :math:`m > n`) and :math:`U_i` is upper triangular (upper trapezoidal if :math:`m < n`). The routine uses partial pivoting, with row interchanges. Total number of problems to solve, ``batch_size``, is a sum of sizes of all of the groups of parameters as provided by ``group_sizes`` array.
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, T **a, std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Array of ``group_count`` parameters :math:`m_g` specifying the number of rows in matrices :math:`A_i` (:math:`0 \le m_g`) belonging to group :math:`g`.
-
-n
-  Array of ``group_count`` parameters :math:`n_g` specifying the number of columns in matrices :math:`A_i` (:math:`0 \le n_g`) belonging to group :math:`g`.
-
-a
-  Array holding ``batch_size`` pointers to input matrices :math:`A_i`.
-
-lda
-  Array of ``group_count`` parameters :math:`lda_g` specifying the leading dimensions of :math:`A_i` belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_getrf_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  :math:`L_i` and :math:`U_i`. The unit diagonal elements of :math:`L_i` are not stored.
-
-ipiv
-  Arrays of batch_size pointers to arrays containing pivot indices :math:`\text{ipiv}_i` each of size at least :math:`\max(1,\min(m_g,n_g))`; for :math:`1 \le k \le \min(m_g,n_g)`, where row :math:`k` of :math:`A_i` was interchanged with row :math:`\text{ipiv}_i(k)`.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-The routine computes the LU factorizations of general :math:`m \times n` matrices :math:`A_i` as :math:`A_i = P_iL_iU_i`, where :math:`P_i` is a permutation matrix, :math:`L_i` is lower triangular with unit diagonal elements (lower trapezoidal if :math:`m > n`) and :math:`U_i` is upper triangular (upper trapezoidal if :math:`m < n`). The routine uses partial pivoting, with row interchanges.
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    };
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in matrices :math:`A_i` (:math:`0 \le n`).
-
-a
-  Array holding input matrices :math:`A_i`.
-
-lda
-  Leading dimension of matrices :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_getrf_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  :math:`L_i` and :math:`U_i`. The unit diagonal elements of :math:`L_i` are not stored.
-
-ipiv
-  Array containing batch of the pivot indices :math:`\text{ipiv}_i` each of size at least :math:`\max(1,\min(m,n))`; for :math:`1 \le k \le \min(m,n)`, where row :math:`k` of :math:`A_i` was interchanged with row :math:`\text{ipiv}_i(k)`.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
diff --git a/docs/domains/lapack/getrf_batch_scratchpad_size.rst b/docs/domains/lapack/getrf_batch_scratchpad_size.rst
deleted file mode 100644
index 27cfdb4da..000000000
--- a/docs/domains/lapack/getrf_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,117 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrf_batch_scratchpad_size:
-
-getrf_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_getrf_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``getrf_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_getrf_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Array of ``group_count`` parameters :math:`m_g` specifying the number of rows in the matrices belonging to group :math:`g`.
-
-n
-  Array of ``group_count`` parameters :math:`n_g` specifying the number of columns in matrices belonging to group :math:`g`.
-
-lda
-  Array of ``group_count`` parameters :math:`\text{lda}_g` specifying the leading dimensions of matrices belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_getrf_batch` function.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_getrf_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size)
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in :math:`A_i` (:math:`0 \le n`).
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch  array ``a``.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-batch_size
-  Number of problems in a batch.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_getrf_batch` function.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/getrf_scratchpad_size.rst b/docs/domains/lapack/getrf_scratchpad_size.rst
deleted file mode 100644
index ae382d0fc..000000000
--- a/docs/domains/lapack/getrf_scratchpad_size.rst
+++ /dev/null
@@ -1,67 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrf_scratchpad_size:
-
-getrf_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_getrf` function.
-
-.. container:: section
-
-   .. rubric:: Description
-
-``getrf_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_getrf` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-getrf_scratchpad_size
----------------------
-
-.. container:: section
-
-   .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_getrf` function will be performed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a`` :math:`(m \le \text{lda})`.
-
-.. container:: section
-
-   .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_getrf` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/getri.rst b/docs/domains/lapack/getri.rst
deleted file mode 100644
index d0f1854e8..000000000
--- a/docs/domains/lapack/getri.rst
+++ /dev/null
@@ -1,138 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getri:
-
-getri
-=====
-
-Computes the inverse of an LU-factored general matrix determined by
-:ref:`onemkl_lapack_getrf`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``getri`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-  
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine computes the inverse :math:`A^{-1}` of a general matrix
-:math:`A`. Before calling this routine, call :ref:`onemkl_lapack_getrf`
-to factorize :math:`A`.
-
-getri (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<std::int64_t,1> &ipiv, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-a
-   The buffer ``a`` as returned by :ref:`onemkl_lapack_getrf`. Must
-   be of size at least :math:`\text{lda} \cdot \max(1,n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-ipiv
-   The buffer as returned by :ref:`onemkl_lapack_getrf`. The
-   dimension of ``ipiv`` must be at least :math:`\max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_getri_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by the :math:`n \times n` inverse matrix :math:`A^{-1}`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-getri (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getri(sycl::queue &queue, std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-a
-   The array as returned by :ref:`onemkl_lapack_getrf`. Must
-   be of size at least :math:`\text{lda} \cdot \max(1,n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-ipiv
-   The array as returned by :ref:`onemkl_lapack_getrf`. The
-   dimension of ``ipiv`` must be at least :math:`\max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_getri_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by the :math:`n \times n` inverse matrix :math:`A^{-1}`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/getri_batch.rst b/docs/domains/lapack/getri_batch.rst
deleted file mode 100644
index 9112100a7..000000000
--- a/docs/domains/lapack/getri_batch.rst
+++ /dev/null
@@ -1,229 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getri_batch:
-
-getri_batch
-===========
-
-Computes the inverses of a batch of LU-factored matrices determined by :ref:`onemkl_lapack_getrf_batch`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``getri_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_lapack_getri_batch_buffer:
-
-getri_batch (Buffer Version)
-----------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The buffer version of ``getri_batch`` supports only the strided API. 
-
-**Strided API**
-
-The routine computes the inverses :math:`A_i^{-1}` of general matrices :math:`A_i`. Before calling this routine, call the Strided API of the :ref:`onemkl_lapack_getrf_batch_buffer` function to factorize :math:`A_i`.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-n
-  Order of the matrices :math:`A_i` (:math:`0 \le n`).
-
-a
-  Result of the Strided API of the :ref:`onemkl_lapack_getrf_batch_buffer` function.
-
-lda
-  Leading dimension of :math:`A_i` (:math:`n\le \text{lda}`).
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-ipiv
-  Arrays returned by the Strided API of the :ref:`onemkl_lapack_getrf_batch_buffer` function.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_getri_batch_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-  Inverse :math:`n \times n` matrices :math:`A_i^{-1}`.
-
-getri_batch (USM Version)
--------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The USM version of ``getri_batch`` supports the group API and strided API. 
-
-**Group API**
-
-The routine computes the inverses :math:`A_i^{-1}` of general matrices :math:`A_i`, :math:`i \in \{1...batch\_size\}`. Before calling this routine, call the Group API of the :ref:`onemkl_lapack_getrf_batch_usm` function to factorize :math:`A_i`.
-Total number of problems to solve, ``batch_size``, is a sum of sizes of all of the groups of parameters as provided by ``group_sizes`` array.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, T **a, std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-n
-  Array of ``group_count`` :math:`n_g` parameters specifying the order of the matrices :math:`A_i` (:math:`0 \le n_g`) belonging to group :math:`g`.
-
-a
-  Result of the Group API of the :ref:`onemkl_lapack_getrf_batch_usm` function.
-
-lda
-  Array of ``group_count`` :math:`\text{lda}_g` parameters specifying the leading dimensions of the matrices :math:`A_i` (:math:`n_g \le \text{lda}_g`) belonging to group :math:`g`.
-
-ipiv
-  Arrays returned by the Group API of the :ref:`onemkl_lapack_getrf_batch_usm` function.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of  type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_getri_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-  Inverse :math:`n_g \times n_g` matrices :math:`A_i^{-1}`.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
-The routine computes the inverses :math:`A_i^{-1}` of general matrices :math:`A_i`. Before calling this routine, call the Strided API of the :ref:`onemkl_lapack_getrf_batch_usm` function to factorize :math:`A_i`.
-
-.. container:: section
-   
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getri_batch(sycl::queue &queue, std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-n
-  Order of the matrices :math:`A_i` (:math:`0 \le n`).
-
-a
-  Result of the Strided API of the :ref:`onemkl_lapack_getrf_batch_usm` function.
-
-lda
-  Leading dimension of :math:`A_i` (:math:`n \le \text{lda}`).
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-ipiv
-  Arrays returned by the Strided API of the :ref:`onemkl_lapack_getrf_batch_usm` function.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size 
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_getri_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-  Inverse :math:`n \times n` matrices :math:`A_i^{-1}`.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/getri_batch_scratchpad_size.rst b/docs/domains/lapack/getri_batch_scratchpad_size.rst
deleted file mode 100644
index 3e30e4400..000000000
--- a/docs/domains/lapack/getri_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,111 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getri_batch_scratchpad_size:
-
-getri_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_getri_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``getri_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_getri_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-	Device queue where calculations will be performed.
-
-n
-  Array of ``group_count`` :math:`n_g` parameters specifying the order of the matrices belonging to group :math:`g`.
-
-lda
-	Array of ``group_count`` :math:`\text{lda}_g` parameters specifying the leading dimensions of the matrices belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-	Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_getri_batch` function.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_getri_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size)
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-	Device queue where calculations will be performed.
-
-n
-  The order of the matrices :math:`A_i` (:math:`0 \le n`).
-
-lda
-	Leading dimension of :math:`A_i` (:math:`n \le \text{lda}`).
-
-stride_a
-	Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_ipiv
-	Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-batch_size
-	Specifies the number of problems in a batch.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_getri_batch` function.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/getri_scratchpad_size.rst b/docs/domains/lapack/getri_scratchpad_size.rst
deleted file mode 100644
index 734f51a69..000000000
--- a/docs/domains/lapack/getri_scratchpad_size.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getri_scratchpad_size:
-
-getri_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_getri` function.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``getri_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_getri` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-getri_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-      
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_getri` function will be performed.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-.. container:: section
-
-  .. rubric:: Return Value
-     
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_getri` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines` 
-
-
diff --git a/docs/domains/lapack/getrs.rst b/docs/domains/lapack/getrs.rst
deleted file mode 100644
index f96ca356a..000000000
--- a/docs/domains/lapack/getrs.rst
+++ /dev/null
@@ -1,200 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrs:
-
-getrs
-=====
-
-Solves a system of linear equations with an LU-factored square
-coefficient matrix, with multiple right-hand sides.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``getrs`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-  
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-The routine solves for :math:`X` the following systems of linear
-equations:
-
-    .. list-table:: 
-       :header-rows: 1
-    
-       * -     \ :math:`AX = B`\     
-         -     if ``trans``\ =\ ``oneapi::mkl::transpose::nontrans``\     
-       * -     \ :math:`A^TX = B`\     
-         -     if ``trans``\ =\ ``oneapi::mkl::transpose::trans``\     
-       * -     \ :math:`A^HX = B`\     
-         -     if ``trans``\ =\ ``oneapi::mkl::transpose::conjtrans``\     
-
-Before calling this routine, you must call
-:ref:`onemkl_lapack_getrf`
-to compute the LU factorization of :math:`A`.
-
-getrs (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-      
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<std::int64_t,1> &ipiv, sycl::buffer<T,1> &b, std::int64_t ldb, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-trans
-   Indicates the form of the equations:
-
-   If ``trans=oneapi::mkl::transpose::nontrans``, then :math:`AX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::trans``, then :math:`A^TX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::conjtrans``, then :math:`A^HX = B` is
-   solved for :math:`X`.
-
-n
-   The order of the matrix :math:`A` and the number of rows in matrix
-   :math:`B` (:math:`0 \le n`).
-
-nrhs
-   The number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-a
-   Buffer containing the factorization of the matrix :math:`A`, as
-   returned by :ref:`onemkl_lapack_getrf`. The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-ipiv
-   Array, size at least :math:`\max(1, n)`. The ``ipiv`` array, as returned by
-   :ref:`onemkl_lapack_getrf`.
-
-b
-   The array ``b`` contains the matrix :math:`B` whose columns are the
-   right-hand sides for the systems of equations. The second
-   dimension of ``b`` must be at least :math:`\max(1,\text{nrhs})`.
-
-ldb
-   The leading dimension of ``b``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_getrs_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-b
-   The buffer ``b`` is overwritten by the solution matrix :math:`X`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-getrs (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T *a, std::int64_t lda, std::int64_t *ipiv, T *b, std::int64_t ldb, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-trans
-   Indicates the form of the equations:
-
-   If ``trans=oneapi::mkl::transpose::nontrans``, then :math:`AX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::trans``, then :math:`A^TX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::conjtrans``, then :math:`A^HX = B` is
-   solved for :math:`X`.
-
-n
-   The order of the matrix :math:`A` and the number of rows in matrix
-   :math:`B` (:math:`0 \le n`).
-
-nrhs
-   The number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-a
-   Pointer to array containing the factorization of the matrix :math:`A`, as
-   returned by :ref:`onemkl_lapack_getrf`. The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-ipiv
-   Array, size at least :math:`\max(1, n)`. The ``ipiv`` array, as returned by
-   :ref:`onemkl_lapack_getrf`.
-
-b
-   The array ``b`` contains the matrix :math:`B` whose columns are the
-   right-hand sides for the systems of equations. The second
-   dimension of ``b`` must be at least :math:`\max(1,\text{nrhs})`.
-
-ldb
-   The leading dimension of ``b``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_getrs_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-b
-   The array ``b`` is overwritten by the solution matrix :math:`X`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-     
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
diff --git a/docs/domains/lapack/getrs_batch.rst b/docs/domains/lapack/getrs_batch.rst
deleted file mode 100644
index 4c23fb04d..000000000
--- a/docs/domains/lapack/getrs_batch.rst
+++ /dev/null
@@ -1,286 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrs_batch:
-
-getrs_batch
-===========
-
-Solves a system of linear equations with a batch of LU-factored square coefficient matrices, with multiple right-hand sides.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``getrs_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_lapack_getrs_batch_buffer:
-
-getrs_batch (Buffer Version)
-----------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The buffer version of ``getrs_batch`` supports only the strided API. 
-   
-**Strided API**
-
- | The routine solves for the following systems of linear equations :math:`X_i`: 
- | :math:`A_iX_i = B_i`, if ``trans=mkl::transpose::nontrans``
- | :math:`A_i^TX_i = B_i`, if ``trans=mkl::transpose::trans``
- | :math:`A_i^HX_i = B_i`, if ``trans=mkl::transpose::conjtrans``
- | Before calling this routine, the Strided API of the :ref:`onemkl_lapack_getrf_batch_buffer` function should be called to compute the LU factorizations of :math:`A_i`.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void getrs_batch(sycl::queue &queue, mkl::transpose trans, std::int64_t n, std::int64_t nrhs, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv, sycl::buffer<T> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-trans
- | Form of the equations:
- | If ``trans = mkl::transpose::nontrans``, then :math:`A_iX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::trans``, then :math:`A_i^TX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::conjtrans``, then :math:`A_i^HX_i = B_i` is solved for :math:`X_i`.
-
-n
-  Order of the matrices :math:`A_i` and the number of rows in matrices :math:`B_i` (:math:`0 \le n`).
-
-nrhs
-  Number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-a
-  Array containing the factorizations of the matrices :math:`A_i`, as returned by the Strided API of the :ref:`onemkl_lapack_getrf_batch_buffer` function.
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-ipiv
-  ``ipiv`` array, as returned by the Strided API of the :ref:`onemkl_lapack_getrf_batch_buffer` function.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-b 
-  Array containing the matrices :math:`B_i` whose columns are the right-hand sides for the systems of equations.
-
-ldb
-  Leading dimension of :math:`B_i`.
-
-batch_size
-  Specifies the number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_getrs_batch_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-b  
-  Solution matrices :math:`X_i`.
-
-.. _onemkl_lapack_getrs_batch_usm:
-
-getrs_batch (USM Version)
--------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The USM version of ``getrs_batch`` supports the group API and strided API. 
-
-**Group API**
-
- | The routine solves the following systems of linear equations for :math:`X_i` (:math:`i \in \{1...batch\_size\}`):
- | :math:`A_iX_i = B_i`, if ``trans=mkl::transpose::nontrans``
- | :math:`A_i^TX_i = B_i`, if ``trans=mkl::transpose::trans``
- | :math:`A_i^HX_i = B_i`, if ``trans=mkl::transpose::conjtrans``
- | Before calling this routine, call the Group API of the :ref:`onemkl_lapack_getrf_batch_usm` function to compute the LU factorizations of :math:`A_i`.
- | Total number of problems to solve, ``batch_size``, is a sum of sizes of all of the groups of parameters as provided by ``group_sizes`` array.
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getrs_batch(sycl::queue &queue, mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, T **a, std::int64_t *lda, std::int64_t **ipiv, T **b, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-trans
- | Array of ``group_count`` parameters :math:`trans_g` indicating the form of the equations for the group :math:`g`:
- | If ``trans = mkl::transpose::nontrans``, then :math:`A_iX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::trans``, then :math:`A_i^TX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::conjtrans``, then :math:`A_i^HX_i = B_i` is solved for :math:`X_i`.
-
-n
-  Array of ``group_count`` parameters :math:`n_g` specifying the order of the matrices :math:`A_i` and the number of rows in matrices :math:`B_i` (:math:`0 \le n_g`) belonging to group :math:`g`.
-
-nrhs
-  Array of ``group_count`` parameters :math:`\text{nrhs}_g` specifying the number of right-hand sides (:math:`0 \le \text{nrhs}_g`) for group :math:`g`.
-
-a
-  Array of ``batch_size`` pointers to factorizations of the matrices :math:`A_i`, as returned by the Group API of the :ref:`onemkl_lapack_getrf_batch_usm` function.
-
-lda
-  Array of ``group_count`` parameters :math:`\text{lda}_g` specifying the leading dimensions of :math:`A_i` from group :math:`g`.
-
-ipiv
-  ``ipiv`` array, as returned by the Group API of the :ref:`onemkl_lapack_getrf_batch_usm` function.
-
-b 
-  The array containing ``batch_size`` pointers to the matrices :math:`B_i` whose columns are the right-hand sides for the systems of equations.
-
-ldb
-  Array of ``group_count`` parameters :math:`\text{ldb}_g` specifying the leading dimensions of :math:`B_i` in the group :math:`g`.
-
-group_count
-  Specifies the number of groups of parameters. Must be at least 0.
-    
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-    
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_getrs_batch_scratchpad_size` function.
-  
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-b  
-  Solution matrices :math:`X_i`.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
- | The routine solves the following systems of linear equations for :math:`X_i`:
- | :math:`A_iX_i = B_i`, if ``trans=mkl::transpose::nontrans``
- | :math:`A_i^TX_i = B_i`, if ``trans=mkl::transpose::trans``
- | :math:`A_i^HX_i = B_i`, if ``trans=mkl::transpose::conjtrans``
- | Before calling this routine, the Strided API of the :ref:`onemkl_lapack_getrf_batch` function should be called to compute the LU factorizations of :math:`A_i`.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event getrs_batch(sycl::queue &queue, mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, T *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-trans
- | Form of the equations:
- | If ``trans = mkl::transpose::nontrans``, then :math:`A_iX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::trans``, then :math:`A_i^TX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::conjtrans``, then :math:`A_i^HX_i = B_i` is solved for :math:`X_i`.
-
-n
-  Order of the matrices :math:`A_i` and the number of rows in matrices :math:`B_i` (:math:`0 \le n`).
-
-nrhs
-  Number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-a
-  Array containing the factorizations of the matrices :math:`A_i`, as returned by the Strided API of the :ref:`onemkl_lapack_getrf_batch_usm` function.
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a  
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-ipiv
-  ``ipiv`` array, as returned by getrf_batch (USM) function.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-b
-  Array containing the matrices :math:`B_i` whose columns are the right-hand sides for the systems of equations.
-
-ldb
-  Leading dimensions of :math:`B_i`.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-    
-scratchpad_size 
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_getrs_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-b  
-  Solution matrices :math:`X_i`.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/getrs_batch_scratchpad_size.rst b/docs/domains/lapack/getrs_batch_scratchpad_size.rst
deleted file mode 100644
index 491524482..000000000
--- a/docs/domains/lapack/getrs_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,135 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrs_batch_scratchpad_size:
-
-getrs_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_getrs_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``getrs_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_getrs_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-trans
- | Array of ``group_count`` parameters :math:`\text{trans}_g` indicating the form of the equations for the group :math:`g`:
- | If ``trans = mkl::transpose::nontrans``, then :math:`A_iX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::trans``, then :math:`A_i^TX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::conjtrans``, then :math:`A_i^HX_i = B_i` is solved for :math:`X_i`.
-
-n
-  Array of ``group_count`` parameters :math:`n_g` specifying the order of the matrices :math:`A_i` and the number of rows in matrices :math:`B_i` (:math:`0 \le n_g`) belonging to group :math:`g`.
-
-nrhs
-  Array of ``group_count`` parameters :math:`\text{nrhs}_g` specifying the number of right-hand sides (:math:`0 \le \text{nrhs}_g`) for group :math:`g`.
-
-lda
-  Array of ``group_count`` parameters :math:`\text{lda}_g` specifying the leading dimensions of :math:`A_i` from group :math:`g`.
-
-ldb
-  Array of ``group_count`` parameters :math:`\text{ldb}_g` specifying the leading dimensions of :math:`B_i` in the group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_getrs_batch` function.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_getrs_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size)
-    };
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-trans
- | Indicates the form of the equations:
- | If ``trans = mkl::transpose::nontrans``, then :math:`A_iX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::trans``, then :math:`A_i^TX_i = B_i` is solved for :math:`X_i`.
- | If ``trans = mkl::transpose::conjtrans``, then :math:`A_i^HX_i = B_i` is solved for :math:`X_i`.
-
-n
-  Order of the matrices :math:`A_i` and the number of rows in matrices :math:`B_i` (:math:`0 \le n`).
-
-nrhs
-  Number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_ipiv
-  Stride between the beginnings of arrays :math:`\text{ipiv}_i` inside the array ``ipiv``.
-
-ldb
-  Leading dimension of :math:`B_i`.
-
-batch_size
-  Number of problems in a batch.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_getrs_batch` function.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/getrs_scratchpad_size.rst b/docs/domains/lapack/getrs_scratchpad_size.rst
deleted file mode 100644
index 8a2741745..000000000
--- a/docs/domains/lapack/getrs_scratchpad_size.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_getrs_scratchpad_size:
-
-getrs_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_getrs` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``getrs_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-      
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_getrs` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-getrs_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-      
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_getrs` function will be performed.
-
-trans
-   Indicates the form of the equations:
-
-   If ``trans=oneapi::mkl::transpose::nontrans``, then :math:`AX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::trans``, then :math:`A^TX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::conjtrans``, then :math:`A^HX = B` is
-   solved for :math:`X`.
-
-n
-   The order of the matrix :math:`A` and the number of rows in matrix
-   :math:`B` (:math:`0 \le n`).
-
-nrhs
-   The number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-lda
-   The leading dimension of ``a``.
-
-ldb
-   The leading dimension of ``b``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_getrs` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines` 
-
-
diff --git a/docs/domains/lapack/heevd.rst b/docs/domains/lapack/heevd.rst
deleted file mode 100644
index 0e0247dd3..000000000
--- a/docs/domains/lapack/heevd.rst
+++ /dev/null
@@ -1,182 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_heevd:
-
-heevd
-=====
-
-Computes all eigenvalues and, optionally, all eigenvectors of a
-complex Hermitian matrix using divide and conquer algorithm.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``heevd`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine computes all the eigenvalues, and optionally all the
-eigenvectors, of a complex Hermitian matrix :math:`A`. In other words, it
-can compute the spectral factorization of :math:`A` as: :math:`A = Z\Lambda Z^H`.
-
-Here :math:`\Lambda` is a real diagonal matrix whose diagonal elements are the
-eigenvalues :math:`\lambda_i`, and :math:`Z` is the (complex) unitary matrix
-whose columns are the eigenvectors :math:`z_{i}`. Thus,
-
-:math:`Az_i = \lambda_i z_i` for :math:`i = 1, 2, ..., n`.
-
-If the eigenvectors are requested, then this routine uses a divide
-and conquer algorithm to compute eigenvalues and eigenvectors.
-However, if only eigenvalues are required, then it uses the
-Pal-Walker-Kahan variant of the QL or QR algorithm.
-
-heevd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<realT,1> &w, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   The buffer ``a``, size (``lda,*``). The buffer ``a`` contains the matrix
-   :math:`A`. The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``. Must be at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_heevd_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   If ``jobz = job::vec``, then on exit this buffer is overwritten by
-   the unitary matrix :math:`Z` which contains the eigenvectors of :math:`A`.
-
-w
-   Buffer, size at least n. Contains the eigenvalues
-   of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-heevd (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, RealT *w, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   Pointer to array containing :math:`A`, size (``lda,*``). The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``. Must be at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_heevd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   If ``jobz = job::vec``, then on exit this array is overwritten by
-   the unitary matrix :math:`Z` which contains the eigenvectors of :math:`A`.
-
-w
-   Pointer to array of size at least :math:`n`. Contains the eigenvalues
-   of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/lapack/heevd_scratchpad_size.rst b/docs/domains/lapack/heevd_scratchpad_size.rst
deleted file mode 100644
index 4825f73e7..000000000
--- a/docs/domains/lapack/heevd_scratchpad_size.rst
+++ /dev/null
@@ -1,81 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_heevd_scratchpad_size:
-
-heevd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_heevd` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``heevd_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_heevd` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-heevd_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-      
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_heevd` function will be performed.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-      
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_heevd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/hegvd.rst b/docs/domains/lapack/hegvd.rst
deleted file mode 100644
index 266b9964d..000000000
--- a/docs/domains/lapack/hegvd.rst
+++ /dev/null
@@ -1,249 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_hegvd:
-
-hegvd
-=====
-
-Computes all eigenvalues and, optionally, eigenvectors of a complex
-generalized Hermitian positive-definite eigenproblem using a divide and
-conquer method.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``hegvd`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-
-The routine computes all the eigenvalues, and optionally, the
-eigenvectors of a complex generalized Hermitian positive-definite
-eigenproblem, of the form
-
-:math:`Ax = \lambda Bx, ABx = \lambda x`, or :math:`BAx =\lambda x`.
-
-Here :math:`A` and :math:`B` are assumed to be Hermitian and :math:`B` is also
-positive definite.
-
-It uses a divide and conquer algorithm.
-
-hegvd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, sycl::buffer<realT,1> &w, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-itype
-   Must be 1 or 2 or 3. Specifies the problem type to be solved:
-
-   if :math:`\text{itype} = 1`, the problem type is :math:`Ax = \lambda Bx;`
-
-   if :math:`\text{itype} = 2`, the problem type is :math:`ABx = \lambda x;`
-
-   if :math:`\text{itype} = 3`, the problem type is :math:`BAx = \lambda x`.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` and ``b`` store the upper
-   triangular part of :math:`A` and :math:`B`.
-
-   If ``upper_lower = uplo::lower``, ``a`` and ``b`` store the lower
-   triangular part of :math:`A` and :math:`B`.
-
-n
-   The order of the matrices :math:`A` and :math:`B` (:math:`0 \le n`).
-
-a
-   Buffer, size ``a(lda,*)`` contains the upper or lower triangle of
-   the Hermitian matrix :math:`A`, as specified by upper_lower.
-
-   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1,n)`.
-
-b
-   Buffer, size ``b(ldb,*)`` contains the upper or lower triangle of
-   the Hermitian matrix :math:`B`, as specified by upper_lower.
-
-   The second dimension of ``b`` must be at least :math:`\max(1, n)`.
-
-ldb
-   The leading dimension of ``b``; at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_hegvd_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   On exit, if ``jobz = job::vec``, then if :math:`\text{info} = 0`, ``a`` contains
-   the matrix :math:`Z` of eigenvectors. The eigenvectors are normalized
-   as follows:
-
-   if :math:`\text{itype} = 1` or :math:`\text{itype} = 2`, :math:`Z^{H}BZ = I`;
-
-   if :math:`\text{itype} = 3`, :math:`Z^{H}B^{-1}Z = I`;
-
-   If ``jobz = job::novec``, then on exit the upper triangle (if
-   ``upper_lower = uplo::upper``) or the lower triangle (if
-   ``upper_lower = uplo::lower``) of :math:`A`, including the diagonal,
-   is destroyed.
-
-b
-   On exit, if :math:`\text{info} \le n`, the part of ``b`` containing the matrix is
-   overwritten by the triangular factor :math:`U` or :math:`L` from the
-   Cholesky factorization :math:`B = U^{H}U` or :math:`B = LL^{H}`.
-
-w
-   Buffer, size at least :math:`n`. If :math:`\text{info} = 0`, contains the eigenvalues
-   of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-hegvd (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-      
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *b, std::int64_t ldb, RealT *w, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-itype
-   Must be 1 or 2 or 3. Specifies the problem type to be solved:
-
-   if :math:`\text{itype} = 1`, the problem type is :math:`Ax = \lambda Bx;`
-
-   if :math:`\text{itype} = 2`, the problem type is :math:`ABx = \lambda x;`
-
-   if :math:`\text{itype} = 3`, the problem type is :math:`BAx = \lambda x`.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` and ``b`` store the upper
-   triangular part of :math:`A` and :math:`B`.
-
-   If ``upper_lower = uplo::lower``, ``a`` and ``b`` store the lower
-   triangular part of :math:`A` and :math:`B`.
-
-n
-   The order of the matrices :math:`A` and :math:`B` (:math:`0 \le n`).
-
-a
-   Pointer to array of size ``a(lda,*)`` containing the upper or lower triangle of
-   the Hermitian matrix :math:`A`, as specified by upper_lower.
-   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1,n)`.
-
-b
-   Pointer to array of size ``b(ldb,*)`` containing the upper or lower triangle of
-   the Hermitian matrix :math:`B`, as specified by upper_lower.
-   The second dimension of ``b`` must be at least :math:`\max(1, n)`.
-
-ldb
-   The leading dimension of ``b``; at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_hegvd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   On exit, if ``jobz = job::vec``, then if :math:`\text{info} = 0`, ``a`` contains
-   the matrix :math:`Z` of eigenvectors. The eigenvectors are normalized
-   as follows:
-
-   if :math:`\text{itype} = 1` or :math:`\text{itype} = 2`, :math:`Z^{H}BZ = I`;
-
-   if :math:`\text{itype} = 3`, :math:`Z^{H} B^{-1} Z = I`;
-
-   If ``jobz = job::novec``, then on exit the upper triangle (if
-   ``upper_lower = uplo::upper``) or the lower triangle (if
-   ``upper_lower = uplo::lower``) of :math:`A`, including the diagonal,
-   is destroyed.
-
-b
-   On exit, if :math:`\text{info} \le n`, the part of ``b`` containing the matrix is
-   overwritten by the triangular factor :math:`U` or :math:`L` from the
-   Cholesky factorization :math:`B = U^{H}U` or
-   :math:`B = LL^{H}`.
-
-w
-   Pointer to array of size at least n. If :math:`\text{info} = 0`, contains the eigenvalues
-   of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/lapack/hegvd_scratchpad_size.rst b/docs/domains/lapack/hegvd_scratchpad_size.rst
deleted file mode 100644
index 8e33d8b63..000000000
--- a/docs/domains/lapack/hegvd_scratchpad_size.rst
+++ /dev/null
@@ -1,95 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_hegvd_scratchpad_size:
-
-hegvd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_hegvd` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``hegvd_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_hegvd` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-hegvd_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda, std::int64_t ldb) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_hegvd` function will be performed.
-
-itype
-   Must be 1 or 2 or 3. Specifies the problem type to be solved:
-
-   if :math:`\text{itype} = 1`, the problem type is :math:`Ax = \lambda Bx`;
-
-   if :math:`\text{itype} = 2`, the problem type is :math:`ABx = \lambda x`;
-
-   if :math:`\text{itype} = 3`, the problem type is :math:`BAx = \lambda x`.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` and ``b`` store the upper
-   triangular part of :math:`A` and :math:`B`.
-
-   If ``upper_lower = uplo::lower``, ``a`` and ``b`` store the lower
-   triangular part of :math:`A` and :math:`B`.
-
-n
-   The order of the matrices :math:`A` and :math:`B` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``. Currently ``lda`` is not referenced in
-   this function.
-
-ldb
-   The leading dimension of ``b``. Currently ``ldb`` is not referenced in
-   this function.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_hegvd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/hetrd.rst b/docs/domains/lapack/hetrd.rst
deleted file mode 100644
index b8c855d62..000000000
--- a/docs/domains/lapack/hetrd.rst
+++ /dev/null
@@ -1,206 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_hetrd:
-
-hetrd
-=====
-
-Reduces a complex Hermitian matrix to tridiagonal form.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``hetrd`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-  
-      * -  Routine name 
-        -  T 
-      * -  ``chetrd`` 
-        -  ``std::complex<float>`` 
-      * -  ``zhetrd`` 
-        -  ``std::complex<double>``
-
-The routine reduces a complex Hermitian matrix :math:`A` to symmetric
-tridiagonal form :math:`T` by a unitary similarity transformation:
-:math:`A = QTQ^H`. The unitary matrix :math:`Q` is not formed explicitly but
-is represented as a product of :math:`n-1` elementary reflectors.
-Routines are provided to work with :math:`Q` in this representation.
-
-hetrd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void hetrd(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<realT,1> &d, sycl::buffer<realT,1> &e, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-a
-   Buffer, size ``(lda,*)``. The buffer ``a`` contains either the upper
-   or lower triangle of the Hermitian matrix :math:`A`, as specified by
-   upper_lower.
-
-   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_hetrd_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   On exit,
-
-   if ``upper_lower = uplo::upper``, the diagonal and first
-   superdiagonal of :math:`A` are overwritten by the corresponding
-   elements of the tridiagonal matrix :math:`T`, and the elements above
-   the first superdiagonal, with the buffer ``tau``, represent the
-   unitary matrix :math:`Q` as a product of elementary reflectors;
-
-   if ``upper_lower = uplo::lower``, the diagonal and first
-   subdiagonal of :math:`A` are overwritten by the corresponding elements
-   of the tridiagonal matrix :math:`T`, and the elements below the first
-   subdiagonal, with the buffer ``tau``, represent the unitary matrix
-   :math:`Q` as a product of elementary reflectors.
-
-d
-   Buffer containing the diagonal elements of the matrix :math:`T`. The
-   dimension of ``d`` must be at least :math:`\max(1, n)`.
-
-e
-   Buffer containing the off diagonal elements of the matrix :math:`T`.
-   The dimension of ``e`` must be at least :math:`\max(1, n-1)`.
-
-tau
-   Buffer, size at least :math:`\max(1, n-1)`. Stores :math:`(n-1)` scalars
-   that define elementary reflectors in decomposition of the unitary
-   matrix :math:`Q` in a product of :math:`n-1` elementary reflectors.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-hetrd (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, RealT *d, RealT *e, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-a
-   The pointer to matrix :math:`A`, size ``(lda,*)``. Contains either the upper
-   or lower triangle of the Hermitian matrix :math:`A`, as specified by
-   ``upper_lower``.
-   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_hetrd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   On exit,
-
-   if ``upper_lower = uplo::upper``, the diagonal and first
-   superdiagonal of :math:`A` are overwritten by the corresponding
-   elements of the tridiagonal matrix :math:`T`, and the elements above
-   the first superdiagonal, with the array ``tau``, represent the
-   unitary matrix :math:`Q` as a product of elementary reflectors;
-
-   if ``upper_lower = uplo::lower``, the diagonal and first
-   subdiagonal of :math:`A` are overwritten by the corresponding elements
-   of the tridiagonal matrix :math:`T`, and the elements below the first
-   subdiagonal, with the array ``tau``, represent the unitary matrix
-   :math:`Q` as a product of elementary reflectors.
-
-d
-   Pointer to diagonal elements of the matrix :math:`T`. The
-   dimension of ``d`` must be at least :math:`\max(1, n)`.
-
-e
-   Pointer to off diagonal elements of the matrix :math:`T`.
-   The dimension of ``e`` must be at least :math:`\max(1, n-1)`.
-
-tau
-   Pointer to array of size at least :math:`\max(1, n-1)`. Stores :math:`(n-1)` scalars
-   that define elementary reflectors in decomposition of the unitary
-   matrix :math:`Q` in a product of :math:`n-1` elementary reflectors.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/hetrd_scratchpad_size.rst b/docs/domains/lapack/hetrd_scratchpad_size.rst
deleted file mode 100644
index 006d50a3c..000000000
--- a/docs/domains/lapack/hetrd_scratchpad_size.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_hetrd_scratchpad_size:
-
-hetrd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_hetrd` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``hetrd_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_hetrd` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-hetrd_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_hetrd` function will be performed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``. Currently, ``lda`` is not referenced in
-   this function.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_hetrd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/hetrf.rst b/docs/domains/lapack/hetrf.rst
deleted file mode 100644
index 7be2b4bbe..000000000
--- a/docs/domains/lapack/hetrf.rst
+++ /dev/null
@@ -1,164 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_hetrf:
-
-hetrf
-=====
-
-Computes the Bunch-Kaufman factorization of a complex Hermitian matrix.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``hetrf`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine computes the factorization of a complex Hermitian
-matrix :math:`A` using the Bunch-Kaufman diagonal pivoting method. The
-form of the factorization is:
-
--  if ``upper_lower=uplo::upper``, :math:`A` = :math:`UDU^{H}`
-
--  if ``upper_lower=uplo::lower``, :math:`A` = :math:`LDL^{H}`
-
-where :math:`A` is the input matrix, :math:`U` and :math:`L` are products of
-permutation and triangular matrices with unit diagonal (upper
-triangular for :math:`U` and lower triangular for :math:`L`), and :math:`D` is a
-Hermitian block-diagonal matrix with :math:`1 \times 1` and :math:`2 \times 2` diagonal
-blocks. :math:`U` and :math:`L` have :math:`2 \times 2` unit diagonal blocks
-corresponding to the :math:`2 \times 2` blocks of :math:`D`.
-
-hetrf (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void hetrf(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<std::int64_t,1> &ipiv, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of    :math:`A` is stored and how :math:`A` is factored:
-
-      If ``upper_lower=uplo::upper``, the buffer ``a`` stores the upper triangular part of the matrix :math:`A`, and :math:`A` is factored as :math:`UDU^H`.
-
-      If ``upper_lower=uplo::lower``, the buffer ``a`` stores the lower triangular part of the matrix :math:`A`, and :math:`A` is factored as :math:`LDL^H`.
-
-n
-   The order of matrix :math:`A` (:math:`0 \le n`).
-
-a
-   The buffer ``a``, size :math:`\max(1,\text{lda} \cdot n)`. The buffer ``a``    contains either the upper or the lower triangular part of the matrix   :math:`A` (see ``upper_lower``). The second dimension of ``a`` must be at   least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by the routine for storing intermediate results.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_hetrf_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   The upper or lower triangular part of ``a`` is overwritten by details of the block-diagonal matrix :math:`D` and the multipliers used to obtain the factor :math:`U` (or :math:`L`).
-
-ipiv
-   Buffer, size at least :math:`\max(1, n)`. Contains details of    the interchanges and the block structure of :math:`D`. If   :math:`\text{ipiv}(i)=k>0`, then :math:`d_{ii}` is a :math:`1 \times 1` block, and the   :math:`i`-th row and column of :math:`A` was interchanged with the :math:`k`-th   row and column.
-
-      If ``upper_lower=oneapi::mkl::uplo::upper`` and :math:`\text{ipiv}(i)=\text{ipiv}(i-1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in rows/columns :math:`i` and :math:`i-1`, and (:math:`i-1`)-th row and column of :math:`A` was interchanged with the :math:`m`-th row and column.
-
-      If ``upper_lower=oneapi::mkl::uplo::lower`` and   :math:`\text{ipiv}(i)=\text{ipiv}(i+1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in   rows/columns :math:`i` and :math:`i+1`, and (:math:`i+1`)-th row and column   of :math:`A` was interchanged with the :math:`m`-th row and column.
-
-hetrf (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of    :math:`A` is stored and how :math:`A` is factored:
-
-      If   ``upper_lower=uplo::upper``, the array ``a`` stores the upper triangular   part of the matrix :math:`A`, and :math:`A` is factored as :math:`UDU^H`.
-
-      If ``upper_lower=uplo::lower``, the array ``a`` stores   the lower triangular part of the matrix :math:`A`, and :math:`A` is factored   as :math:`LDL^H`.
-
-n
-   The order of matrix :math:`A` (:math:`0 \le n`).
-
-a
-   The pointer to :math:`A`, size :math:`\max(1,\text{lda} \cdot n)`, containing either the upper or the lower triangular part of the matrix   :math:`A` (see ``upper_lower``). The second dimension of ``a`` must be at   least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad
-   Pointer to scratchpad memory to be used by the routine for storing intermediate results.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_hetrf_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   The upper or lower triangular part of ``a`` is overwritten by details of the block-diagonal matrix :math:`D` and the multipliers used to obtain the factor :math:`U` (or :math:`L`).
-
-ipiv
-   Pointer to array of size at least :math:`\max(1, n)`. Contains details of    the interchanges and the block structure of :math:`D`. If   :math:`\text{ipiv}(i)=k>0`, then :math:`d_{ii}` is a :math:`1 \times 1` block, and the   :math:`i`-th row and column of :math:`A` was interchanged with the :math:`k`-th   row and column.
-
-      If ``upper_lower=oneapi::mkl::uplo::upper``   and :math:`\text{ipiv}(i)=\text{ipiv}(i-1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in   rows/columns :math:`i` and :math:`i-1`, and (:math:`i-1`)-th row and column of   :math:`A` was interchanged with the :math:`m`-th row and   column.
-      
-      If ``upper_lower=oneapi::mkl::uplo::lower`` and   :math:`\text{ipiv}(i)=\text{ipiv}(i+1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in   rows/columns :math:`i` and :math:`i+1`, and (:math:`i+1`)-th row and column   of :math:`A` was interchanged with the :math:`m`-th row and column.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/hetrf_scratchpad_size.rst b/docs/domains/lapack/hetrf_scratchpad_size.rst
deleted file mode 100644
index 38447dd6e..000000000
--- a/docs/domains/lapack/hetrf_scratchpad_size.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_hetrf_scratchpad_size:
-
-hetrf_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_hetrf` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``hetrf_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-  
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_hetrf` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-hetrf_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_hetrf` function will be performed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of :math:`A` is
-   stored and how :math:`A` is factored:
-
-   If ``upper_lower=uplo::upper``, the buffer ``a`` stores the
-   upper triangular part of the matrix :math:`A`, and :math:`A` is
-   factored as :math:`UDU^H`.
-
-   If ``upper_lower=uplo::lower``, the buffer ``a`` stores the
-   lower triangular part of the matrix :math:`A`, and :math:`A` is
-   factored as :math:`LDL^H`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_hetrf` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/lapack-like-extensions.inc.rst b/docs/domains/lapack/lapack-like-extensions.inc.rst
deleted file mode 100644
index b3378b25b..000000000
--- a/docs/domains/lapack/lapack-like-extensions.inc.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack-like-extensions-routines:
-
-LAPACK-like Extensions Routines
-===============================
-
-
-.. container::
-
-
-   oneAPI Math Kernel Library DPC++ provides additional routines to
-   extend the functionality of the LAPACK routines. These include routines
-   to compute many independent factorizations, linear equation solutions, and similar.
-   The following table lists the LAPACK-like Extensions routine groups.
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routines
-           -     Scratchpad Size Routines
-           -     Description     
-         * -     :ref:`onemkl_lapack_geqrf_batch`
-           -     :ref:`onemkl_lapack_geqrf_batch_scratchpad_size`
-           -     Computes the QR factorizations of a batch of general matrices.
-         * -     :ref:`onemkl_lapack_getrf_batch`
-           -     :ref:`onemkl_lapack_getrf_batch_scratchpad_size`
-           -     Computes the LU factorizations of a batch of general matrices.   
-         * -     :ref:`onemkl_lapack_getri_batch`
-           -     :ref:`onemkl_lapack_getri_batch_scratchpad_size`
-           -     Computes the inverses of a batch of LU-factored general matrices.   
-         * -     :ref:`onemkl_lapack_getrs_batch`
-           -     :ref:`onemkl_lapack_getrs_batch_scratchpad_size`
-           -     Solves systems of linear equations with a batch of LU-factored square coefficient matrices, with multiple right-hand sides.    
-         * -     :ref:`onemkl_lapack_orgqr_batch`
-           -     :ref:`onemkl_lapack_orgqr_batch_scratchpad_size`
-           -     Generates the real orthogonal/complex unitary matrix :math:`Q_i` of the QR factorization formed by geqrf_batch.
-         * -     :ref:`onemkl_lapack_potrf_batch`
-           -     :ref:`onemkl_lapack_potrf_batch_scratchpad_size`
-           -     Computes the Cholesky factorization of a batch of symmetric (Hermitian) positive-definite matrices.   
-         * -     :ref:`onemkl_lapack_potrs_batch`
-           -     :ref:`onemkl_lapack_potrs_batch_scratchpad_size`
-           -     Solves systems of linear equations with a batch of Cholesky-factored symmetric (Hermitian) positive-definite coefficient matrices, with multiple right-hand sides.    
-         * -     :ref:`onemkl_lapack_ungqr_batch`
-           -     :ref:`onemkl_lapack_ungqr_batch_scratchpad_size`
-           -     Generates the complex unitary matrix :math:`Q_i` of the QR factorization formed by geqrf_batch.
-
-
-
-.. toctree::
-    :hidden:
-
-    geqrf_batch
-    geqrf_batch_scratchpad_size
-    getrf_batch
-    getrf_batch_scratchpad_size
-    getri_batch
-    getri_batch_scratchpad_size
-    getrs_batch
-    getrs_batch_scratchpad_size
-    orgqr_batch
-    orgqr_batch_scratchpad_size
-    potrf_batch
-    potrf_batch_scratchpad_size
-    potrs_batch
-    potrs_batch_scratchpad_size
-    ungqr_batch
-    ungqr_batch_scratchpad_size
diff --git a/docs/domains/lapack/lapack-linear-equation-routines.inc.rst b/docs/domains/lapack/lapack-linear-equation-routines.inc.rst
deleted file mode 100644
index 6e6c25574..000000000
--- a/docs/domains/lapack/lapack-linear-equation-routines.inc.rst
+++ /dev/null
@@ -1,121 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack-linear-equation-routines:
-
-LAPACK Linear Equation Routines
-===============================
-
-
-.. container::
-
-
-   LAPACK Linear Equation routines are used for factoring a matrix,
-   solving a system of linear equations, solving linear least squares problems,
-   and inverting a matrix. The following table lists the LAPACK Linear Equation
-   routine groups.
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routines
-           -     Scratchpad Size Routines
-           -     Description     
-         * -     :ref:`onemkl_lapack_geqrf`
-           -     :ref:`onemkl_lapack_geqrf_scratchpad_size`
-           -     Computes the QR factorization of a general m-by-n matrix.
-         * -     :ref:`onemkl_lapack_gerqf`
-           -     :ref:`onemkl_lapack_gerqf_scratchpad_size`
-           -     Computes the RQ factorization of a general m-by-n matrix.
-         * -     :ref:`onemkl_lapack_getrf`
-           -     :ref:`onemkl_lapack_getrf_scratchpad_size`
-           -     Computes the LU factorization of a general m-by-n matrix.   
-         * -     :ref:`onemkl_lapack_getri`
-           -     :ref:`onemkl_lapack_getri_scratchpad_size`
-           -     Computes the inverse of an LU-factored general matrix.   
-         * -     :ref:`onemkl_lapack_getrs`
-           -     :ref:`onemkl_lapack_getrs_scratchpad_size`
-           -     Solves a system of linear equations with an LU-factored square coefficient matrix, with multiple right-hand sides.    
-         * -     :ref:`onemkl_lapack_hetrf`
-           -     :ref:`onemkl_lapack_hetrf_scratchpad_size`
-           -     Computes the Bunch-Kaufman factorization of a complex Hermitian matrix.
-         * -     :ref:`onemkl_lapack_orgqr`
-           -     :ref:`onemkl_lapack_orgqr_scratchpad_size`
-           -     Generates the real orthogonal matrix :math:`Q` of the QR factorization formed by geqrf.
-         * -     :ref:`onemkl_lapack_ormqr`
-           -     :ref:`onemkl_lapack_ormqr_scratchpad_size`
-           -     Multiplies a real matrix by the orthogonal matrix :math:`Q` of the QR factorization formed by geqrf.
-         * -     :ref:`onemkl_lapack_ormrq`
-           -     :ref:`onemkl_lapack_ormrq_scratchpad_size`
-           -     Multiplies a real matrix by the orthogonal matrix :math:`Q` of the RQ factorization formed by gerqf.
-         * -     :ref:`onemkl_lapack_potrf`
-           -     :ref:`onemkl_lapack_potrf_scratchpad_size`
-           -     Computes the Cholesky factorization of a symmetric (Hermitian) positive-definite matrix.   
-         * -     :ref:`onemkl_lapack_potri`
-           -     :ref:`onemkl_lapack_potri_scratchpad_size`
-           -     Computes the inverse of a Cholesky-factored symmetric (Hermitian) positive-definite matrix.   
-         * -     :ref:`onemkl_lapack_potrs`
-           -     :ref:`onemkl_lapack_potrs_scratchpad_size`
-           -     Solves a system of linear equations with a Cholesky-factored symmetric (Hermitian) positive-definite coefficient matrix, with multiple right-hand sides.    
-         * -     :ref:`onemkl_lapack_sytrf`
-           -     :ref:`onemkl_lapack_sytrf_scratchpad_size`
-           -     Computes the Bunch-Kaufman factorization of a symmetric matrix.   
-         * -     :ref:`onemkl_lapack_trtrs`
-           -     :ref:`onemkl_lapack_trtrs_scratchpad_size`
-           -     Solves a system of linear equations with a triangular coefficient matrix, with multiple right-hand sides.    
-         * -     :ref:`onemkl_lapack_ungqr`
-           -     :ref:`onemkl_lapack_ungqr_scratchpad_size`
-           -     Generates the complex unitary matrix :math:`Q` of the QR factorization formed by geqrf.
-         * -     :ref:`onemkl_lapack_unmqr`
-           -     :ref:`onemkl_lapack_unmqr_scratchpad_size`
-           -     Multiplies a complex matrix by the unitary matrix :math:`Q` of the QR factorization formed by geqrf.
-         * -     :ref:`onemkl_lapack_unmrq`
-           -     :ref:`onemkl_lapack_unmrq_scratchpad_size`
-           -     Multiplies a complex matrix by the unitary matrix :math:`Q` of the RQ factorization formed by gerqf.
-
-
-
-
-
-.. toctree::
-    :hidden:
-
-    geqrf
-    geqrf_scratchpad_size
-    gerqf
-    gerqf_scratchpad_size
-    getrf
-    getrf_scratchpad_size
-    getri
-    getri_scratchpad_size
-    getrs
-    getrs_scratchpad_size
-    hetrf
-    hetrf_scratchpad_size
-    orgqr
-    orgqr_scratchpad_size
-    ormqr
-    ormqr_scratchpad_size
-    ormrq
-    ormrq_scratchpad_size
-    potrf
-    potrf_scratchpad_size
-    potri
-    potri_scratchpad_size
-    potrs
-    potrs_scratchpad_size
-    sytrf
-    sytrf_scratchpad_size
-    trtrs
-    trtrs_scratchpad_size
-    ungqr
-    ungqr_scratchpad_size
-    unmqr
-    unmqr_scratchpad_size
-    unmrq
-    unmrq_scratchpad_size
diff --git a/docs/domains/lapack/lapack-singular-value-eigenvalue-routines.inc.rst b/docs/domains/lapack/lapack-singular-value-eigenvalue-routines.inc.rst
deleted file mode 100644
index bcf8c1af3..000000000
--- a/docs/domains/lapack/lapack-singular-value-eigenvalue-routines.inc.rst
+++ /dev/null
@@ -1,105 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack-singular-value-eigenvalue-routines:
-
-LAPACK Singular Value and Eigenvalue Problem Routines
-=====================================================
-
-
-.. container::
-
-
-   LAPACK Singular Value and Eigenvalue Problem routines are used for
-   singular value and eigenvalue problems, and for performing a number of related
-   computational tasks. The following table lists the LAPACK Singular Value and 
-   Eigenvalue Problem routine groups.
-
-
-   .. container:: tablenoborder
-
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -     Routines
-           -     Scratchpad Size Routines
-           -     Description     
-         * -     :ref:`onemkl_lapack_gebrd`
-           -     :ref:`onemkl_lapack_gebrd_scratchpad_size`
-           -     Reduces a general matrix to bidiagonal form.   
-         * -     :ref:`onemkl_lapack_gesvd`
-           -     :ref:`onemkl_lapack_gesvd_scratchpad_size`
-           -     Computes the singular value decomposition of a general rectangular matrix.
-         * -     :ref:`onemkl_lapack_heevd`
-           -     :ref:`onemkl_lapack_heevd_scratchpad_size`
-           -     Computes all eigenvalues and, optionally, all eigenvectors of a complex Hermitian matrix using divide and conquer algorithm.
-         * -     :ref:`onemkl_lapack_hegvd`
-           -     :ref:`onemkl_lapack_hegvd_scratchpad_size`
-           -     Computes all eigenvalues and, optionally, all eigenvectors of a complex generalized Hermitian definite eigenproblem using divide and conquer algorithm.
-         * -     :ref:`onemkl_lapack_hetrd`
-           -     :ref:`onemkl_lapack_hetrd_scratchpad_size`
-           -     Reduces a complex Hermitian matrix to tridiagonal form.
-         * -     :ref:`onemkl_lapack_orgbr`
-           -     :ref:`onemkl_lapack_orgbr_scratchpad_size`
-           -     Generates the real orthogonal matrix :math:`Q` or :math:`P^T` determined by gebrd.
-         * -     :ref:`onemkl_lapack_orgtr`
-           -     :ref:`onemkl_lapack_orgtr_scratchpad_size`
-           -     Generates the real orthogonal matrix :math:`Q` determined by sytrd.
-         * -     :ref:`onemkl_lapack_ormtr`
-           -     :ref:`onemkl_lapack_ormtr_scratchpad_size`
-           -     Multiplies a real matrix by the orthogonal matrix :math:`Q` determined by sytrd.
-         * -     :ref:`onemkl_lapack_syevd`
-           -     :ref:`onemkl_lapack_syevd_scratchpad_size`
-           -     Computes all eigenvalues and, optionally, all eigenvectors of a real symmetric matrix using divide and conquer algorithm.
-         * -     :ref:`onemkl_lapack_sygvd`
-           -     :ref:`onemkl_lapack_sygvd_scratchpad_size`
-           -     Computes all eigenvalues and, optionally, all eigenvectors of a real generalized symmetric definite eigenproblem using divide and conquer algorithm.
-         * -     :ref:`onemkl_lapack_sytrd`
-           -     :ref:`onemkl_lapack_sytrd_scratchpad_size`
-           -     Reduces a real symmetric matrix to tridiagonal form.
-         * -     :ref:`onemkl_lapack_ungbr`
-           -     :ref:`onemkl_lapack_ungbr_scratchpad_size`
-           -     Generates the complex unitary matrix :math:`Q` or :math:`P^T` determined by gebrd.
-         * -     :ref:`onemkl_lapack_ungtr`
-           -     :ref:`onemkl_lapack_ungtr_scratchpad_size`
-           -     Generates the complex unitary matrix :math:`Q` determined by hetrd.
-         * -     :ref:`onemkl_lapack_unmtr`
-           -     :ref:`onemkl_lapack_unmtr_scratchpad_size`
-           -     Multiplies a complex matrix by the unitary matrix :math:`Q` determined by hetrd.
-
-
-
-
-.. toctree::
-    :hidden:
-
-    gebrd
-    gebrd_scratchpad_size
-    gesvd
-    gesvd_scratchpad_size
-    heevd
-    heevd_scratchpad_size
-    hegvd
-    hegvd_scratchpad_size
-    hetrd
-    hetrd_scratchpad_size
-    orgbr
-    orgbr_scratchpad_size
-    orgtr
-    orgtr_scratchpad_size
-    ormtr
-    ormtr_scratchpad_size
-    syevd
-    syevd_scratchpad_size
-    sygvd
-    sygvd_scratchpad_size
-    sytrd
-    sytrd_scratchpad_size
-    ungbr
-    ungbr_scratchpad_size
-    ungtr
-    ungtr_scratchpad_size
-    unmtr
-    unmtr_scratchpad_size
diff --git a/docs/domains/lapack/lapack.rst b/docs/domains/lapack/lapack.rst
deleted file mode 100644
index bb11e72ed..000000000
--- a/docs/domains/lapack/lapack.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack:
-
-LAPACK Routines
-+++++++++++++++
-
-oneMKL provides a DPC++ interface to select routines from the Linear Algebra PACKage (LAPACK), as well as several LAPACK-like extension routines.
-
-.. include:: lapack-linear-equation-routines.inc.rst
-.. include:: lapack-singular-value-eigenvalue-routines.inc.rst
-.. include:: lapack-like-extensions.inc.rst
-
-
-.. container::
-
-   .. container:: Note
-
-
-      .. rubric:: Note
-         :class: NoteTipHead
-
-
-      Different arrays used as parameters to oneMKL LAPACK routines must
-      not overlap.
-
-
-   .. container:: Note
-
-
-      .. rubric:: Warning
-         :name: warning
-         :class: NoteTipHead
-
-
-      LAPACK routines assume that input matrices do not contain IEEE 754
-      special values such as INF or NaN values. Using these special
-      values may cause LAPACK to return unexpected results or become
-      unstable.
-
-**Parent topic:** :ref:`onemkl_dense_linear_algebra`
diff --git a/docs/domains/lapack/orgbr.rst b/docs/domains/lapack/orgbr.rst
deleted file mode 100644
index 6ff70338f..000000000
--- a/docs/domains/lapack/orgbr.rst
+++ /dev/null
@@ -1,226 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgbr:
-
-orgbr
-=====
-
-Generates the real orthogonal matrix :math:`Q` or :math:`P^{T}`
-determined by
-:ref:`onemkl_lapack_gebrd`.
-
-``orgbr`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-
-.. container:: section
-
-  .. rubric:: Description
-      
-The routine generates the whole or part of the orthogonal matrices
-:math:`Q` and :math:`P^{T}` formed by the routine :ref:`onemkl_lapack_gebrd`.
-All valid combinations of arguments are described in *Input parameters*. In
-most cases you need the following:
-
-To compute the whole :math:`m \times m` matrix :math:`Q`:
-
-::
-
-   orgbr(queue, generate::q, m, m, n, a, ...)
-
-(note that the array ``a`` must have at least :math:`m` columns).
-
-To form the :math:`n` leading columns of :math:`Q` if :math:`m > n`:
-
-::
-
-   orgbr(queue, generate::q, m, n, n, a, ...)
-
-To compute the whole :math:`n \times n` matrix :math:`P^{T}`:
-
-::
-
-   orgbr(queue, generate::p, n, n, m, a, ...)
-
-(note that the array ``a`` must have at least :math:`n` rows).
-
-To form the :math:`m` leading rows of :math:`P^{T}` if :math:`m < n`:
-
-::
-
-   orgbr(queue, generate::p, m, n, m, a, ...)
-
-orgbr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void orgbr(sycl::queue &queue, oneapi::mkl::generate gen, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-gen
-   Must be ``generate::q`` or ``generate::p``.
-
-   If ``gen = generate::q``, the routine generates the matrix :math:`Q`.
-
-   If ``gen = generate::p``, the routine generates the matrix
-   :math:`P^{T}`.
-
-m
-   The number of rows in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le m)`.
-
-   If ``gen = generate::q``, :math:`m \ge n \ge \min(m, k)`.
-
-   If ``gen = generate::p``, :math:`n \ge m \ge \min(n, k)`.
-
-n
-   The number of columns in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le n)`. See ``m`` for constraints.
-
-k
-   If ``gen = generate::q``, the number of columns in the original
-   :math:`m \times k` matrix reduced by
-   :ref:`onemkl_lapack_gebrd`.
-
-   If ``gen = generate::p``, the number of rows in the original
-   :math:`k \times n` matrix reduced by
-   :ref:`onemkl_lapack_gebrd`.
-
-a
-   The buffer ``a`` as returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-lda
-   The leading dimension of ``a``.
-
-tau
-   Buffer, size :math:`\min(m,k)` if ``gen = generate::q``, size
-   :math:`\min(n,k)` if ``gen = generate::p``. Scalar factor of the
-   elementary reflectors, as returned by :ref:`onemkl_lapack_gebrd` in the array tauq
-   or taup.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_orgbr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by the orthogonal matrix
-   :math:`Q` or :math:`P^{T}` (or the leading rows or columns thereof)
-   as specified by ``gen``, ``m``, and ``n``.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-orgbr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate gen, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-gen
-   Must be ``generate::q`` or ``generate::p``.
-
-   If ``gen = generate::q``, the routine generates the matrix :math:`Q`.
-
-   If ``gen = generate::p``, the routine generates the matrix
-   :math:`P^{T}`.
-
-m
-   The number of rows in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le m)`.
-
-   If ``gen = generate::q``, :math:`m \ge n \ge \min(m, k)`.
-
-   If ``gen = generate::p``, :math:`n \ge m \ge \min(n, k)`.
-
-n
-   The number of columns in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le n)`. See ``m`` for constraints.
-
-k
-   If ``gen = generate::q``, the number of columns in the original
-   :math:`m \times k` matrix reduced by
-   :ref:`onemkl_lapack_gebrd`.
-
-   If ``gen = generate::p``, the number of rows in the original
-   :math:`k \times n` matrix reduced by
-   :ref:`onemkl_lapack_gebrd`.
-
-a
-   Pointer to array ``a`` as returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-lda
-   The leading dimension of ``a``.
-
-tau
-   Pointer to array of size :math:`\min(m,k)` if ``gen = generate::q``, size
-   :math:`\min(n,k)` if ``gen = generate::p``. Scalar factor of the
-   elementary reflectors, as returned by :ref:`onemkl_lapack_gebrd` in the array tauq
-   or taup.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_orgbr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by the orthogonal matrix
-   :math:`Q` or :math:`P^{T}` (or the leading rows or columns thereof)
-   as specified by ``gen``, ``m``, and ``n``.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/lapack/orgbr_scratchpad_size.rst b/docs/domains/lapack/orgbr_scratchpad_size.rst
deleted file mode 100644
index 7e7804158..000000000
--- a/docs/domains/lapack/orgbr_scratchpad_size.rst
+++ /dev/null
@@ -1,90 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgbr_scratchpad_size:
-
-orgbr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_orgbr` function.
-
-``orgbr_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-
-.. container:: section
-
-  .. rubric:: Description
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_orgbr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-orgbr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate gen, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_orgbr` function will be performed.
-
-gen
-   Must be ``generate::q`` or ``generate::p``.
-
-   If ``gen = generate::q``, the routine generates the matrix
-   :math:`Q`.
-
-   If ``gen = generate::p``, the routine generates the matrix
-   :math:`P^{T}`.
-
-m
-   The number of rows in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le m)`.
-
-   If ``gen = generate::q``, :math:`m \ge n \ge \min(m, k)`.
-
-   If ``gen = generate::p``, :math:`n \ge m \ge \min(n, k)`.
-
-n
-   The number of columns in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le n)`. See ``m`` for constraints.
-
-k
-   If ``gen = generate::q``, the number of columns in the original
-   :math:`m \times k` matrix returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-   If ``gen = generate::p``, the number of rows in the original
-   :math:`k \times n` matrix returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_orgbr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines` 
-
-
diff --git a/docs/domains/lapack/orgqr.rst b/docs/domains/lapack/orgqr.rst
deleted file mode 100644
index 532e7fd9c..000000000
--- a/docs/domains/lapack/orgqr.rst
+++ /dev/null
@@ -1,183 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgqr:
-
-orgqr
-=====
-
-Generates the real orthogonal matrix :math:`Q` of the QR factorization formed
-by :ref:`onemkl_lapack_geqrf`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``orgqr`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-
-The routine generates the whole or part of :math:`m \times m` orthogonal
-matrix :math:`Q` of the QR factorization formed by the routine
-:ref:`onemkl_lapack_geqrf`.
-
-Usually :math:`Q` is determined from the QR factorization of an ``m``
-by ``p`` matrix :math:`A` with :math:`m \ge p`. To compute the whole matrix
-:math:`Q`, use:
-
-::
-
-   oneapi::mkl::lapack::orgqr(queue, m, m, p, a, lda, tau, scratchpad, scratchpad_size)
-
-To compute the leading :math:`p` columns of :math:`Q` (which form an
-orthonormal basis in the space spanned by the columns of :math:`A`):
-
-::
-
-   oneapi::mkl::lapack::orgqr(queue, m, p, p, a, lda, tau, scratchpad, scratchpad_size)
-
-To compute the matrix :math:`Q^{k}` of the QR factorization of
-leading :math:`k` columns of the matrix :math:`A`:
-
-::
-
-   oneapi::mkl::lapack::orgqr(queue, m, m, k, a, lda, tau, scratchpad, scratchpad_size)
-
-To compute the leading :math:`k` columns of :math:`Q^{k}` (which form
-an orthonormal basis in the space spanned by leading :math:`k` columns of
-the matrix :math:`A`):
-
-::
-
-   oneapi::mkl::lapack::orgqr(queue, m, k, k, a, lda, tau, scratchpad, scratchpad_size)
-
-orgqr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-a
-   The buffer ``a`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-lda
-   The leading dimension of ``a`` (:math:`\text{lda} \ge m`).
-
-tau
-   The buffer ``tau`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_orgqr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by :math:`n` leading columns of the :math:`m \times m` orthogonal matrix
-   :math:`Q`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-orgqr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-a
-   The pointer to ``a`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-lda
-   The leading dimension of ``a`` (:math:`\text{lda} \ge m`).
-
-tau
-   The pointer to ``tau`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_orgqr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by :math:`n` leading columns of the :math:`m \times m` orthogonal matrix
-   :math:`Q`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/orgqr_batch.rst b/docs/domains/lapack/orgqr_batch.rst
deleted file mode 100644
index 6984ebead..000000000
--- a/docs/domains/lapack/orgqr_batch.rst
+++ /dev/null
@@ -1,262 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgqr_batch:
-
-orgqr_batch
-===========
-
-Generates the orthogonal/unitary matrix :math:`Q_i` of the QR factorizations for a group of general matrices.
-
-.. rubric:: Description
-
-``orgqr_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-.. _onemkl_lapack_orgqr_batch_buffer:
-
-orgqr_batch (Buffer Version)
-----------------------------
-
-.. rubric:: Description
-
-The buffer version of ``orgqr_batch`` supports only the strided API. 
-   
-**Strided API**
-
- | The routine generates all or part of the :math:`m \times m` orthogonal matrices :math:`Q_i` of the batch of QR factorizations formed by the Strided API of the :ref:`onemkl_lapack_geqrf_batch_buffer` function.
- | Usually :math:`Q_i` is determined from the QR factorization of an :math:`m \times p` matrix :math:`A_i` with :math:`m \ge p`.
- | To compute the whole matrices :math:`Q_i`, use:
- | ``orgqr_batch(queue, m, m, p, a, ...)``
- | To compute the leading :math:`p` columns of :math:`Q_i` (which form an orthonormal basis in the space spanned by the columns of :math:`A_i`):
- | ``orgqr_batch(queue, m, p, p, a, ...)``
- | To compute the matrices :math:`Q_i^k` of the QR factorizations of leading :math:`k` columns of the matrices :math:`A_i`:
- | ``orgqr_batch(queue, m, m, k, a, ...)``
- | To compute the leading :math:`k` columns of :math:`Q_i^k` (which form an orthonormal basis in the space spanned by leading :math:`k` columns of the matrices :math:`A_i`):
- | ``orgqr_batch(queue, m, k, k, a, ...)``
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, sycl::buffer<T> &tau, std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in the matrices :math:`A_i` (:math:`0 \le n`).
-
-k
-  Number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k \le n`).
-
-a
-  Array resulting after call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_buffer` function.
-
-lda
-  Leading dimension of :math:`A_i` (:math:`\text{lda} \ge m`).
-
-stride_a
-  The stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-tau
-  Array resulting from call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_buffer` function.
-
-stride_tau
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Specifies the number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_orgqr_batch_scratchpad_size` function.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  Batch of :math:`n` leading columns of the :math:`m \times m` orthogonal matrices :math:`Q_i`.
-
-.. _onemkl_lapack_orgqr_batch_usm:
-
-orgqr_batch (USM Version)
--------------------------
-
-.. rubric:: Description
-
-The USM version of ``orgqr_batch`` supports the group API and strided API. 
-
-**Group API**
-
- | The routine generates all or part of the :math:`m \times m` orthogonal matrices :math:`Q_i` of the batch of QR factorizations formed by the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
- | Usually :math:`Q_i` is determined from the QR factorization of an :math:`m \times p` matrix :math:`A_i` with :math:`m \ge p`.
- | To compute the whole matrices :math:`Q_i`, use:
- | ``orgqr_batch(queue, m, m, p, a, ...)``
- | To compute the leading :math:`p` columns of :math:`Q_i` (which form an orthonormal basis in the space spanned by the columns of :math:`A_i`):
- | ``orgqr_batch(queue, m, p, p, a, ...)``
- | To compute the matrices :math:`Q_i^k` of the QR factorizations of leading :math:`k` columns of the matrices :math:`A_i`:
- | ``orgqr_batch(queue, m, m, k, a, ...)``
- | To compute the leading :math:`k` columns of :math:`Q_i^k` (which form an orthonormal basis in the space spanned by leading :math:`k` columns of the matrices :math:`A_i`):
- | ``orgqr_batch(queue, m, k, k, a, ...)``
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, T **a, std::int64_t *lda, T **tau, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Array of ``group_count`` :math:`m_g` parameters as previously supplied to group version of geqrf_batch function.
-
-n
-  Array of ``group_count`` :math:`n_g` parameters as previously supplied to group version of geqrf_batch function.
-
-k
-  Array of ``group_count`` :math:`k_g` parameters as previously supplied to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function. The number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k_g \le n_g`).
-
-a
-  Array resulting after call to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-lda
-  Array of leading dimensions of :math:`A_i` as previously supplied to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-tau
-  Array resulting after call to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_orgqr_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  :math:`n_g` leading columns of the :math:`m_g \times m_g` orthogonal matrices :math:`Q_i`, where :math:`g` is an index of group of parameters corresponding to :math:`Q_i`.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
- | The routine generates all or part of the :math:`m \times n` orthogonal matrices :math:`Q_i` of the batch of QR factorizations formed by the Strided API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
- | Usually :math:`Q_i` is determined from the QR factorization of an :math:`m \times p` matrix :math:`A_i` with :math:`m \ge p`.
- | To compute the whole matrices :math:`Q_i`, use:
- | ``orgqr_batch(queue, m, m, p, a, ...)``
- | To compute the leading :math:`p` columns of :math:`Q_i` (which form an orthonormal basis in the space spanned by the columns of :math:`A_i`):
- | ``orgqr_batch(queue, m, p, p, a, ...)``
- | To compute the matrices :math:`Q_i^k` of the QR factorizations of leading :math:`k` columns of the matrices :math:`A_i`:
- | ``orgqr_batch(queue, m, m, k, a, ...)``
- | To compute the leading :math:`k` columns of :math:`Q_i^k` (which form an orthonormal basis in the space spanned by leading :math:`k` columns of the matrices :math:`A_i`):
- | ``orgqr_batch(queue, m, k, k, a, ...)``
-
-.. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, std::int64_t stride_a, T *tau, std::int64_t stride_tau, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-   .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in the matrices :math:`A_i` (:math:`0 \le n`).
-
-k
-  Number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k \le n`).
-
-a
-  Array resulting after call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-lda
-  Leading dimension of :math:`A_i` (:math:`m \le \text{lda}`).
-
-stride_a
-  The stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-tau
-  Array resulting from call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-stride_tau
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Specifies the number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_orgqr_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-   .. rubric:: Output Parameters
-
-a
-  Batch of :math:`n` leading columns of the :math:`m \times m` orthogonal matrices :math:`Q_i`.
-
-.. container:: section
-   
-   .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/orgqr_batch_scratchpad_size.rst b/docs/domains/lapack/orgqr_batch_scratchpad_size.rst
deleted file mode 100644
index 444075609..000000000
--- a/docs/domains/lapack/orgqr_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,121 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgqr_batch_scratchpad_size:
-
-orgqr_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_orgqr_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``orgqr_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` that the scratchpad memory passed to the Group API of the :ref:`onemkl_lapack_orgqr_batch` function should be able to hold.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Array of ``group_count`` :math:`m_g` parameters.
-
-n
-  Array of ``group_count`` :math:`n_g` parameters.
-
-k
-  Array of ``group_count`` :math:`k_g` parameters. The number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k_g \le n_g`).
-
-lda
-  Array of leading dimensions of :math:`A_i`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` that the scratchpad memory passed to the Group API of the :ref:`onemkl_lapack_orgqr_batch` function should be able to hold.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` that the scratchpad memory passed to the Strided API of the :ref:`onemkl_lapack_orgqr_batch` function should be able to hold.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in the matrices :math:`A_i` (:math:`0 \le n`).
-
-k
-  Number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k \le n`).
-
-lda
-  Leading dimension of :math:`A_i` (:math:`m \le \text{lda}`).
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_tau  
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Number of problems in a batch.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` that the scratchpad memory passed to the Strided API of the :ref:`onemkl_lapack_orgqr_batch` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/orgqr_scratchpad_size.rst b/docs/domains/lapack/orgqr_scratchpad_size.rst
deleted file mode 100644
index 4ca2f4d6c..000000000
--- a/docs/domains/lapack/orgqr_scratchpad_size.rst
+++ /dev/null
@@ -1,70 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgqr_scratchpad_size:
-
-orgqr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_orgqr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``orgqr_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_orgqr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-orgqr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_orgqr` function will be performed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n \le m`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_orgqr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/orgtr.rst b/docs/domains/lapack/orgtr.rst
deleted file mode 100644
index 757e6b9ae..000000000
--- a/docs/domains/lapack/orgtr.rst
+++ /dev/null
@@ -1,148 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgtr:
-
-orgtr
-=====
-
-Generates the real orthogonal matrix :math:`Q` determined by
-:ref:`onemkl_lapack_sytrd`.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``orgtr`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-
-The routine explicitly generates the :math:`n \times n` orthogonal matrix
-:math:`Q` formed by :ref:`onemkl_lapack_sytrd` when
-reducing a real symmetric matrix :math:`A` to tridiagonal form:
-:math:`A = QTQ^T`. Use this routine after a call to
-:ref:`onemkl_lapack_sytrd`.
-
-orgtr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void orgtr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to :ref:`onemkl_lapack_sytrd`.
-
-n
-   The order of the matrix :math:`Q` :math:`(0 \le n)`.
-
-a
-   The buffer ``a`` as returned by :ref:`onemkl_lapack_sytrd`. The
-   second dimension of ``a`` must be at least :math:`\max(1,n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-tau
-   The buffer ``tau`` as returned by :ref:`onemkl_lapack_sytrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1, n-1)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_orgtr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by the orthogonal matrix :math:`Q`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-orgtr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied
-   to :ref:`onemkl_lapack_sytrd`.
-
-n
-   The order of the matrix :math:`Q` :math:`(0 \le n)`.
-
-a
-   The pointer to ``a`` as returned by
-   :ref:`onemkl_lapack_sytrd`. The
-   second dimension of ``a`` must be at least :math:`\max(1,n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-tau
-   The pointer to ``tau`` as returned by :ref:`onemkl_lapack_sytrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1, n-1)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_orgtr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   Overwritten by the orthogonal matrix :math:`Q`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/lapack/orgtr_scratchpad_size.rst b/docs/domains/lapack/orgtr_scratchpad_size.rst
deleted file mode 100644
index aee5516c6..000000000
--- a/docs/domains/lapack/orgtr_scratchpad_size.rst
+++ /dev/null
@@ -1,67 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_orgtr_scratchpad_size:
-
-orgtr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_orgtr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``orgtr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_orgtr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-orgtr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_orgtr` function will be performed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to :ref:`onemkl_lapack_sytrd`.
-
-n
-   The order of the matrix :math:`Q` :math:`(0 \le n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_orgtr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/ormqr.rst b/docs/domains/lapack/ormqr.rst
deleted file mode 100644
index 320bfe69d..000000000
--- a/docs/domains/lapack/ormqr.rst
+++ /dev/null
@@ -1,207 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ormqr:
-
-ormqr
-=====
-
-Multiplies a real matrix by the orthogonal matrix :math:`Q` of the QR
-factorization formed by :ref:`onemkl_lapack_geqrf`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``ormqr`` supports the following precisions.
-
-    .. list-table::
-       :header-rows: 1
-
-       * -  T
-       * -  ``float``
-       * -  ``double``
-
-The routine multiplies a rectangular real :math:`m \times n` matrix :math:`C` by
-:math:`Q` or :math:`Q^T`, where :math:`Q` is the real orthogonal matrix defined
-as a product of :math:`k` elementary reflectors :math:`H(i)` (of order :math:`m` when applied from the left, or of order :math:`n` when applied from the right):
-:math:`Q = H(1)H(2) ... H(k)` as returned by the QR factorization routine
-:ref:`onemkl_lapack_geqrf`.
-
-Depending on the parameters ``side`` and ``trans``, the routine can form one of
-the matrix products :math:`QC`, :math:`Q^TC`, :math:`CQ`, or :math:`CQ^T`
-(overwriting the result over :math:`C`).
-
-ormqr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &c, std::int64_t ldc, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{T}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{T}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::trans``, the routine multiplies :math:`C`
-    by :math:`Q^{T}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q` 
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The buffer ``a`` as returned by :ref:`onemkl_lapack_geqrf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The buffer ``tau`` as returned by :ref:`onemkl_lapack_geqrf`.
-
-c
-    The buffer ``c`` contains the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by the
-    :ref:`onemkl_lapack_ormqr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{T}C`, :math:`CQ`, or
-    :math:`CQ^{T}` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-ormqr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{T}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{T}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::trans``, the routine multiplies :math:`C`
-    by :math:`Q^{T}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q`
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The pointer to ``a`` as returned by :ref:`onemkl_lapack_geqrf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The pointer to ``tau`` as returned by :ref:`onemkl_lapack_geqrf`.
-
-c
-    The pointer ``c`` points to the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by the
-    :ref:`onemkl_lapack_ormqr_scratchpad_size` function.
-
-events
-    List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{T}C`, :math:`CQ`, or
-    :math:`CQ^{T}` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
diff --git a/docs/domains/lapack/ormqr_scratchpad_size.rst b/docs/domains/lapack/ormqr_scratchpad_size.rst
deleted file mode 100644
index 66cd996d8..000000000
--- a/docs/domains/lapack/ormqr_scratchpad_size.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ormqr_scratchpad_size:
-
-ormqr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_ormqr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``ormqr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ormqr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-ormqr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_ormqr` function will be performed.
-
-side
-   If ``side=oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the left.
-
-   If ``side=oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the right.
-
-trans
-   If ``trans=oneapi::mkl::transpose::nontrans``, the routine multiplies
-   :math:`C` by :math:`Q`.
-
-   If ``trans=oneapi::mkl::transpose::trans``, the routine multiplies
-   :math:`C` by :math:`Q^{T}`.
-
-m
-   The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le m` if ``side=oneapi::mkl::side::left``; :math:`0 \le k \le n` if ``side=oneapi::mkl::side::right``).
-
-lda
-   The leading dimension of ``a``.
-
-ldc
-   The leading dimension of ``c``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ormqr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines` 
-
-
diff --git a/docs/domains/lapack/ormrq.rst b/docs/domains/lapack/ormrq.rst
deleted file mode 100644
index d49f0e3dc..000000000
--- a/docs/domains/lapack/ormrq.rst
+++ /dev/null
@@ -1,208 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ormrq:
-
-ormrq
-=====
-
-Multiplies a real matrix by the orthogonal matrix :math:`Q` of the RQ
-factorization formed by :ref:`onemkl_lapack_gerqf`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``ormrq`` supports the following precisions.
-
-    .. list-table::
-       :header-rows: 1
-
-       * -  T
-       * -  ``float``
-       * -  ``double``
-
-The routine multiplies a rectangular real :math:`m \times n` matrix :math:`C` by
-:math:`Q` or :math:`Q^T`, where :math:`Q` is the real orthogonal matrix defined
-as a product of :math:`k` elementary reflectors :math:`H(i)` of order :math:`n`:
-:math:`Q = H(1)^TH(2)^T ... H(k)^T` as returned by the RQ factorization routine
-:ref:`onemkl_lapack_gerqf`.
-
-Depending on the parameters ``side`` and ``trans``, the routine can form one of
-the matrix products :math:`QC`, :math:`Q^TC`, :math:`CQ`, or :math:`CQ^T`
-(overwriting the result over :math:`C`).
-
-ormrq (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &c, std::int64_t ldc, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{T}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{T}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::trans``, the routine multiplies :math:`C`
-    by :math:`Q^{T}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q`
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The buffer ``a`` as returned by :ref:`onemkl_lapack_gerqf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The buffer ``tau`` as returned by :ref:`onemkl_lapack_gerqf`.
-
-c
-    The buffer ``c`` contains the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by the
-    :ref:`onemkl_lapack_ormrq_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{T}C`, :math:`CQ`, or
-    :math:`CQ^{T}` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-ormrq (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{T}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{T}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::trans``, the routine multiplies :math:`C`
-    by :math:`Q^{T}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q`
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The pointer to ``a`` as returned by :ref:`onemkl_lapack_gerqf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The pointer to ``tau`` as returned by :ref:`onemkl_lapack_gerqf`.
-
-c
-    The pointer ``c`` points to the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by the
-    :ref:`onemkl_lapack_ormrq_scratchpad_size` function.
-
-events
-    List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{T}C`, :math:`CQ`, or
-    :math:`CQ^{T}` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/ormrq_scratchpad_size.rst b/docs/domains/lapack/ormrq_scratchpad_size.rst
deleted file mode 100644
index 6699947de..000000000
--- a/docs/domains/lapack/ormrq_scratchpad_size.rst
+++ /dev/null
@@ -1,81 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ormrq_scratchpad_size:
-
-ormrq_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_ormrq` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``ormrq_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-  
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ormrq` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-ormrq_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc);
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by the ormrq function will be performed.
-
-side
-   If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^T` is applied to :math:`C` from the left. 
-   
-   If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^T` is applied to :math:`C` from the right.
-
-trans
-   If ``trans=oneapi::mkl::transpose::nontrans``, the routine multiplies :math:`C` by :math:`Q`.
-
-   If ``trans=oneapi::mkl::transpose::trans``, the routine multiplies :math:`C` by :math:`Q^T`.
-
-m
-   The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
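-   The number of elementary reflectors whose product defines the matrix :math:`Q` (:math:`0 \le k \le m` if ``side = oneapi::mkl::side::left``; :math:`0 \le k \le n` if ``side = oneapi::mkl::side::right``).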
-
-lda
-   The leading dimension of ``a``.
-
-ldc
-   The leading dimension of ``c``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ormrq` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/ormtr.rst b/docs/domains/lapack/ormtr.rst
deleted file mode 100644
index d7413a1fe..000000000
--- a/docs/domains/lapack/ormtr.rst
+++ /dev/null
@@ -1,230 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ormtr:
-
-ormtr
-=====
-
-Multiplies a real matrix by the real orthogonal matrix :math:`Q` determined by
-:ref:`onemkl_lapack_sytrd`.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``ormtr`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-The routine multiplies a real matrix :math:`C` by :math:`Q` or :math:`Q^{T}`, 
-where :math:`Q` is the orthogonal matrix formed by :ref:`onemkl_lapack_sytrd`
-when reducing a real symmetric matrix :math:`A` to tridiagonal form:
-:math:`A = QTQ^T`. Use this routine after a call to :ref:`onemkl_lapack_sytrd`.
-
-Depending on the parameters side and trans, the routine can
-form one of the matrix products :math:`QC`, :math:`Q^TC`, :math:`CQ`, or
-:math:`CQ^T` (overwriting the result on :math:`C`).
-
-ormtr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &c, std::int64_t ldc, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-In the descriptions below, ``r`` denotes the order of :math:`Q`:
-
-.. container:: tablenoborder
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  :math:`r = m` 
-          -  if ``side = side::left`` 
-        * -  :math:`r = n` 
-          -  if ``side = side::right`` 
-
-queue
-   The queue where the routine should be executed.
-
-side
-   Must be either ``side::left`` or ``side::right``.
-
-   If ``side = side::left``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the left.
-
-   If ``side = side::right``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the right.
-
-upper_lower
-   Must be either ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_sytrd`.
-
-trans
-   Must be either ``transpose::nontrans`` or ``transpose::trans``.
-
-   If ``trans = transpose::nontrans``, the routine multiplies :math:`C`
-   by :math:`Q`.
-
-   If ``trans = transpose::trans``, the routine multiplies :math:`C` by
-   :math:`Q^{T}`.
-
-m
-   The number of rows in the matrix :math:`C` :math:`(m \ge 0)`.
-
-n
-   The number of columns in the matrix :math:`C` :math:`(n \ge 0)`.
-
-a
-   The buffer ``a`` as returned by   :ref:`onemkl_lapack_sytrd`.
-
-lda
-   The leading dimension of ``a`` :math:`(\max(1, r) \le \text{lda})`.
-
-tau
-   The buffer ``tau`` as returned by :ref:`onemkl_lapack_sytrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1, r-1)`.
-
-c
-   The buffer ``c`` contains the matrix :math:`C`. The second dimension of ``c``
-   must be at least :math:`\max(1, n)`.
-
-ldc
-   The leading dimension of ``c`` :math:`(\max(1, m) \le \text{ldc})`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ormtr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-   Overwritten by the product :math:`QC`, :math:`Q^TC`, :math:`CQ`, or :math:`CQ^T`
-   (as specified by ``side`` and ``trans``).
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-ormtr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-In the descriptions below, ``r`` denotes the order of :math:`Q`:
-
-.. container:: tablenoborder
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  :math:`r = m` 
-          -  if ``side = side::left`` 
-        * -  :math:`r = n` 
-          -  if ``side = side::right`` 
-
-queue
-   The queue where the routine should be executed.
-
-side
-   Must be either ``side::left`` or ``side::right``.
-
-   If ``side = side::left``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the left.
-
-   If ``side = side::right``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the right.
-
-upper_lower
-   Must be either ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to   :ref:`onemkl_lapack_sytrd`.
-
-trans
-   Must be either ``transpose::nontrans`` or ``transpose::trans``.
-
-   If ``trans = transpose::nontrans``, the routine multiplies :math:`C`
-   by :math:`Q`.
-
-   If ``trans = transpose::trans``, the routine multiplies :math:`C` by
-   :math:`Q^{T}`.
-
-m
-   The number of rows in the matrix :math:`C` :math:`(m \ge 0)`.
-
-n
-   The number of columns in the matrix :math:`C` :math:`(n \ge 0)`.
-
-a
-   The pointer to ``a`` as returned by   :ref:`onemkl_lapack_sytrd`.
-
-lda
-   The leading dimension of ``a`` :math:`(\max(1, r) \le \text{lda})`.
-
-tau
-   The pointer to ``tau`` as returned by :ref:`onemkl_lapack_sytrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1, r-1)`.
-
-c
-   The pointer to memory containing the matrix :math:`C`. The second dimension of ``c``
-   must be at least :math:`\max(1, n)`.
-
-ldc
-   The leading dimension of ``c`` :math:`(\max(1, m) \le \text{ldc})`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ormtr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-   Overwritten by the product :math:`QC`, :math:`Q^TC`, :math:`CQ`, or :math:`CQ^T`
-   (as specified by ``side`` and ``trans``).
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/lapack/ormtr_scratchpad_size.rst b/docs/domains/lapack/ormtr_scratchpad_size.rst
deleted file mode 100644
index a71506b8e..000000000
--- a/docs/domains/lapack/ormtr_scratchpad_size.rst
+++ /dev/null
@@ -1,105 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ormtr_scratchpad_size:
-
-ormtr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_ormtr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``ormtr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ormtr` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-ormtr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-In the descriptions below, ``r`` denotes the order of :math:`Q`:
-
-.. container:: tablenoborder
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  :math:`r = m` 
-          -  if ``side = side::left`` 
-        * -  :math:`r = n` 
-          -  if ``side = side::right`` 
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_ormtr` function will be performed.
-
-side
-   Must be either ``side::left`` or ``side::right``.
-
-   If ``side = side::left``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the left.
-
-   If ``side = side::right``, :math:`Q` or :math:`Q^{T}` is
-   applied to :math:`C` from the right.
-
-upper_lower
-   Must be either ``uplo::upper`` or ``uplo::lower``. Uses the
-   same ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_sytrd`.
-
-trans
-   Must be either ``transpose::nontrans`` or ``transpose::trans``.
-
-   If ``trans = transpose::nontrans``, the routine multiplies
-   :math:`C` by :math:`Q`.
-
-   If ``trans = transpose::trans``, the routine multiplies :math:`C`
-   by :math:`Q^{T}`.
-
-m
-   The number of rows in the matrix :math:`C` :math:`(m \ge 0)`.
-
-n
-   The number of columns in the matrix :math:`C` :math:`(n \ge 0)`.
-
-lda
-   The leading dimension of ``a`` :math:`(\max(1, r) \le \text{lda})`.
-
-ldc
-   The leading dimension of ``c`` :math:`(\max(1, m) \le \text{ldc})`.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ormtr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/potrf.rst b/docs/domains/lapack/potrf.rst
deleted file mode 100644
index dd044e1f2..000000000
--- a/docs/domains/lapack/potrf.rst
+++ /dev/null
@@ -1,172 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrf:
-
-potrf
-=====
-
-Computes the Cholesky factorization of a symmetric (Hermitian)
-positive-definite matrix.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``potrf`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine forms the Cholesky factorization of a symmetric
-positive-definite or, for complex data, Hermitian positive-definite
-matrix :math:`A`:
-
-    .. list-table:: 
-       :header-rows: 1
- 
-       * -  :math:`A` = :math:`U^{T}U` for real data, :math:`A = U^{H}U` for complex data
-         -  if upper_lower=\ ``oneapi::mkl::uplo::upper`` 
-       * -  :math:`A` = :math:`LL^{T}` for real data, :math:`A = LL^{H}` for complex data
-         -  if upper_lower=\ ``oneapi::mkl::uplo::lower`` 
-
-where :math:`L` is a lower triangular matrix and :math:`U` is upper
-triangular.
-
-potrf (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void potrf(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of :math:`A` is
-   stored and how :math:`A` is factored:
-
-   If upper_lower=\ ``oneapi::mkl::uplo::upper``, the array ``a`` stores the
-   upper triangular part of the matrix :math:`A`, and the strictly lower
-   triangular part of the matrix is not referenced.
-
-   If upper_lower=\ ``oneapi::mkl::uplo::lower``, the array ``a`` stores the
-   lower triangular part of the matrix :math:`A`, and the strictly upper
-   triangular part of the matrix is not referenced.
-
-n
-   Specifies the order of the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   Buffer holding input matrix :math:`A`. The buffer ``a`` contains either
-   the upper or the lower triangular part of the matrix :math:`A` (see
-   upper_lower). The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_potrf_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   The buffer ``a`` is overwritten by the Cholesky factor :math:`U` or :math:`L`,
-   as specified by ``upper_lower``.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-potrf (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of :math:`A` is
-   stored and how :math:`A` is factored:
-
-   If upper_lower=\ ``oneapi::mkl::uplo::upper``, the array ``a`` stores the
-   upper triangular part of the matrix :math:`A`, and the strictly lower
-   triangular part of the matrix is not referenced.
-
-   If upper_lower=\ ``oneapi::mkl::uplo::lower``, the array ``a`` stores the
-   lower triangular part of the matrix :math:`A`, and the strictly upper
-   triangular part of the matrix is not referenced.
-
-n
-   Specifies the order of the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   Pointer to input matrix :math:`A`. The array ``a`` contains either
-   the upper or the lower triangular part of the matrix :math:`A` (see
-   upper_lower). The second dimension of ``a`` must be at least
-   :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_potrf_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   The memory pointed to by pointer ``a`` is overwritten by the Cholesky factor :math:`U` or :math:`L`,
-   as specified by ``upper_lower``.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/potrf_batch.rst b/docs/domains/lapack/potrf_batch.rst
deleted file mode 100644
index 872886cd7..000000000
--- a/docs/domains/lapack/potrf_batch.rst
+++ /dev/null
@@ -1,239 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrf_batch:
-
-potrf_batch
-===========
-
-Computes the Cholesky factorizations of a batch of symmetric (Hermitian) positive-definite matrices.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``potrf_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_lapack_potrf_batch_buffer:
-
-potrf_batch (Buffer Version)
-----------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The buffer version of ``potrf_batch`` supports only the strided API. 
-   
-**Strided API**
-
- | The routine forms the Cholesky factorizations of symmetric positive-definite or, for complex data, Hermitian positive-definite matrices :math:`A_i`, :math:`i \in \{1...batch\_size\}`:
- | :math:`A_i = U_i^TU_i` for real data, :math:`A_i = U_i^HU_i` for complex data if ``uplo = mkl::uplo::upper``,
- | :math:`A_i = L_iL_i^T` for real data, :math:`A_i = L_iL_i^H` for complex data if ``uplo = mkl::uplo::lower``,
- | where :math:`L_i` is a lower triangular matrix and :math:`U_i` is upper triangular.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void potrf_batch(sycl::queue &queue, mkl::uplo uplo, std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo
-   | Indicates whether the upper or lower triangular part of :math:`A_i` is stored and how :math:`A_i` is factored:
-   | If ``uplo = mkl::uplo::upper``, the array ``a`` stores the upper triangular parts of the matrices :math:`A_i`,
-   | If ``uplo = mkl::uplo::lower``, the array ``a`` stores the lower triangular parts of the matrices :math:`A_i`.
-
-n
-  Order of the matrices :math:`A_i`, (:math:`0 \le n`).
-
-a
-  Array containing batch of input matrices :math:`A_i`, each of :math:`A_i` being of size :math:`\text{lda} \cdot n` and holding either upper or lower triangular parts of the matrices :math:`A_i` (see ``uplo``).
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_potrf_batch_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-	Cholesky factors :math:`U_i` or :math:`L_i`, as specified by ``uplo``.
-
-.. _onemkl_lapack_potrf_batch_usm:
-
-potrf_batch (USM Version)
--------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The USM version of ``potrf_batch`` supports the group API and strided API. 
-
-**Group API**
-
- | The routine forms the Cholesky factorizations of symmetric positive-definite or, for complex data, Hermitian positive-definite matrices :math:`A_i`, :math:`i \in \{1...batch\_size\}`:
- | :math:`A_i = U_i^TU_i` for real data (:math:`A_i = U_i^HU_i` for complex), if :math:`\text{uplo}_g` is ``mkl::uplo::upper``,
- | :math:`A_i = L_iL_i^T` for real data (:math:`A_i = L_iL_i^H` for complex), if :math:`\text{uplo}_g` is ``mkl::uplo::lower``,
- | where :math:`L_i` is a lower triangular matrix and :math:`U_i` is upper triangular, :math:`g` is an index of group of parameters corresponding to :math:`A_i`, and total number of problems to solve, ``batch_size``, is a sum of sizes of all of the groups of parameters as provided by ``group_sizes`` array
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event potrf_batch(sycl::queue &queue, mkl::uplo *uplo, std::int64_t *n, T **a, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo
-  | Array of ``group_count`` :math:`\text{uplo}_g` parameters. Each :math:`\text{uplo}_g` indicates whether the upper or lower triangular parts of the input matrices are provided:
-  | If :math:`\text{uplo}_g` is ``mkl::uplo::upper``, input matrices from array ``a`` belonging to group :math:`g` store the upper triangular parts,
-  | If :math:`\text{uplo}_g` is ``mkl::uplo::lower``, input matrices from array ``a`` belonging to group :math:`g` store the lower triangular parts.
-
-n
-  Array of ``group_count`` :math:`n_g` parameters. Each :math:`n_g` specifies the order of the input matrices from array a belonging to group :math:`g`.
-
-a
-  Array of ``batch_size`` pointers to input matrices :math:`A_i`, each being of size :math:`\text{lda}_g \cdot n_g` (:math:`g` is an index of group to which :math:`A_i` belongs to) and holding either upper or lower triangular part as specified by :math:`\text{uplo}_g`.
-
-lda
-  Array of ``group_count`` :math:`\text{lda}_g` parameters. Each :math:`\text{lda}_g` specifies the leading dimensions of the matrices from a belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of group_count integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_potrf_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-	Cholesky factors :math:`U_i` or :math:`L_i`, as specified by :math:`\text{uplo}_g` from corresponding group of parameters.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
- | The routine forms the Cholesky factorizations of symmetric positive-definite or, for complex data, Hermitian positive-definite matrices :math:`A_i`, :math:`i \in \{1...batch\_size\}`:
- | :math:`A_i = U_i^TU_i` for real data, :math:`A_i = U_i^HU_i` for complex data if ``uplo = mkl::uplo::upper``,
- | :math:`A_i = L_iL_i^T` for real data, :math:`A_i = L_iL_i^H` for complex data if ``uplo = mkl::uplo::lower``,
- | where :math:`L_i` is a lower triangular matrix and :math:`U_i` is upper triangular.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event potrf_batch(sycl::queue &queue, mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo
-   | Indicates whether the upper or lower triangular part of :math:`A_i` is stored and how :math:`A_i` is factored:
-   | If ``uplo = mkl::uplo::upper``, the array ``a`` stores the upper triangular parts of the matrices :math:`A_i`,
-   | If ``uplo = mkl::uplo::lower``, the array ``a`` stores the lower triangular parts of the matrices :math:`A_i`.
-
-n
-  Order of the matrices :math:`A_i`, (:math:`0 \le n`).
-
-a
-  Array containing batch of input matrices :math:`A_i`, each of :math:`A_i` being of size :math:`\text{lda} \cdot n` and holding either upper or lower triangular parts of the matrices :math:`A_i` (see ``uplo``).
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_potrf_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-	Cholesky factors :math:`U_i` or :math:`L_i`, as specified by ``uplo``.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/potrf_batch_scratchpad_size.rst b/docs/domains/lapack/potrf_batch_scratchpad_size.rst
deleted file mode 100644
index 4a1b57bf8..000000000
--- a/docs/domains/lapack/potrf_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,120 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrf_batch_scratchpad_size:
-
-potrf_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_potrf_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``potrf_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_potrf_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, mkl::uplo *uplo, std::int64_t *n, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo
- | Array of ``group_count`` :math:`\text{uplo}_g` parameters.
- | Each of :math:`\text{uplo}_g` indicates whether the upper or lower triangular parts of the input matrices are provided:
- | If :math:`\text{uplo}_g` is ``mkl::uplo::upper``, input matrices from array ``a`` belonging to group :math:`g` store the upper triangular parts,
- | If :math:`\text{uplo}_g` is ``mkl::uplo::lower``, input matrices from array ``a`` belonging to group :math:`g` store the lower triangular parts.
-
-n
- | Array of ``group_count`` :math:`n_g` parameters.
- | Each :math:`n_g` specifies the order of the input matrices belonging to group :math:`g`.
-
-lda
- | Array of ``group_count`` :math:`\text{lda}_g` parameters.
- | Each :math:`\text{lda}_g` specifies the leading dimensions of the matrices belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes 
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_potrf_batch` function.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_potrf_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-uplo
- | Indicates whether the upper or lower triangular part of :math:`A_i` is stored and how :math:`A_i` is factored:
- | If ``uplo = mkl::uplo::upper``, the array ``a`` stores the upper triangular parts of the matrices :math:`A_i`,
- | If ``uplo = mkl::uplo::lower``, the array ``a`` stores the lower triangular parts of the matrices :math:`A_i`.
-
-n
-  Order of the matrices :math:`A_i`, (:math:`0 \le n`).
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch.
-
-batch_size
-  Number of problems in a batch.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_potrf_batch` function.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/potrf_scratchpad_size.rst b/docs/domains/lapack/potrf_scratchpad_size.rst
deleted file mode 100644
index c1e423fa2..000000000
--- a/docs/domains/lapack/potrf_scratchpad_size.rst
+++ /dev/null
@@ -1,77 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrf_scratchpad_size:
-
-potrf_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_potrf` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``potrf_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_potrf` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-potrf_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_potrf` function will be performed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of :math:`A` is
-   stored and how :math:`A` is factored:
-
-   If ``upper_lower = oneapi::mkl::uplo::upper``, the array ``a`` stores the
-   upper triangular part of the matrix :math:`A`, and the strictly lower
-   triangular part of the matrix is not referenced.
-
-   If ``upper_lower = oneapi::mkl::uplo::lower``, the array ``a`` stores the
-   lower triangular part of the matrix :math:`A`, and the strictly upper
-   triangular part of the matrix is not referenced.
-
-n
-   Specifies the order of the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_potrf` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/potri.rst b/docs/domains/lapack/potri.rst
deleted file mode 100644
index 3e8de09d7..000000000
--- a/docs/domains/lapack/potri.rst
+++ /dev/null
@@ -1,144 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potri:
-
-potri
-=====
-
-Computes the inverse of a symmetric (Hermitian) positive-definite
-matrix using the Cholesky factorization.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``potri`` supports the following precisions.
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -  T 
-         * -  ``float`` 
-         * -  ``double`` 
-         * -  ``std::complex<float>`` 
-         * -  ``std::complex<double>`` 
-
-The routine computes the inverse :math:`A^{-1}` of a symmetric positive
-definite or, for complex flavors, Hermitian positive-definite matrix
-:math:`A`. Before calling this routine, call :ref:`onemkl_lapack_potrf`
-to factorize :math:`A`.
-
-potri (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void potri(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates how the input matrix :math:`A` has been    factored:
-
-   If ``upper_lower = oneapi::mkl::uplo::upper``, the upper   triangle of :math:`A` is stored.
-
-   If   ``upper_lower = oneapi::mkl::uplo::lower``, the lower triangle of :math:`A` is   stored.
-
-n
-   Specifies the order of the matrix    :math:`A` (:math:`0 \le n`).
-
-a
-   Contains the factorization of the matrix :math:`A`, as    returned by   :ref:`onemkl_lapack_potrf`.   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_potri_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   Overwritten by the upper or lower triangle of the inverse    of :math:`A`. Specified by ``upper_lower``.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-potri (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates how the input matrix :math:`A` has been    factored:
-
-   If ``upper_lower = oneapi::mkl::uplo::upper``, the upper   triangle of :math:`A` is stored.
-
-   If   ``upper_lower = oneapi::mkl::uplo::lower``, the lower triangle of :math:`A` is   stored.
-
-n
-   Specifies the order of the matrix    :math:`A` (:math:`0 \le n`).
-
-a
-   Contains the factorization of the matrix :math:`A`, as    returned by   :ref:`onemkl_lapack_potrf`.   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_potri_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   Overwritten by the upper or lower triangle of the inverse    of :math:`A`. Specified by ``upper_lower``.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/potri_scratchpad_size.rst b/docs/domains/lapack/potri_scratchpad_size.rst
deleted file mode 100644
index 07b14a341..000000000
--- a/docs/domains/lapack/potri_scratchpad_size.rst
+++ /dev/null
@@ -1,71 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potri_scratchpad_size:
-
-potri_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_potri` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``potri_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_potri` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-potri_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_potri` function will be performed.
-
-upper_lower
-   Indicates how the input matrix :math:`A` has been    factored:
-
-   If ``upper_lower = oneapi::mkl::uplo::upper``, the upper   triangle of :math:`A` is stored.
-
-   If   ``upper_lower = oneapi::mkl::uplo::lower``, the lower triangle of :math:`A` is   stored.
-
-n
-   Specifies the order of the matrix    :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_potri` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/potrs.rst b/docs/domains/lapack/potrs.rst
deleted file mode 100644
index 26fbc7c31..000000000
--- a/docs/domains/lapack/potrs.rst
+++ /dev/null
@@ -1,177 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrs:
-
-potrs
-=====
-
-Solves a system of linear equations with a Cholesky-factored
-symmetric (Hermitian) positive-definite coefficient matrix.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``potrs`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine solves for :math:`X` the system of linear equations
-:math:`AX = B` with a symmetric positive-definite or, for complex data,
-Hermitian positive-definite matrix :math:`A`, given the Cholesky
-factorization of :math:`A`:
-
-.. list-table:: 
-   :header-rows: 1
-
-   * -  :math:`A = U^TU` for real data, :math:`A = U^HU` for complex data
-     -  if ``upper_lower=oneapi::mkl::uplo::upper``
-   * -  :math:`A = LL^T` for real data, :math:`A = LL^H` for complex data
-     -  if ``upper_lower=oneapi::mkl::uplo::lower``
-
-where :math:`L` is a lower triangular matrix and :math:`U` is upper
-triangular. The system is solved with multiple right-hand sides
-stored in the columns of the matrix :math:`B`.
-
-Before calling this routine, you must call :ref:`onemkl_lapack_potrf` to compute
-the Cholesky factorization of :math:`A`.
-
-potrs (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void potrs(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t nrhs, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates how the input matrix has been factored:
-
-   If ``upper_lower = oneapi::mkl::uplo::upper``, the upper triangle   :math:`U` of :math:`A` is stored, where :math:`A` = :math:`U^{T}U`   for real data, :math:`A` = :math:`U^{H}U` for complex data.
-
-   If ``upper_lower = oneapi::mkl::uplo::lower``, the lower triangle   :math:`L` of :math:`A` is stored, where :math:`A` = :math:`LL^{T}`   for real data, :math:`A` = :math:`LL^{H}` for complex   data.
-
-n
-   The order of matrix :math:`A` (:math:`0 \le n`).
-
-nrhs
-   The number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-a
-   Buffer containing the factorization of the matrix A, as    returned by   :ref:`onemkl_lapack_potrf`.   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-b
-   The array ``b`` contains the matrix :math:`B` whose columns    are the right-hand sides for the systems of equations. The second   dimension of ``b`` must be at least :math:`\max(1,\text{nrhs})`.
-
-ldb
-   The leading dimension of ``b``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_potrs_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-b
-   Overwritten by the solution matrix :math:`X`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-potrs (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t nrhs, T *a, std::int64_t lda, T *b, std::int64_t ldb, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates how the input matrix has been factored:
-
-   If ``upper_lower = oneapi::mkl::uplo::upper``, the upper triangle   :math:`U` of :math:`A` is stored, where :math:`A` = :math:`U^{T}U`   for real data, :math:`A` = :math:`U^{H}U` for complex data.
-
-   If ``upper_lower = oneapi::mkl::uplo::lower``, the lower triangle   :math:`L` of :math:`A` is stored, where :math:`A` = :math:`LL^{T}`   for real data, :math:`A` = :math:`LL^{H}` for complex   data.
-
-n
-   The order of matrix :math:`A` (:math:`0 \le n`).
-
-nrhs
-   The number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-a
-   Pointer to array containing the factorization of the matrix :math:`A`, as    returned by   :ref:`onemkl_lapack_potrf`.   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-b
-   The array ``b`` contains the matrix :math:`B` whose columns    are the right-hand sides for the systems of equations. The second   dimension of ``b`` must be at least :math:`\max(1,\text{nrhs})`.
-
-ldb
-   The leading dimension of ``b``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_potrs_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-b
-   Overwritten by the solution matrix :math:`X`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
-
diff --git a/docs/domains/lapack/potrs_batch.rst b/docs/domains/lapack/potrs_batch.rst
deleted file mode 100644
index f9c8f6477..000000000
--- a/docs/domains/lapack/potrs_batch.rst
+++ /dev/null
@@ -1,276 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrs_batch:
-
-potrs_batch
-===========
-
-Solves systems of linear equations with Cholesky-factored symmetric (Hermitian) positive-definite coefficient matrices for a batch of problems.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``potrs_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_lapack_potrs_batch_buffer:
-
-potrs_batch (Buffer Version)
-----------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The buffer version of ``potrs_batch`` supports only the strided API. 
-   
-**Strided API**
-
- | The routine solves for :math:`X_i` the systems of linear equations :math:`A_iX_i = B_i` with a symmetric positive-definite or, for complex data, Hermitian positive-definite matrices :math:`A_i`, given the Cholesky factorization of :math:`A_i`, :math:`i \in \{1...batch\_size\}`:
- | :math:`A_i = U_i^TU_i` for real data, :math:`A_i = U_i^HU_i` for complex data if ``uplo = mkl::uplo::upper``,
- | :math:`A_i = L_iL_i^T` for real data, :math:`A_i = L_iL_i^H` for complex data if ``uplo = mkl::uplo::lower``,
- | where :math:`L_i` is a lower triangular matrix and :math:`U_i` is upper triangular.
- | The systems are solved with multiple right-hand sides stored in the columns of the matrices :math:`B_i`.
- | Before calling this routine, matrices :math:`A_i` should be factorized by call to the Strided API of the :ref:`onemkl_lapack_potrf_batch_buffer` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void potrs_batch(sycl::queue &queue, mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, sycl::buffer<T> &b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo
- | Indicates how the input matrices have been factored:
- | If ``uplo = mkl::uplo::upper``, the upper triangle :math:`U_i` of :math:`A_i` is stored, where :math:`A_i = U_i^TU_i` for real data, :math:`A_i = U_i^HU_i` for complex data.
- | If ``uplo = mkl::uplo::lower``, the lower triangle :math:`L_i` of :math:`A_i` is stored, where :math:`A_i = L_iL_i^T` for real data, :math:`A_i = L_iL_i^H` for complex data.
-
-n
-  The order of matrices :math:`A_i` (:math:`0 \le n`).
-
-nrhs
-  The number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-a
-  Array containing batch of factorizations of the matrices :math:`A_i`, as returned by the Strided API of the :ref:`onemkl_lapack_potrf_batch_buffer` function.
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices inside the batch array ``a``.
-
-b
-  Array containing batch of matrices :math:`B_i` whose columns are the right-hand sides for the systems of equations.
-
-ldb
-  Leading dimension of :math:`B_i`.
-
-stride_b
-  Stride between the beginnings of matrices :math:`B_i` inside the batch array ``b``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_potrs_batch_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-b
-  Solution matrices :math:`X_i`.
-
-.. _onemkl_lapack_potrs_batch_usm:
-
-potrs_batch (USM Version)
--------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The USM version of ``potrs_batch`` supports the group API and strided API. 
-
-**Group API**
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event potrs_batch(sycl::queue &queue, mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, T **a, std::int64_t *lda, T **b, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo  
- | Array of ``group_count`` :math:`\text{uplo}_g` parameters.
- | Each of :math:`\text{uplo}_g` indicates whether the upper or lower triangular parts of the input matrices are provided:
- | If :math:`\text{uplo}_g` is ``mkl::uplo::upper``, input matrices from array ``a`` belonging to group :math:`g` store the upper triangular parts,
- | If :math:`\text{uplo}_g` is ``mkl::uplo::lower``, input matrices from array ``a`` belonging to group :math:`g` store the lower triangular parts.
-
-n
- | Array of ``group_count`` :math:`n_g` parameters.
- | Each :math:`n_g` specifies the order of the input matrices from array ``a`` belonging to group :math:`g`.
-
-nrhs
- | Array of ``group_count`` :math:`\text{nrhs}_g` parameters.
- | Each :math:`\text{nrhs}_g` specifies the number of right-hand sides supplied for group :math:`g` in corresponding part of array ``b``.
-
-a
-  Array of ``batch_size`` pointers to Cholesky factored matrices :math:`A_i` as returned by the Group API of the :ref:`onemkl_lapack_potrf_batch_usm` function.
-
-lda
- | Array of ``group_count`` :math:`\text{lda}_g` parameters.
- | Each :math:`\text{lda}_g` specifies the leading dimensions of the matrices from ``a`` belonging to group :math:`g`.
-
-b
-  Array of ``batch_size`` pointers to right-hand side matrices :math:`B_i`, each of size :math:`\text{ldb}_g \cdot \text{nrhs}_g`, where :math:`g` is an index of group corresponding to :math:`B_i`.
-
-ldb
- | Array of ``group_count`` :math:`\text{ldb}_g` parameters.
- | Each :math:`\text{ldb}_g` specifies the leading dimensions of the matrices from ``b`` belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_potrs_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-b
-  Solution matrices :math:`X_i`.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
- | The routine solves for :math:`X_i` the systems of linear equations :math:`A_iX_i = B_i` with a symmetric positive-definite or, for complex data, Hermitian positive-definite matrices :math:`A_i`, given the Cholesky factorization of :math:`A_i`, :math:`i \in \{1...batch\_size\}`:
- | :math:`A_i = U_i^TU_i` for real data, :math:`A_i = U_i^HU_i` for complex data if ``uplo = mkl::uplo::upper``,
- | :math:`A_i = L_iL_i^T` for real data, :math:`A_i = L_iL_i^H` for complex data if ``uplo = mkl::uplo::lower``,
- | where :math:`L_i` is a lower triangular matrix and :math:`U_i` is upper triangular.
- | The systems are solved with multiple right-hand sides stored in the columns of the matrices :math:`B_i`.
- | Before calling this routine, matrices :math:`A_i` should be factorized by call to the Strided API of the :ref:`onemkl_lapack_potrf_batch_usm` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event potrs_batch(sycl::queue &queue, mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T *a, std::int64_t lda, std::int64_t stride_a, T *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo
- | Indicates how the input matrices have been factored:
- | If ``uplo = mkl::uplo::upper``, the upper triangle :math:`U_i` of :math:`A_i` is stored, where :math:`A_i = U_i^TU_i` for real data, :math:`A_i = U_i^HU_i` for complex data.
- | If ``uplo = mkl::uplo::lower``, the lower triangle :math:`L_i` of :math:`A_i` is stored, where :math:`A_i = L_iL_i^T` for real data, :math:`A_i = L_iL_i^H` for complex data.
-
-n
-  The order of matrices :math:`A_i` (:math:`0 \le n`).
-
-nrhs
-  The number of right-hand sides (:math:`0 \le nrhs`).
-
-a
-  Array containing batch of factorizations of the matrices :math:`A_i`, as returned by the Strided API of the :ref:`onemkl_lapack_potrf_batch_usm` function.
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices inside the batch array ``a``.
-
-b
-  Array containing batch of matrices :math:`B_i` whose columns are the right-hand sides for the systems of equations.
-
-ldb
-  Leading dimension of :math:`B_i`.
-
-stride_b
-  Stride between the beginnings of matrices :math:`B_i` inside the batch array ``b``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_potrs_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-b
-  Solution matrices :math:`X_i`.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/potrs_batch_scratchpad_size.rst b/docs/domains/lapack/potrs_batch_scratchpad_size.rst
deleted file mode 100644
index 6136fd8b2..000000000
--- a/docs/domains/lapack/potrs_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,136 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrs_batch_scratchpad_size:
-
-potrs_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_potrs_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``potrs_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``float`` 
-      * -  ``double`` 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_potrs_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-uplo
- | Array of ``group_count`` :math:`\text{uplo}_g` parameters.
- | Each of :math:`\text{uplo}_g` indicates whether the upper or lower triangular parts of the input matrices are provided:
- | If :math:`\text{uplo}_g` is ``mkl::uplo::upper``, input matrices from array ``a`` belonging to group :math:`g` store the upper triangular parts,
- | If :math:`\text{uplo}_g` is ``mkl::uplo::lower``, input matrices from array ``a`` belonging to group :math:`g` store the lower triangular parts.
-
-n
- | Array of ``group_count`` :math:`n_g` parameters.
- | Each :math:`n_g` specifies the order of the input matrices belonging to group :math:`g`.
-
-nrhs
- | Array of ``group_count`` :math:`\text{nrhs}_g` parameters.
- | Each :math:`\text{nrhs}_g` specifies the number of right-hand sides supplied for group :math:`g`.
-
-lda
- | Array of ``group_count`` :math:`\text{lda}_g` parameters.
- | Each :math:`\text{lda}_g` specifies the leading dimensions of the matrices belonging to group :math:`g`.
-
-ldb
- | Array of ``group_count`` :math:`\text{ldb}_g` parameters.
- | Each :math:`\text{ldb}_g` specifies the leading dimensions of the matrices belonging to group :math:`g`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes Array of group_count integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_potrs_batch` function.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_potrs_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size)
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-uplo
- | Indicates how the input matrices have been factored:
- | If ``uplo = mkl::uplo::upper``, the upper triangle :math:`U_i` of :math:`A_i` is stored, where :math:`A_i = U_i^TU_i` for real data, :math:`A_i = U_i^HU_i` for complex data.
- | If ``uplo = mkl::uplo::lower``, the lower triangle :math:`L_i` of :math:`A_i` is stored, where :math:`A_i = L_iL_i^T` for real data, :math:`A_i = L_iL_i^H` for complex data.
-
-n
-  Order of matrices :math:`A_i` (:math:`0 \le n`).
-
-nrhs  
-  Number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-lda
-  Leading dimension of :math:`A_i`.
-
-stride_a
-  Stride between the beginnings of matrices inside the batch array ``a``.
-
-ldb
-  Leading dimensions of :math:`B_i`.
-
-stride_b
-  Stride between the beginnings of matrices :math:`B_i` inside the batch array ``b``.
-
-batch_size
-  Number of problems in a batch.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_potrs_batch` function.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/potrs_scratchpad_size.rst b/docs/domains/lapack/potrs_scratchpad_size.rst
deleted file mode 100644
index 125d68944..000000000
--- a/docs/domains/lapack/potrs_scratchpad_size.rst
+++ /dev/null
@@ -1,77 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_potrs_scratchpad_size:
-
-potrs_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_potrs` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``potrs_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-    
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_potrs` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-potrs_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_potrs` function will be performed.
-
-upper_lower
-   Indicates how the input matrix has been factored:
-
-   If ``upper_lower = oneapi::mkl::uplo::upper``, the upper triangle   :math:`U` of :math:`A` is stored, where :math:`A = U^{T}U`   for real data, :math:`A = U^{H}U` for complex data.
-
-   If ``upper_lower = oneapi::mkl::uplo::lower``, the lower triangle   :math:`L` of :math:`A` is stored, where :math:`A = LL^{T}`   for real data, :math:`A = LL^{H}` for complex   data.
-
-n
-   The order of matrix :math:`A` (:math:`0 \le n`).
-
-nrhs
-   The number of right-hand sides (:math:`0 \le nrhs`).
-
-lda
-   The leading dimension of ``a``.
-
-ldb
-   The leading dimension of ``b``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_potrs` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/syevd.rst b/docs/domains/lapack/syevd.rst
deleted file mode 100644
index 93df0ba27..000000000
--- a/docs/domains/lapack/syevd.rst
+++ /dev/null
@@ -1,186 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_syevd:
-
-syevd
-=====
-
-Computes all eigenvalues and, optionally, all eigenvectors of a real
-symmetric matrix using divide and conquer algorithm.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``syevd`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-
-The routine computes all the eigenvalues, and optionally all the
-eigenvectors, of a real symmetric matrix :math:`A`. In other words, it
-can compute the spectral factorization of :math:`A` as: :math:`A = Z\Lambda Z^T`.
-
-Here :math:`\Lambda` is a diagonal matrix whose diagonal elements are the
-eigenvalues :math:`\lambda_i`, and :math:`Z` is the orthogonal matrix whose
-columns are the eigenvectors :math:`z_{i}`. Thus,
-
-:math:`A z_i = \lambda_i z_i` for :math:`i = 1, 2, ..., n`.
-
-If the eigenvectors are requested, then this routine uses a divide
-and conquer algorithm to compute eigenvalues and eigenvectors.
-However, if only eigenvalues are required, then it uses the
-Pal-Walker-Kahan variant of the QL or QR algorithm.
-
-syevd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &w, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   The buffer ``a``, size (``lda,*``). The buffer ``a`` contains the matrix
-   :math:`A`. The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``. Must be at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_syevd_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   If ``jobz = job::vec``, then on exit this buffer is overwritten by
-   the orthogonal matrix :math:`Z` which contains the eigenvectors of
-   :math:`A`.
-
-w
-   Buffer, size at least :math:`n`. Contains the eigenvalues
-   of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-syevd (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *w, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-a
-   Pointer to array containing :math:`A`, size (``lda,*``).
-   The second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``. Must be at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_syevd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   If ``jobz = job::vec``, then on exit this array is overwritten by
-   the orthogonal matrix :math:`Z` which contains the eigenvectors of
-   :math:`A`.
-
-w
-   Pointer to array of size at least :math:`n`. Contains the eigenvalues
-   of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/syevd_scratchpad_size.rst b/docs/domains/lapack/syevd_scratchpad_size.rst
deleted file mode 100644
index e9d08fc18..000000000
--- a/docs/domains/lapack/syevd_scratchpad_size.rst
+++ /dev/null
@@ -1,81 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_syevd_scratchpad_size:
-
-syevd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_syevd` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``syevd_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_syevd` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-syevd_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_syevd` function will be performed.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``. Currently ``lda`` is not referenced in
-   this function.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_syevd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/sygvd.rst b/docs/domains/lapack/sygvd.rst
deleted file mode 100644
index 797fdf3a6..000000000
--- a/docs/domains/lapack/sygvd.rst
+++ /dev/null
@@ -1,249 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_sygvd:
-
-sygvd
-=====
-
-Computes all eigenvalues and, optionally, eigenvectors of a real
-generalized symmetric definite eigenproblem using a divide and
-conquer method.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``sygvd`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-The routine computes all the eigenvalues, and optionally, the
-eigenvectors of a real generalized symmetric-definite eigenproblem,
-of the form
-
-:math:`Ax = \lambda Bx`, :math:`ABx = \lambda x`, or :math:`BAx = \lambda x` .
-
-Here :math:`A` and :math:`B` are assumed to be symmetric and :math:`B` is also
-positive definite.
-
-It uses a divide and conquer algorithm.
-
-sygvd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, sycl::buffer<T,1> &w, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-itype
-   Must be 1 or 2 or 3. Specifies the problem type to be solved:
-
-   if :math:`\text{itype} = 1`, the problem type is :math:`Ax =  \lambda Bx`;
-
-   if :math:`\text{itype} = 2`, the problem type is :math:`ABx = \lambda x`;
-
-   if :math:`\text{itype} = 3`, the problem type is :math:`BAx = \lambda x`.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` and ``b`` store the upper
-   triangular part of :math:`A` and :math:`B`.
-
-   If ``upper_lower = uplo::lower``, ``a`` and ``b`` store the lower
-   triangular part of :math:`A` and :math:`B`.
-
-n
-   The order of the matrices :math:`A` and :math:`B` :math:`(0 \le n)`.
-
-a
-   Buffer, size ``(lda,*)``. Contains the upper or lower triangle
-   of the symmetric matrix :math:`A`, as specified by ``upper_lower``. The
-   second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1, n)`.
-
-b
-   Buffer, size ``(ldb,*)``. Contains the upper or lower triangle
-   of the symmetric matrix :math:`B`, as specified by ``upper_lower``. The
-   second dimension of ``b`` must be at least :math:`\max(1, n)`.
-
-ldb
-   The leading dimension of ``b``; at least :math:`\max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_sygvd_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   On exit, if ``jobz = job::vec``, then if :math:`\text{info} = 0`, ``a``
-   contains the matrix :math:`Z` of eigenvectors. The eigenvectors are
-   normalized as follows:
-
-   if :math:`\text{itype} = 1` or :math:`2` , :math:`Z^{T}BZ = I`;
-
-   if :math:`\text{itype} = 3` , :math:`Z^{T}B^{-1}Z = I`;
-
-   If ``jobz = job::novec``, then on exit the upper triangle (if
-   ``upper_lower = uplo::upper``) or the lower triangle (if
-   ``upper_lower = uplo::lower``) of :math:`A`, including the diagonal,
-   is destroyed.
-
-b
-   On exit, if :math:`\text{info} \le n`, the part of ``b`` containing the matrix is
-   overwritten by the triangular factor :math:`U` or :math:`L` from the
-   Cholesky factorization :math:`B = U^{T}U` or
-   :math:`B = LL^{T}`.
-
-w
-   Buffer, size at least :math:`n`. If :math:`\text{info} = 0`, contains the
-   eigenvalues of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-sygvd (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *b, std::int64_t ldb, T *w, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-itype
-   Must be 1 or 2 or 3. Specifies the problem type to be solved:
-
-   if :math:`\text{itype} = 1`, the problem type is :math:`Ax =  \lambda Bx`;
-
-   if :math:`\text{itype} = 2`, the problem type is :math:`ABx = \lambda x`;
-
-   if :math:`\text{itype} = 3`, the problem type is :math:`BAx = \lambda x`.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` and ``b`` store the upper
-   triangular part of :math:`A` and :math:`B`.
-
-   If ``upper_lower = uplo::lower``, ``a`` and ``b`` store the lower
-   triangular part of :math:`A` and :math:`B`.
-
-n
-   The order of the matrices :math:`A` and :math:`B` :math:`(0 \le n)`.
-
-a
-   Pointer to array of size ``(lda,*)`` containing the upper or lower triangle
-   of the symmetric matrix :math:`A`, as specified by ``upper_lower``. The
-   second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1, n)`.
-
-b
-   Pointer to array of size ``(ldb,*)`` containing the upper or lower triangle
-   of the symmetric matrix :math:`B`, as specified by ``upper_lower``. The
-   second dimension of ``b`` must be at least :math:`\max(1, n)`.
-
-ldb
-   The leading dimension of ``b``; at least :math:`\max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_sygvd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   On exit, if ``jobz = job::vec``, then if :math:`\text{info} = 0`, ``a``
-   contains the matrix :math:`Z` of eigenvectors. The eigenvectors are
-   normalized as follows:
-
-   if :math:`\text{itype} = 1` or :math:`2`, :math:`Z^{T}BZ = I`;
-   
-   if :math:`\text{itype} = 3`, :math:`Z^{T}B^{-1}Z = I`;
-
-   If ``jobz = job::novec``, then on exit the upper triangle (if
-   ``upper_lower = uplo::upper``) or the lower triangle (if
-   ``upper_lower = uplo::lower``) of :math:`A`, including the diagonal,
-   is destroyed.
-
-b
-   On exit, if :math:`\text{info} \le n`, the part of ``b`` containing the matrix is
-   overwritten by the triangular factor :math:`U` or :math:`L` from the
-   Cholesky factorization :math:`B` = :math:`U^{T}U` or
-   :math:`B = LL^{T}`.
-
-w
-   Pointer to array of size at least ``n``. If :math:`\text{info} = 0`, contains the
-   eigenvalues of the matrix :math:`A` in ascending order.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/sygvd_scratchpad_size.rst b/docs/domains/lapack/sygvd_scratchpad_size.rst
deleted file mode 100644
index d14ef8f61..000000000
--- a/docs/domains/lapack/sygvd_scratchpad_size.rst
+++ /dev/null
@@ -1,92 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_sygvd_scratchpad_size:
-
-sygvd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_sygvd` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``sygvd_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_sygvd` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-sygvd_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda, std::int64_t ldb) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_sygvd` function will be performed.
-
-itype
-   Must be 1 or 2 or 3. Specifies the problem type to be solved:
-
-   if :math:`\text{itype} = 1`, the problem type is :math:`Ax = \lambda Bx`;
-
-   if :math:`\text{itype} = 2`, the problem type is :math:`ABx = \lambda x`;
-
-   if :math:`\text{itype} = 3`, the problem type is :math:`BAx = \lambda x`.
-
-jobz
-   Must be ``job::novec`` or ``job::vec``.
-
-   If ``jobz = job::novec``, then only eigenvalues are computed.
-
-   If ``jobz = job::vec``, then eigenvalues and eigenvectors are
-   computed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` and ``b`` store the upper
-   triangular part of :math:`A` and :math:`B`.
-
-   If ``upper_lower = uplo::lower``, ``a`` and ``b`` store the lower
-   triangular part of :math:`A` and :math:`B`.
-
-n
-   The order of the matrices :math:`A` and :math:`B` :math:`(0 \le n)`.
-
-lda
-   The leading dimension of ``a``.
-
-ldb
-   The leading dimension of ``b``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_sygvd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/sytrd.rst b/docs/domains/lapack/sytrd.rst
deleted file mode 100644
index 2df61c2b6..000000000
--- a/docs/domains/lapack/sytrd.rst
+++ /dev/null
@@ -1,205 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_sytrd:
-
-sytrd
-=====
-
-Reduces a real symmetric matrix to tridiagonal form.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``sytrd`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-
-The routine reduces a real symmetric matrix :math:`A` to symmetric
-tridiagonal form :math:`T` by an orthogonal similarity transformation:
-:math:`A = QTQ^T`. The orthogonal matrix :math:`Q` is not formed explicitly
-but is represented as a product of :math:`n-1` elementary reflectors.
-Routines are provided for working with :math:`Q` in this representation.
-
-sytrd (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void sytrd(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &d, sycl::buffer<T,1> &e, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper
-   triangular part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower
-   triangular part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-a
-   The buffer ``a``, size ``(lda,*)``. Contains the upper or lower
-   triangle of the symmetric matrix :math:`A`, as specified by
-   ``upper_lower``.
-
-   The second dimension of ``a`` must be at least :math:`\max(1,n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_sytrd_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   On exit,
-
-   if ``upper_lower = uplo::upper``, the diagonal and first
-   superdiagonal of :math:`A` are overwritten by the corresponding
-   elements of the tridiagonal matrix :math:`T`, and the elements above
-   the first superdiagonal, with the buffer ``tau``, represent the
-   orthogonal matrix :math:`Q` as a product of elementary reflectors;
-
-   if ``upper_lower = uplo::lower``, the diagonal and first
-   subdiagonal of :math:`A` are overwritten by the corresponding elements
-   of the tridiagonal matrix :math:`T`, and the elements below the first
-   subdiagonal, with the buffer ``tau``, represent the orthogonal matrix
-   :math:`Q` as a product of elementary reflectors.
-
-d
-   Buffer containing the diagonal elements of the matrix :math:`T`. The
-   dimension of ``d`` must be at least :math:`\max(1, n)`.
-
-e
-   Buffer containing the off diagonal elements of the matrix :math:`T`.
-   The dimension of ``e`` must be at least :math:`\max(1, n-1)`.
-
-tau
-   Buffer, size at least :math:`\max(1, n)`. Stores :math:`(n-1)` scalars that
-   define elementary reflectors in decomposition of the orthogonal
-   matrix :math:`Q` in a product of :math:`n-1` elementary reflectors.
-   :math:`\tau(n)` is used as workspace.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-sytrd (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *d, T *e, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper
-   triangular part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower
-   triangular part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-a
-   The pointer to matrix :math:`A`, size ``(lda,*)``. Contains the upper or lower
-   triangle of the symmetric matrix :math:`A`, as specified by
-   ``upper_lower``.
-   The second dimension of ``a`` must be at least :math:`\max(1,n)`.
-
-lda
-   The leading dimension of ``a``; at least :math:`\max(1,n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_sytrd_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-   
-a
-   On exit,
-
-   if ``upper_lower = uplo::upper``, the diagonal and first
-   superdiagonal of :math:`A` are overwritten by the corresponding
-   elements of the tridiagonal matrix :math:`T`, and the elements above
-   the first superdiagonal, with the array ``tau``, represent the
-   orthogonal matrix :math:`Q` as a product of elementary reflectors;
-
-   if ``upper_lower = uplo::lower``, the diagonal and first
-   subdiagonal of :math:`A` are overwritten by the corresponding elements
-   of the tridiagonal matrix :math:`T`, and the elements below the first
-   subdiagonal, with the array ``tau``, represent the orthogonal matrix
-   :math:`Q` as a product of elementary reflectors.
-
-d
-   Pointer to diagonal elements of the matrix :math:`T`. The
-   dimension of ``d`` must be at least :math:`\max(1, n)`.
-
-e
-   Pointer to off diagonal elements of the matrix :math:`T`.
-   The dimension of ``e`` must be at least :math:`\max(1, n-1)`.
-
-tau
-   Pointer to array of size at least :math:`\max(1, n)`. Stores :math:`(n-1)` scalars that
-   define elementary reflectors in decomposition of the orthogonal
-   matrix :math:`Q` in a product of :math:`n-1` elementary reflectors.
-   :math:`\tau(n)` is used as workspace.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/sytrd_scratchpad_size.rst b/docs/domains/lapack/sytrd_scratchpad_size.rst
deleted file mode 100644
index b3db401f1..000000000
--- a/docs/domains/lapack/sytrd_scratchpad_size.rst
+++ /dev/null
@@ -1,72 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_sytrd_scratchpad_size:
-
-sytrd_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_sytrd` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``sytrd_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double``
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_sytrd` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-sytrd_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_sytrd` function will be performed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``.
-
-   If ``upper_lower = uplo::upper``, ``a`` stores the upper triangular
-   part of :math:`A`.
-
-   If ``upper_lower = uplo::lower``, ``a`` stores the lower triangular
-   part of :math:`A`.
-
-n
-   The order of the matrix :math:`A` :math:`(0 \le n)`.
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_sytrd` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/sytrf.rst b/docs/domains/lapack/sytrf.rst
deleted file mode 100644
index 0baea1476..000000000
--- a/docs/domains/lapack/sytrf.rst
+++ /dev/null
@@ -1,166 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_sytrf:
-
-sytrf
-=====
-
-Computes the Bunch-Kaufman factorization of a symmetric matrix.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``sytrf`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine computes the factorization of a real/complex symmetric
-matrix :math:`A` using the Bunch-Kaufman diagonal pivoting method. The
-form of the factorization is:
-
--  if ``upper_lower=uplo::upper``, :math:`A` = :math:`UDU^{T}`
-
--  if ``upper_lower=uplo::lower``, :math:`A` = :math:`LDL^{T}`
-
-where :math:`A` is the input matrix, :math:`U` and :math:`L` are products of
-permutation and triangular matrices with unit diagonal (upper
-triangular for :math:`U` and lower triangular for :math:`L`), and :math:`D` is a
-symmetric block-diagonal matrix with :math:`1 \times 1` and :math:`2 \times 2` diagonal
-blocks. :math:`U` and :math:`L` have :math:`2 \times 2` unit diagonal blocks
-corresponding to the :math:`2 \times 2` blocks of :math:`D`.
-
-sytrf (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void sytrf(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<std::int64_t,1> &ipiv, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of    :math:`A` is stored and how :math:`A` is factored:
-
-      If   ``upper_lower=uplo::upper``, the buffer ``a`` stores the upper triangular   part of the matrix :math:`A`, and :math:`A` is factored as :math:`UDU^T`.
-
-      If ``upper_lower=uplo::lower``, the buffer ``a`` stores   the lower triangular part of the matrix :math:`A`, and :math:`A` is factored   as :math:`LDL^T`.
-
-n
-   The order of matrix :math:`A` (:math:`0 \le n`).
-
-a
-   The buffer ``a``, size :math:`\max(1,lda \cdot n)`. The buffer ``a``    contains either the upper or the lower triangular part of the matrix   :math:`A` (see ``upper_lower``). The second dimension of ``a`` must be at   least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_sytrf_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   The upper or lower triangular part of a is overwritten by    details of the block-diagonal matrix :math:`D` and the multipliers used   to obtain the factor :math:`U` (or :math:`L`).
-
-ipiv
-   Buffer, size at least :math:`\max(1, n)`. Contains details of    the interchanges and the block structure of :math:`D`. If   :math:`\text{ipiv}(i)=k>0`, then :math:`d_{ii}` is a :math:`1 \times 1` block, and the   :math:`i`-th row and column of :math:`A` was interchanged with the :math:`k`-th   row and column.
-
-      If ``upper_lower=oneapi::mkl::uplo::upper``   and :math:`\text{ipiv}(i)=\text{ipiv}(i-1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in   rows/columns :math:`i` and :math:`i-1`, and (:math:`i-1`)-th row and column of   :math:`A` was interchanged with the :math:`m`-th row and   column.
-
-      If ``upper_lower=oneapi::mkl::uplo::lower`` and   :math:`\text{ipiv}(i)=\text{ipiv}(i+1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in   rows/columns :math:`i` and :math:`i+1`, and (:math:`i+1`)-th row and column   of :math:`A` was interchanged with the :math:`m`-th row and column.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-sytrf (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of    :math:`A` is stored and how :math:`A` is factored:
-
-      If   ``upper_lower=uplo::upper``, the array ``a`` stores the upper triangular   part of the matrix :math:`A`, and :math:`A` is factored as :math:`UDU^T`.
-
-      If ``upper_lower=uplo::lower``, the array ``a`` stores   the lower triangular part of the matrix :math:`A`, and :math:`A` is factored   as :math:`LDL^T`.
-
-n
-   The order of matrix :math:`A` (:math:`0 \le n`).
-
-a
-   The pointer to :math:`A`, size :math:`\max(1,\text{lda} \cdot n)`, containing either the upper or the lower triangular part of the matrix   :math:`A` (see ``upper_lower``). The second dimension of ``a`` must be at   least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_sytrf_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   The upper or lower triangular part of a is overwritten by    details of the block-diagonal matrix :math:`D` and the multipliers used   to obtain the factor :math:`U` (or :math:`L`).
-
-ipiv
-   Pointer to array of size at least :math:`\max(1, n)`. Contains details of    the interchanges and the block structure of :math:`D`. If   :math:`\text{ipiv}(i)=k>0`, then :math:`d_{ii}` is a :math:`1 \times 1` block, and the   :math:`i`-th row and column of :math:`A` was interchanged with the :math:`k`-th   row and column.
-
-      If ``upper_lower=oneapi::mkl::uplo::upper``   and :math:`\text{ipiv}(i)=\text{ipiv}(i-1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in   rows/columns :math:`i` and :math:`i-1`, and (:math:`i-1`)-th row and column of   :math:`A` was interchanged with the :math:`m`-th row and   column.
-      
-      If ``upper_lower=oneapi::mkl::uplo::lower`` and   :math:`\text{ipiv}(i)=\text{ipiv}(i+1)=-m<0`, then :math:`D` has a :math:`2 \times 2` block in   rows/columns :math:`i` and :math:`i+1`, and (:math:`i+1`)-th row and column   of :math:`A` was interchanged with the :math:`m`-th row and column.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/sytrf_scratchpad_size.rst b/docs/domains/lapack/sytrf_scratchpad_size.rst
deleted file mode 100644
index 5b56c6385..000000000
--- a/docs/domains/lapack/sytrf_scratchpad_size.rst
+++ /dev/null
@@ -1,77 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_sytrf_scratchpad_size:
-
-sytrf_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_sytrf` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``sytrf_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-  
-        * -  T 
-        * -  ``float`` 
-        * -  ``double`` 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_sytrf` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-sytrf_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_sytrf` function will be performed.
-
-upper_lower
-   Indicates whether the upper or lower triangular part of :math:`A` is
-   stored and how :math:`A` is factored:
-
-   If ``upper_lower=uplo::upper``, the buffer ``a`` stores the
-   upper triangular part of the matrix :math:`A`, and :math:`A` is
-   factored as :math:`UDU^T`.
-
-   If ``upper_lower=uplo::lower``, the buffer ``a`` stores the
-   lower triangular part of the matrix :math:`A`, and :math:`A` is
-   factored as :math:`LDL^T`.
-
-n
-   The order of the matrix :math:`A` (:math:`0 \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_sytrf` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/trtrs.rst b/docs/domains/lapack/trtrs.rst
deleted file mode 100644
index 8f980465b..000000000
--- a/docs/domains/lapack/trtrs.rst
+++ /dev/null
@@ -1,197 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_trtrs:
-
-trtrs
-=====
-
-Solves a system of linear equations with a triangular coefficient
-matrix, with multiple right-hand sides.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``trtrs`` supports the following precisions.
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -  T 
-         * -  ``float`` 
-         * -  ``double`` 
-         * -  ``std::complex<float>`` 
-         * -  ``std::complex<double>`` 
-
-The routine solves for :math:`X` the following systems of linear
-equations with a triangular matrix :math:`A`, with multiple right-hand
-sides stored in :math:`B`:
-
-    .. list-table::
-       :header-rows: 1
- 
-       * -     :math:`AX = B`
-         -
-         -     if ``transa`` =\ ``transpose::nontrans``,
-       * -     \ :math:`A^TX = B`\
-         -
-         -     if ``transa`` =\ ``transpose::trans``,
-       * -     :math:`A^HX = B`
-         -
-         -     if ``transa`` =\ ``transpose::conjtrans`` (for complex    matrices only).
-
-trtrs (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void trtrs(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &b, std::int64_t ldb, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether :math:`A` is upper or lower    triangular:
-
-      If upper_lower = ``uplo::upper``, then   :math:`A` is upper triangular.
-
-      If upper_lower =   ``uplo::lower``, then :math:`A` is lower triangular.
-
-transa
-   If transa = ``transpose::nontrans``, then    :math:`AX = B` is solved for :math:`X`.
-
-   If   transa = ``transpose::trans``, then :math:`A^{T}X = B` is solved for :math:`X`.
-
-   If transa =   ``transpose::conjtrans``, then :math:`A^{H}X = B` is   solved for :math:`X`.
-
-unit_diag
-   If unit_diag = ``diag::nonunit``, then :math:`A` is not a    unit triangular matrix.
-
-   If unit_diag = ``diag::unit``,   then :math:`A` is unit triangular: diagonal elements of :math:`A` are assumed   to be 1 and not referenced in the array ``a``.
-
-n
-   The order of :math:`A`; the number of rows in :math:`B`;    :math:`n \ge 0`.
-
-nrhs
-   The number of right-hand sides; :math:`\text{nrhs} \ge 0`.
-
-a
-   Buffer containing the matrix :math:`A`.      The    second dimension of ``a`` must be at least :math:`\max(1,n)`.
-
-lda
-   The leading dimension of ``a``;    :math:`\text{lda} \ge \max(1, n)`.
-
-b
-   Buffer containing the matrix :math:`B` whose columns are the    right-hand sides for the systems of equations.      The   second dimension of ``b`` must be at least :math:`\max(1,\text{nrhs})`.
-
-ldb
-   The leading dimension of ``b``; :math:`\text{ldb} \ge \max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_trtrs_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-b
-   Overwritten by the solution matrix :math:`X`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-trtrs (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t nrhs, T *a, std::int64_t lda, T *b, std::int64_t ldb, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Indicates whether :math:`A` is upper or lower    triangular:
-
-      If upper_lower = ``uplo::upper``, then   :math:`A` is upper triangular.
-
-      If upper_lower =   ``uplo::lower``, then :math:`A` is lower triangular.
-
-transa
-   If transa = ``transpose::nontrans``, then    :math:`AX = B` is solved for :math:`X`.
-
-   If   transa = ``transpose::trans``, then :math:`A^{T}X = B` is solved for :math:`X`.
-
-   If transa =   ``transpose::conjtrans``, then :math:`A^{H}X = B` is   solved for :math:`X`.
-
-unit_diag
-   If unit_diag = ``diag::nonunit``, then :math:`A` is not a    unit triangular matrix.
-
-   If unit_diag = ``diag::unit``,   then :math:`A` is unit triangular: diagonal elements of :math:`A` are assumed   to be 1 and not referenced in the array ``a``.
-
-n
-   The order of :math:`A`; the number of rows in :math:`B`;    :math:`n \ge 0`.
-
-nrhs
-   The number of right-hand sides; :math:`\text{nrhs} \ge 0`.
-
-a
-   Array containing the matrix :math:`A`.      The    second dimension of ``a`` must be at least :math:`\max(1,n)`.
-
-lda
-   The leading dimension of ``a``;    :math:`\text{lda} \ge \max(1, n)`.
-
-b
-   Array containing the matrix :math:`B` whose columns are the    right-hand sides for the systems of equations.      The   second dimension of ``b`` must be at least :math:`\max(1,\text{nrhs})`.
-
-ldb
-   The leading dimension of ``b``; :math:`\text{ldb} \ge \max(1, n)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_trtrs_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-b
-   Overwritten by the solution matrix :math:`X`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/trtrs_scratchpad_size.rst b/docs/domains/lapack/trtrs_scratchpad_size.rst
deleted file mode 100644
index 150800492..000000000
--- a/docs/domains/lapack/trtrs_scratchpad_size.rst
+++ /dev/null
@@ -1,94 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_trtrs_scratchpad_size:
-
-trtrs_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_trtrs` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``trtrs_scratchpad_size`` supports the following precisions.
-
-    .. list-table:: 
-       :header-rows: 1
-
-       * -  T 
-       * -  ``float`` 
-       * -  ``double`` 
-       * -  ``std::complex<float>`` 
-       * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_trtrs` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-trtrs_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_trtrs` function will be performed.
-
-upper_lower
-   Indicates whether :math:`A` is upper or lower    triangular:
-
-   If upper_lower = ``uplo::upper``, then   :math:`A` is upper triangular.
-
-   If upper_lower =   ``uplo::lower``, then :math:`A` is lower triangular.
-
-trans
-   Indicates the form of the equations:
-
-   If ``trans=oneapi::mkl::transpose::nontrans``, then :math:`AX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::trans``, then :math:`A^TX = B` is solved
-   for :math:`X`.
-
-   If ``trans=oneapi::mkl::transpose::conjtrans``, then :math:`A^HX = B` is
-   solved for :math:`X`.
-
-diag
-   If diag = ``oneapi::mkl::diag::nonunit``, then :math:`A` is not a    unit triangular matrix.
-
-   If diag = ``oneapi::mkl::diag::unit``,   then :math:`A` is unit triangular: diagonal elements of :math:`A` are assumed   to be 1 and not referenced in the array ``a``.
-
-n
-   The order of :math:`A`; the number of rows in :math:`B`;    :math:`n \ge 0`.
-
-nrhs
-   The number of right-hand sides (:math:`0 \le \text{nrhs}`).
-
-lda
-   The leading dimension of ``a``; :math:`\text{lda} \ge \max(1, n)`.
-
-ldb
-   The leading dimension of ``b``; :math:`\text{ldb} \ge \max(1, n)`.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_trtrs` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/ungbr.rst b/docs/domains/lapack/ungbr.rst
deleted file mode 100644
index 671d169f5..000000000
--- a/docs/domains/lapack/ungbr.rst
+++ /dev/null
@@ -1,231 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungbr:
-
-ungbr
-=====
-
-Generates the complex unitary matrix :math:`Q` or :math:`P^{H}` determined by
-:ref:`onemkl_lapack_gebrd`.
-
-.. container:: section
-
-  .. rubric:: Description
-     
-``ungbr`` supports the following precisions.
-
-      .. list-table:: 
-         :header-rows: 1
-
-         * -  T 
-         * -  ``std::complex<float>`` 
-         * -  ``std::complex<double>`` 
-
-The routine generates the whole or part of the unitary matrices :math:`Q`
-and :math:`P^{H}` formed by the routines
-:ref:`onemkl_lapack_gebrd`.
-All valid combinations of arguments are described in *Input Parameters*; in
-most cases you need the following:
-
-To compute the whole :math:`m \times m` matrix :math:`Q`, use:
-
-::
-
-   oneapi::mkl::lapack::ungbr(queue, generate::q, m, m, n, a, ...)
-
-(note that the buffer ``a`` must have at least :math:`m` columns).
-
-To form the :math:`n` leading columns of :math:`Q` if :math:`m > n`, use:
-
-::
-
-   oneapi::mkl::lapack::ungbr(queue, generate::q, m, n, n, a, ...)
-
-To compute the whole :math:`n \times n` matrix :math:`P^{T}`, use:
-
-::
-
-   oneapi::mkl::lapack::ungbr(queue, generate::p, n, n, m, a, ...)
-
-(note that the array ``a`` must have at least :math:`n` rows).
-
-To form the :math:`m` leading rows of :math:`P^{T}` if :math:`m < n`, use:
-
-::
-
-   oneapi::mkl::lapack::ungbr(queue, generate::p, m, n, m, a, ...)
-
-ungbr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void ungbr(sycl::queue &queue, oneapi::mkl::generate gen, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-gen
-   Must be ``generate::q`` or ``generate::p``.
-
-   If ``gen = generate::q``, the routine generates the matrix :math:`Q`.
-
-   If ``gen = generate::p``, the routine generates the matrix
-   :math:`P^{T}`.
-
-m
-   The number of rows in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le m)`.
-
-   If ``gen = generate::q``, :math:`m \ge n \ge \min(m, k)`.
-
-   If ``gen = generate::p``, :math:`n \ge m \ge \min(n, k)`.
-
-n
-   The number of columns in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le n)`. See ``m`` for constraints.
-
-k
-   If ``gen = generate::q``, the number of columns in the original
-   :math:`m \times k` matrix returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-   If ``gen = generate::p``, the number of rows in the original
-   :math:`k \times n` matrix returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-a
-   The buffer ``a`` as returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-lda
-   The leading dimension of ``a``.
-
-tau
-   For ``gen = generate::q``, the array ``tauq`` as returned by :ref:`onemkl_lapack_gebrd`.
-   For ``gen = generate::p``, the array ``taup`` as returned by :ref:`onemkl_lapack_gebrd`.
-
-   The dimension of ``tau`` must be at least :math:`\max(1, \min(m, k))` for
-   ``gen = generate::q``, or :math:`\max(1, \min(n, k))` for
-   ``gen = generate::p``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type :math:`T`.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ungbr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by :math:`n` leading columns of the :math:`m \times m` unitary matrix
-   :math:`Q` or :math:`P^{T}`, (or the leading rows or columns thereof)
-   as specified by ``gen``, ``m``, and ``n``.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-ungbr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate gen, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-gen
-   Must be ``generate::q`` or ``generate::p``.
-
-   If ``gen = generate::q``, the routine generates the matrix :math:`Q`.
-
-   If ``gen = generate::p``, the routine generates the matrix
-   :math:`P^{T}`.
-
-m
-   The number of rows in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le m)`.
-
-   If ``gen = generate::q``, :math:`m \ge n \ge \min(m, k)`.
-
-   If ``gen = generate::p``, :math:`n \ge m \ge \min(n, k)`.
-
-n
-   The number of columns in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le n)`. See ``m`` for constraints.
-
-k
-   If ``gen = generate::q``, the number of columns in the original
-   :math:`m \times k` matrix returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-   If ``gen = generate::p``, the number of rows in the original
-   :math:`k \times n` matrix returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-a
-   The pointer to ``a`` as returned by
-   :ref:`onemkl_lapack_gebrd`.
-
-lda
-   The leading dimension of ``a``.
-
-tau
-   For ``gen = generate::q``, the array ``tauq`` as returned by :ref:`onemkl_lapack_gebrd`.
-   For ``gen = generate::p``, the array ``taup`` as returned by :ref:`onemkl_lapack_gebrd`.
-
-   The dimension of ``tau`` must be at least :math:`\max(1, \min(m, k))` for
-   ``gen = generate::q``, or :math:`\max(1, \min(n, k))` for
-   ``gen = generate::p``.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type :math:`T`.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ungbr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   Overwritten by :math:`n` leading columns of the :math:`m \times m` unitary matrix
-   :math:`Q` or :math:`P^{T}`, (or the leading rows or columns thereof)
-   as specified by ``gen``, ``m``, and ``n``.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-         
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/ungbr_scratchpad_size.rst b/docs/domains/lapack/ungbr_scratchpad_size.rst
deleted file mode 100644
index 5a39e9cc1..000000000
--- a/docs/domains/lapack/ungbr_scratchpad_size.rst
+++ /dev/null
@@ -1,90 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungbr_scratchpad_size:
-
-ungbr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_ungbr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``ungbr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type :math:`T` the scratchpad memory to be passed to :ref:`onemkl_lapack_ungbr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-ungbr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate gen, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_ungbr` function will be performed.
-
-gen
-   Must be ``generate::q`` or ``generate::p``.
-
-   If ``gen = generate::q``, the routine generates the matrix
-   :math:`Q`.
-
-   If ``gen = generate::p``, the routine generates the matrix
-   :math:`P^{T}`.
-
-m
-   The number of rows in the matrix :math:`Q` or :math:`P^{T}` to be
-   returned :math:`(0 \le m)`.
-
-   If ``gen = generate::q``, :math:`m \ge n \ge \min(m, k)`.
-
-   If ``gen = generate::p``, :math:`n \ge m \ge \min(n, k)`.
-
-n
-   The number of columns in the matrix :math:`Q` or :math:`P^{T}` to
-   be returned :math:`(0 \le n)`. See ``m`` for constraints.
-
-k
-   If ``gen = generate::q``, the number of columns in the original
-   :math:`m \times k` matrix reduced by
-   :ref:`onemkl_lapack_gebrd`.
-
-   If ``gen = generate::p``, the number of rows in the original
-   :math:`k \times n` matrix reduced by
-   :ref:`onemkl_lapack_gebrd`.
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ungbr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/ungqr.rst b/docs/domains/lapack/ungqr.rst
deleted file mode 100644
index 044546adf..000000000
--- a/docs/domains/lapack/ungqr.rst
+++ /dev/null
@@ -1,181 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungqr:
-
-ungqr
-=====
-
-Generates the complex unitary matrix :math:`Q` of the QR factorization formed
-by :ref:`onemkl_lapack_geqrf`.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``ungqr`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine generates the whole or part of :math:`m \times m` unitary
-matrix :math:`Q` of the QR factorization formed by the routines
-:ref:`onemkl_lapack_geqrf`.
-
-Usually :math:`Q` is determined from the QR factorization of an :math:`m \times p` matrix :math:`A` with :math:`m \ge p`. To compute the whole matrix
-:math:`Q`, use:
-
-::
-
-    oneapi::mkl::lapack::ungqr(queue, m, m, p, a, lda, tau, scratchpad, scratchpad_size)
-
-To compute the leading :math:`p` columns of :math:`Q` (which form an
-orthonormal basis in the space spanned by the columns of :math:`A`):
-
-::
-
-    oneapi::mkl::lapack::ungqr(queue, m, p, p, a, lda, tau, scratchpad, scratchpad_size)
-
-To compute the matrix :math:`Q^{k}` of the QR factorization of
-the leading :math:`k` columns of the matrix :math:`A`:
-
-::
-
-    oneapi::mkl::lapack::ungqr(queue, m, m, k, a, lda, tau, scratchpad, scratchpad_size)
-
-To compute the leading :math:`k` columns of :math:`Q^{k}` (which form
-an orthonormal basis in the space spanned by the leading :math:`k`
-columns of the matrix :math:`A`):
-
-::
-
-    oneapi::mkl::lapack::ungqr(queue, m, k, k, a, lda, tau, scratchpad, scratchpad_size)
-
-ungqr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-a
-   The buffer ``a`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-lda
-   The leading dimension of ``a`` (:math:`\text{lda} \ge \max(1, m)`).
-
-tau
-   The buffer ``tau`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ungqr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by :math:`n` leading columns of the :math:`m \times m`
-   unitary matrix :math:`Q`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-ungqr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-a
-   The pointer to ``a`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-lda
-   The leading dimension of ``a`` (:math:`\text{lda} \ge \max(1, m)`).
-
-tau
-   The pointer to ``tau`` as returned by
-   :ref:`onemkl_lapack_geqrf`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ungqr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-a
-   Overwritten by :math:`n` leading columns of the :math:`m \times m`
-   unitary matrix :math:`Q`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/ungqr_batch.rst b/docs/domains/lapack/ungqr_batch.rst
deleted file mode 100644
index 0d69e33b8..000000000
--- a/docs/domains/lapack/ungqr_batch.rst
+++ /dev/null
@@ -1,274 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungqr_batch:
-
-ungqr_batch
-===========
-
-Generates the complex unitary matrices :math:`Q_i` of the batch of QR factorizations formed by the :ref:`onemkl_lapack_geqrf_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``ungqr_batch`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-.. _onemkl_lapack_ungqr_batch_buffer:
-
-ungqr_batch (Buffer Version)
-----------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The buffer version of ``ungqr_batch`` supports only the strided API. 
-   
-**Strided API**
-
- | The routine generates the whole or part of the :math:`m \times m` unitary matrices :math:`Q_i` of the batch of QR factorizations formed by the Strided API of the :ref:`onemkl_lapack_geqrf_batch_buffer` function.
- | Usually :math:`Q_i` is determined from the QR factorization of an :math:`m \times p` matrix :math:`A_i` with :math:`m \ge p`.
- | To compute the whole matrices :math:`Q_i`, use:
- | ``ungqr_batch(queue, m, m, p, a, ...)``
- | To compute the leading :math:`p` columns of :math:`Q_i` (which form an orthonormal basis in the space spanned by the columns of :math:`A_i`):
- | ``ungqr_batch(queue, m, p, p, a, ...)``
- | To compute the matrices :math:`Q_i^k` of the QR factorizations of leading :math:`k` columns of the matrices :math:`A_i`:
- | ``ungqr_batch(queue, m, m, k, a, ...)``
- | To compute the leading :math:`k` columns of :math:`Q_i^k` (which form an orthonormal basis in the space spanned by leading :math:`k` columns of the matrices :math:`A_i`):
- | ``ungqr_batch(queue, m, k, k, a, ...)``
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a, sycl::buffer<T> &tau, std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in the matrices :math:`A_i` (:math:`0\le n`).
-
-k
-  Number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k \le n`).
-
-a
-  Buffer resulting after call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_buffer` function.
-
-lda
-  Leading dimension of :math:`A_i` (:math:`\text{lda} \ge \max(1, m)`).
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-tau
-  Buffer resulting after call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_buffer` function.
-
-stride_tau
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size 
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_ungqr_batch_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-  Array data is overwritten by a batch of :math:`n` leading columns of the :math:`m \times m` unitary matrices :math:`Q_i`.
-
-.. _onemkl_lapack_ungqr_batch_usm:
-
-ungqr_batch (USM Version)
--------------------------
-
-.. container:: section
-
-  .. rubric:: Description
-
-The USM version of ``ungqr_batch`` supports the group API and strided API. 
-
-**Group API**
-
- | The routine generates the whole or part of the :math:`m \times m` unitary matrices :math:`Q_i` of the batch of QR factorizations formed by the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
- | Usually :math:`Q_i` is determined from the QR factorization of an :math:`m \times p` matrix :math:`A_i` with :math:`m \ge p`.
- | To compute the whole matrices :math:`Q_i`, use:
- | ``ungqr_batch(queue, m, m, p, a, ...)``
- | To compute the leading :math:`p` columns of :math:`Q_i` (which form an orthonormal basis in the space spanned by the columns of :math:`A_i`):
- | ``ungqr_batch(queue, m, p, p, a, ...)``
- | To compute the matrices :math:`Q_i^k` of the QR factorizations of leading :math:`k` columns of the matrices :math:`A_i`:
- | ``ungqr_batch(queue, m, m, k, a, ...)``
- | To compute the leading :math:`k` columns of :math:`Q_i^k` (which form an orthonormal basis in the space spanned by leading :math:`k` columns of the matrices :math:`A_i`):
- | ``ungqr_batch(queue, m, k, k, a, ...)``
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, T **a, std::int64_t *lda, T **tau, std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Array of ``group_count`` :math:`m_g` parameters as previously supplied to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-n
-  Array of ``group_count`` :math:`n_g` parameters as previously supplied to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-k
- | Array of ``group_count`` :math:`k_g` parameters as previously supplied to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
- | The number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k_g \le n_g`).
-
-a
-  Array resulting after call to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-lda
-  Array of leading dimensions of :math:`A_i` as previously supplied to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-tau
-  Array resulting after call to the Group API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Group API of the :ref:`onemkl_lapack_ungqr_batch_scratchpad_size` function.
-
-events
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-   
-a
-  Matrices pointed to by array ``a`` are overwritten by :math:`n_g` leading columns of the :math:`m_g \times m_g` unitary matrices :math:`Q_i`, where :math:`g` is an index of group of parameters corresponding to :math:`Q_i`.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Strided API**
-
- | The routine generates all or part of the :math:`m \times m` unitary matrices :math:`Q_i` of the batch of QR factorizations formed by the Strided API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
- | Usually :math:`Q_i` is determined from the QR factorization of an :math:`m \times p` matrix :math:`A_i` with :math:`m \ge p`.
- | To compute the whole matrices :math:`Q_i`, use:
- | ``ungqr_batch(queue, m, m, p, a, ...)``
- | To compute the leading :math:`p` columns of :math:`Q_i` (which form an orthonormal basis in the space spanned by the columns of :math:`A_i`):
- | ``ungqr_batch(queue, m, p, p, a, ...)``
- | To compute the matrices :math:`Q_i^k` of the QR factorizations of leading :math:`k` columns of the matrices :math:`A_i`:
- | ``ungqr_batch(queue, m, m, k, a, ...)``
- | To compute the leading :math:`k` columns of :math:`Q_i^k` (which form an orthonormal basis in the space spanned by leading :math:`k` columns of the matrices :math:`A_i`):
- | ``ungqr_batch(queue, m, k, k, a, ...)``
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, std::int64_t stride_a, T *tau, std::int64_t stride_tau, std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in the matrices :math:`A_i` (:math:`0\le n`).
-
-k
-  Number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k \le n`).
-
-a
-  Array resulting after call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-lda
-  Leading dimension of :math:`A_i` (:math:`m \le \text{lda}`).
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-tau
-  Array resulting after call to the Strided API of the :ref:`onemkl_lapack_geqrf_batch_usm` function.
-
-stride_tau
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Number of problems in a batch.
-
-scratchpad
-  Scratchpad memory to be used by routine for storing intermediate results.
-
-scratchpad_size 
-  Size of scratchpad memory as a number of floating point elements of type ``T``. Size should not be less than the value returned by the Strided API of the :ref:`onemkl_lapack_ungqr_batch_scratchpad_size` function.
-
-events  
-  List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-  Array data is overwritten by a batch of :math:`n` leading columns of the :math:`m \times m` unitary matrices :math:`Q_i`.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/ungqr_batch_scratchpad_size.rst b/docs/domains/lapack/ungqr_batch_scratchpad_size.rst
deleted file mode 100644
index 6a427d7b6..000000000
--- a/docs/domains/lapack/ungqr_batch_scratchpad_size.rst
+++ /dev/null
@@ -1,123 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungqr_batch_scratchpad_size:
-
-ungqr_batch_scratchpad_size
-===========================
-
-Computes size of scratchpad memory required for the :ref:`onemkl_lapack_ungqr_batch` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``ungqr_batch_scratchpad_size`` supports the following precisions.
-
-   .. list-table:: 
-      :header-rows: 1
-
-      * -  T 
-      * -  ``std::complex<float>`` 
-      * -  ``std::complex<double>`` 
-
-**Group API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_ungqr_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Array of ``group_count`` :math:`m_g` parameters.
-
-n
-  Array of ``group_count`` :math:`n_g` parameters.
-
-k
- | Array of ``group_count`` :math:`k_g` parameters.
- | Number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k_g \le n_g`).
- 
-lda
-  Array of leading dimensions of :math:`A_i`.
-
-group_count
-  Number of groups of parameters. Must be at least 0.
-
-group_sizes
-  Array of ``group_count`` integers. Array element with index :math:`g` specifies the number of problems to solve for each of the groups of parameters :math:`g`. So the total number of problems to solve, ``batch_size``, is a sum of all parameter group sizes.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Group API of the :ref:`onemkl_lapack_ungqr_batch` function.
-
-**Strided API**
-
-Computes the number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_ungqr_batch` function.
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size)
-    };
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-
-queue
-  Device queue where calculations will be performed.
-
-m
-  Number of rows in the matrices :math:`A_i` (:math:`0 \le m`).
-
-n
-  Number of columns in the matrices :math:`A_i` (:math:`0 \le n`).
-
-k
-  Number of elementary reflectors whose product defines the matrices :math:`Q_i` (:math:`0 \le k \le n`).
-
-lda
-  Leading dimension of :math:`A_i` (:math:`m \le \text{lda}`).
-
-stride_a
-  Stride between the beginnings of matrices :math:`A_i` inside the batch array ``a``.
-
-stride_tau
-  Stride between the beginnings of arrays :math:`\tau_i` inside the array ``tau``.
-
-batch_size
-  Number of problems in a batch.
-
-.. container:: section
-   
-  .. rubric:: Return Values
-
-Number of elements of type ``T`` the scratchpad memory should be able to hold to be passed to the Strided API of the :ref:`onemkl_lapack_ungqr_batch` function.
-
-**Parent topic:** :ref:`onemkl_lapack-like-extensions-routines`
-
diff --git a/docs/domains/lapack/ungqr_scratchpad_size.rst b/docs/domains/lapack/ungqr_scratchpad_size.rst
deleted file mode 100644
index 7fed35d15..000000000
--- a/docs/domains/lapack/ungqr_scratchpad_size.rst
+++ /dev/null
@@ -1,70 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungqr_scratchpad_size:
-
-ungqr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_ungqr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``ungqr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ungqr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-ungqr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_ungqr` function will be performed.
-
-m
-   The number of rows in the matrix :math:`A` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`A` (:math:`0 \le n \le m`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ungqr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines` 
-
-
diff --git a/docs/domains/lapack/ungtr.rst b/docs/domains/lapack/ungtr.rst
deleted file mode 100644
index 764a10f96..000000000
--- a/docs/domains/lapack/ungtr.rst
+++ /dev/null
@@ -1,153 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungtr:
-
-ungtr
-=====
-
-Generates the complex unitary matrix :math:`Q` determined by
-:ref:`onemkl_lapack_hetrd`.
-
-.. container:: section
-
-  .. rubric:: Description
-      
-``ungtr`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine explicitly generates the :math:`n \times n` unitary matrix
-:math:`Q` formed by :ref:`onemkl_lapack_hetrd` when
-reducing a complex Hermitian matrix :math:`A` to tridiagonal form:
-:math:`A = QTQ^H`. Use this routine after a call to
-:ref:`onemkl_lapack_hetrd`.
-
-ungtr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void ungtr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_hetrd`.
-
-n
-   The order of the matrix :math:`Q` :math:`(0 \le n)`.
-
-a
-   The buffer ``a`` as returned by
-   :ref:`onemkl_lapack_hetrd`. The
-   second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-tau
-   The buffer ``tau`` as returned by
-   :ref:`onemkl_lapack_hetrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1, n-1)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ungtr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by the unitary matrix :math:`Q`.
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-ungtr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   The queue where the routine should be executed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_hetrd`.
-
-n
-   The order of the matrix :math:`Q` :math:`(0 \le n)`.
-
-a
-   The pointer to ``a`` as returned by
-   :ref:`onemkl_lapack_hetrd`. The
-   second dimension of ``a`` must be at least :math:`\max(1, n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le \text{lda})`.
-
-tau
-   The pointer to ``tau`` as returned by
-   :ref:`onemkl_lapack_hetrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1, n-1)`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_ungtr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-a
-   Overwritten by the unitary matrix :math:`Q`.
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
-
diff --git a/docs/domains/lapack/ungtr_scratchpad_size.rst b/docs/domains/lapack/ungtr_scratchpad_size.rst
deleted file mode 100644
index 6b91bbbaa..000000000
--- a/docs/domains/lapack/ungtr_scratchpad_size.rst
+++ /dev/null
@@ -1,67 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_ungtr_scratchpad_size:
-
-ungtr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_ungtr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``ungtr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ungtr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-ungtr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t lda) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_ungtr` function will be performed.
-
-upper_lower
-   Must be ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_hetrd`.
-
-n
-   The order of the matrix :math:`Q` :math:`(0 \le n)`.
-
-lda
-   The leading dimension of ``a`` :math:`(n \le lda)`.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_ungtr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/lapack/unmqr.rst b/docs/domains/lapack/unmqr.rst
deleted file mode 100644
index 0515be510..000000000
--- a/docs/domains/lapack/unmqr.rst
+++ /dev/null
@@ -1,207 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_unmqr:
-
-unmqr
-=====
-
-Multiplies a complex matrix by the unitary matrix :math:`Q` of the QR
-factorization formed by :ref:`onemkl_lapack_geqrf`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``unmqr`` supports the following precisions.
-
-    .. list-table::
-       :header-rows: 1
-
-       * -  T
-       * -  ``std::complex<float>``
-       * -  ``std::complex<double>``
-
-The routine multiplies a rectangular complex :math:`m \times n` matrix :math:`C` by
-:math:`Q` or :math:`Q^H`, where :math:`Q` is the complex unitary matrix defined
-as a product of :math:`k` elementary reflectors :math:`H(i)` of order :math:`n`:
-:math:`Q = H(1)H(2) ... H(k)` as returned by the QR factorization routine
-:ref:`onemkl_lapack_geqrf`.
-
-Depending on the parameters ``side`` and ``trans``, the routine can form one of
-the matrix products :math:`QC`, :math:`Q^HC`, :math:`CQ`, or :math:`CQ^H`
-(overwriting the result over :math:`C`).
-
-unmqr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &c, std::int64_t ldc, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{H}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{H}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::conjtrans``, the routine multiplies :math:`C`
-    by :math:`Q^{H}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q` 
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The buffer ``a`` as returned by :ref:`onemkl_lapack_geqrf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The buffer ``tau`` as returned by :ref:`onemkl_lapack_geqrf`.
-
-c
-    The buffer ``c`` contains the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by the
-    :ref:`onemkl_lapack_unmqr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{H}C`, :math:`CQ`, or
-    :math:`CQ^H` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-unmqr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{H}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{H}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::conjtrans``, the routine multiplies :math:`C`
-    by :math:`Q^{H}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q`
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The pointer to ``a`` as returned by :ref:`onemkl_lapack_geqrf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The pointer to ``tau`` as returned by :ref:`onemkl_lapack_geqrf`.
-
-c
-    The pointer ``c`` points to the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by
-    :ref:`onemkl_lapack_unmqr_scratchpad_size` function.
-
-events
-    List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{H}C`, :math:`CQ`, or
-    :math:`CQ^{H}` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
diff --git a/docs/domains/lapack/unmqr_scratchpad_size.rst b/docs/domains/lapack/unmqr_scratchpad_size.rst
deleted file mode 100644
index a95127507..000000000
--- a/docs/domains/lapack/unmqr_scratchpad_size.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_unmqr_scratchpad_size:
-
-unmqr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_unmqr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``unmqr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_unmqr` function should be able to hold.
-Calls to this routine must specify the template parameter
-explicitly.
-
-unmqr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-         
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_unmqr` function will be performed.
-
-side
-   If ``side=oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{H}` is
-   applied to :math:`C` from the left.
-
-   If ``side=oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{H}` is
-   applied to :math:`C` from the right.
-
-trans
-   If ``trans=oneapi::mkl::transpose::nontrans``, the routine multiplies
-   :math:`C` by :math:`Q`.
-
-   If ``trans=oneapi::mkl::transpose::conjtrans``, the routine multiplies
-   :math:`C` by :math:`Q^H`.
-
-m
-   The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`C` (:math:`0 \le n \le m`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-lda
-   The leading dimension of ``a``.
-
-ldc
-   The leading dimension of ``c``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_unmqr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/unmrq.rst b/docs/domains/lapack/unmrq.rst
deleted file mode 100644
index 2b3872261..000000000
--- a/docs/domains/lapack/unmrq.rst
+++ /dev/null
@@ -1,207 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_unmrq:
-
-unmrq
-=====
-
-Multiplies a complex matrix by the unitary matrix :math:`Q` of the RQ
-factorization formed by :ref:`onemkl_lapack_gerqf`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``unmrq`` supports the following precisions.
-
-    .. list-table::
-       :header-rows: 1
-
-       * -  T
-       * -  ``std::complex<float>``
-       * -  ``std::complex<double>``
-
-The routine multiplies a rectangular complex :math:`m \times n` matrix :math:`C` by
-:math:`Q` or :math:`Q^H`, where :math:`Q` is the complex unitary matrix defined
-as a product of :math:`k` elementary reflectors :math:`H(i)` of order :math:`n`:
-:math:`Q = H(1)^HH(2)^H ... H(k)^H` as returned by the RQ factorization routine
-:ref:`onemkl_lapack_gerqf`.
-
-Depending on the parameters ``side`` and ``trans``, the routine can form one of
-the matrix products :math:`QC`, :math:`Q^HC`, :math:`CQ`, or :math:`CQ^H`
-(overwriting the result over :math:`C`).
-
-unmrq (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &c, std::int64_t ldc, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{H}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{H}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::conjtrans``, the routine multiplies :math:`C`
-    by :math:`Q^{H}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q` 
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The buffer ``a`` as returned by :ref:`onemkl_lapack_gerqf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The buffer ``tau`` as returned by :ref:`onemkl_lapack_gerqf`.
-
-c
-    The buffer ``c`` contains the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by
-    :ref:`onemkl_lapack_unmrq_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{H}C`, :math:`CQ`, or
-    :math:`CQ^H` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-unmrq (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-    The queue where the routine should be executed.
-
-side
-    If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^{H}` is applied
-    to :math:`C` from the left.
-
-    If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^{H}` is
-    applied to :math:`C` from the right.
-
-trans
-    If ``trans = oneapi::mkl::transpose::nontrans``, the routine multiplies
-    :math:`C` by :math:`Q`.
-
-    If ``trans = oneapi::mkl::transpose::conjtrans``, the routine multiplies :math:`C`
-    by :math:`Q^{H}`.
-
-m
-    The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-    The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
-    The number of elementary reflectors whose product defines the
-    matrix :math:`Q`
-
-    If ``side = oneapi::mkl::side::left``, :math:`0 \le k \le m`
-
-    If ``side = oneapi::mkl::side::right``, :math:`0 \le k \le n`
-
-a
-    The pointer to ``a`` as returned by :ref:`onemkl_lapack_gerqf`.
-    The second dimension of ``a`` must be at least :math:`\max(1,k)`.
-
-lda
-    The leading dimension of ``a``.
-
-tau
-    The pointer to ``tau`` as returned by :ref:`onemkl_lapack_gerqf`.
-
-c
-    The pointer ``c`` points to the matrix :math:`C`. The second dimension of
-    ``c`` must be at least :math:`\max(1,n)`.
-
-ldc
-    The leading dimension of ``c``.
-
-scratchpad_size
-    Size of scratchpad memory as a number of floating point elements of type
-    ``T``. Size should not be less than the value returned by
-    :ref:`onemkl_lapack_unmrq_scratchpad_size` function.
-
-events
-    List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-
-c
-    Overwritten by the product :math:`QC`, :math:`Q^{H}C`, :math:`CQ`, or
-    :math:`CQ^{H}` (as specified by ``side`` and ``trans``).
-
-scratchpad
-    Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
diff --git a/docs/domains/lapack/unmrq_scratchpad_size.rst b/docs/domains/lapack/unmrq_scratchpad_size.rst
deleted file mode 100644
index 8b771db75..000000000
--- a/docs/domains/lapack/unmrq_scratchpad_size.rst
+++ /dev/null
@@ -1,79 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_unmrq_scratchpad_size:
-
-unmrq_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_unmrq` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``unmrq_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-  
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_unmrq` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-unmrq_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-         
-queue
-   Device queue where calculations by the unmrq function will be performed.
-
-side
-   If ``side = oneapi::mkl::side::left``, :math:`Q` or :math:`Q^H` is applied to :math:`C` from the left. If ``side = oneapi::mkl::side::right``, :math:`Q` or :math:`Q^H` is applied to :math:`C` from the right.
-
-trans
-   If ``trans=oneapi::mkl::transpose::nontrans``, the routine multiplies :math:`C` by :math:`Q`.
-
-   If ``trans=oneapi::mkl::transpose::conjtrans``, the routine multiplies :math:`C` by :math:`Q^H`.
-
-m
-   The number of rows in the matrix :math:`C` (:math:`0 \le m`).
-
-n
-   The number of columns in the matrix :math:`C` (:math:`0 \le n`).
-
-k
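-   The number of elementary reflectors whose product defines the matrix :math:`Q` (:math:`0 \le k \le m` if ``side = oneapi::mkl::side::left``; :math:`0 \le k \le n` if ``side = oneapi::mkl::side::right``).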
-
-lda
-   The leading dimension of ``a``.
-
-ldc
-   The leading dimension of ``c``.
-
-.. container:: section
-
-  .. rubric:: Return Value
-
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_unmrq` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-linear-equation-routines`
-
diff --git a/docs/domains/lapack/unmtr.rst b/docs/domains/lapack/unmtr.rst
deleted file mode 100644
index 0c156b4dc..000000000
--- a/docs/domains/lapack/unmtr.rst
+++ /dev/null
@@ -1,250 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_unmtr:
-
-unmtr
-=====
-
-Multiplies a complex matrix by the complex unitary matrix Q
-determined by
-:ref:`onemkl_lapack_hetrd`.
-
-.. container:: section
-
-  .. rubric:: Description
-
-``unmtr`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-The routine multiplies a complex matrix :math:`C` by :math:`Q` or
-:math:`Q^{H}`, where :math:`Q` is the unitary matrix formed by
-:ref:`onemkl_lapack_hetrd`
-when reducing a complex Hermitian matrix :math:`A` to tridiagonal form:
-:math:`A = QTQ^H`. Use this routine after a call to
-:ref:`onemkl_lapack_hetrd`.
-
-Depending on the parameters ``side`` and ``trans``, the routine can
-form one of the matrix products :math:`QC`, :math:`Q^{H}C`,
-:math:`CQ`, or :math:`CQ^{H}` (overwriting the result on :math:`C`).
-
-unmtr (Buffer Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer<T,1> &a, std::int64_t lda, sycl::buffer<T,1> &tau, sycl::buffer<T,1> &c, std::int64_t ldc, sycl::buffer<T,1> &scratchpad, std::int64_t scratchpad_size)
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-In the descriptions below, ``r`` denotes the order of :math:`Q`:
-
-.. container:: tablenoborder
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  :math:`r`\ =\ :math:`m` 
-          -  if ``side = side::left`` 
-        * -  :math:`r`\ =\ :math:`n` 
-          -  if ``side = side::right`` 
-
-queue
-   The queue where the routine should be executed.
-
-side
-   Must be either ``side::left`` or ``side::right``.
-
-   If ``side=side::left``, :math:`Q` or :math:`Q^{H}` is applied
-   to :math:`C` from the left.
-
-   If ``side=side::right``, :math:`Q` or :math:`Q^{H}` is applied
-   to :math:`C` from the right.
-
-upper_lower
-   Must be either ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_hetrd`.
-
-trans
-   Must be either ``transpose::nontrans`` or
-   ``transpose::conjtrans``.
-
-   If ``trans=transpose::nontrans``, the routine multiplies :math:`C` by
-   :math:`Q`.
-
-   If ``trans=transpose::conjtrans``, the routine multiplies :math:`C` by
-   :math:`Q^{H}`.
-
-m
-   The number of rows in the matrix :math:`C` (:math:`m \ge 0`).
-
-n
-   The number of columns in the matrix :math:`C` (:math:`n \ge 0`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-a
-   The buffer ``a`` as returned by
-   :ref:`onemkl_lapack_hetrd`.
-
-lda
-   The leading dimension of ``a`` :math:`(\max(1,r) \le \text{lda})`.
-
-tau
-   The buffer ``tau`` as returned by
-   :ref:`onemkl_lapack_hetrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1,r-1)`.
-
-c
-   The buffer ``c`` contains the matrix :math:`C`. The second dimension of ``c``
-   must be at least :math:`\max(1,n)`.
-
-ldc
-   The leading dimension of ``c`` :math:`(\max(1,m) \le \text{ldc})`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_unmtr_scratchpad_size` function.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-c
-   Overwritten by the product :math:`QC`, :math:`Q^{H}C`,
-   :math:`CQ`, or :math:`CQ^{H}` (as specified by ``side`` and
-   ``trans``).
-
-scratchpad
-   Buffer holding scratchpad memory to be used by routine for storing intermediate results.
-
-unmtr (USM Version)
-----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a, std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, const std::vector<sycl::event> &events = {})
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-      
-In the descriptions below, ``r`` denotes the order of :math:`Q`:
-
-.. container:: tablenoborder
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  :math:`r`\ =\ :math:`m` 
-          -  if ``side = side::left`` 
-        * -  :math:`r`\ =\ :math:`n` 
-          -  if ``side = side::right`` 
-
-queue
-   The queue where the routine should be executed.
-
-side
-   Must be either ``side::left`` or ``side::right``.
-
-   If ``side=side::left``, :math:`Q` or :math:`Q^{H}` is applied
-   to :math:`C` from the left.
-
-   If ``side=side::right``, :math:`Q` or :math:`Q^{H}` is applied
-   to :math:`C` from the right.
-
-upper_lower
-   Must be either ``uplo::upper`` or ``uplo::lower``. Uses the same
-   ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_hetrd`.
-
-trans
-   Must be either ``transpose::nontrans`` or
-   ``transpose::conjtrans``.
-
-   If ``trans=transpose::nontrans``, the routine multiplies :math:`C` by
-   :math:`Q`.
-
-   If ``trans=transpose::conjtrans``, the routine multiplies :math:`C` by
-   :math:`Q^{H}`.
-
-m
-   The number of rows in the matrix :math:`C` (:math:`m \ge 0`).
-
-n
-   The number of columns in the matrix :math:`C` (:math:`n \ge 0`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-a
-   The pointer to ``a`` as returned by
-   :ref:`onemkl_lapack_hetrd`.
-
-lda
-   The leading dimension of ``a`` :math:`(\max(1,r) \le \text{lda})`.
-
-tau
-   The pointer to ``tau`` as returned by
-   :ref:`onemkl_lapack_hetrd`. The
-   dimension of ``tau`` must be at least :math:`\max(1,r-1)`.
-
-c
-   The array ``c`` contains the matrix :math:`C`. The second dimension of ``c``
-   must be at least :math:`\max(1,n)`.
-
-ldc
-   The leading dimension of ``c`` :math:`(\max(1,m) \le \text{ldc})`.
-
-scratchpad_size
-   Size of scratchpad memory as a number of floating point elements of type ``T``.
-   Size should not be less than the value returned by :ref:`onemkl_lapack_unmtr_scratchpad_size` function.
-
-events
-   List of events to wait for before starting computation. Defaults to empty list.
-
-.. container:: section
-
-  .. rubric:: Output Parameters
-      
-c
-   Overwritten by the product :math:`QC`, :math:`Q^{H}C`,
-   :math:`CQ`, or :math:`CQ^{H}` (as specified by ``side`` and
-   ``trans``).
-
-scratchpad
-   Pointer to scratchpad memory to be used by routine for storing intermediate results.
-
-.. container:: section
-
-  .. rubric:: Return Values
-
-Output event to wait on to ensure computation is complete.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/lapack/unmtr_scratchpad_size.rst b/docs/domains/lapack/unmtr_scratchpad_size.rst
deleted file mode 100644
index 8ca39d3f0..000000000
--- a/docs/domains/lapack/unmtr_scratchpad_size.rst
+++ /dev/null
@@ -1,96 +0,0 @@
-.. SPDX-FileCopyrightText: 2019-2020 Intel Corporation
-..
-.. SPDX-License-Identifier: CC-BY-4.0
-
-.. _onemkl_lapack_unmtr_scratchpad_size:
-
-unmtr_scratchpad_size
-=====================
-
-Computes size of scratchpad memory required for :ref:`onemkl_lapack_unmtr` function.
-
-.. container:: section
-
-  .. rubric:: Description
-         
-``unmtr_scratchpad_size`` supports the following precisions.
-
-     .. list-table:: 
-        :header-rows: 1
-
-        * -  T 
-        * -  ``std::complex<float>`` 
-        * -  ``std::complex<double>`` 
-
-Computes the number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_unmtr` function should be able to hold.
-Calls to this routine must specify the template parameter explicitly.
-
-unmtr_scratchpad_size
----------------------
-
-.. container:: section
-
-  .. rubric:: Syntax
-
-.. code-block:: cpp
-
-    namespace oneapi::mkl::lapack {
-      template <typename T>
-      std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) 
-    }
-
-.. container:: section
-
-  .. rubric:: Input Parameters
-
-queue
-   Device queue where calculations by :ref:`onemkl_lapack_unmtr` function will be performed.
-
-side
-   Must be either ``side::left`` or ``side::right``.
-
-   If ``side=side::left``, :math:`Q` or :math:`Q^{H}` is
-   applied to :math:`C` from the left.
-
-   If ``side=side::right``, :math:`Q` or :math:`Q^{H}` is
-   applied to :math:`C` from the right.
-
-upper_lower
-   Must be either ``uplo::upper`` or ``uplo::lower``. Uses the
-   same ``upper_lower`` as supplied to
-   :ref:`onemkl_lapack_hetrd`.
-
-trans
-   Must be either ``transpose::nontrans`` or
-   ``transpose::conjtrans``.
-
-   If ``trans=transpose::nontrans``, the routine multiplies :math:`C`
-   by :math:`Q`.
-
-   If ``trans=transpose::conjtrans``, the routine multiplies :math:`C`
-   by :math:`Q^{H}`.
-
-m
-   The number of rows in the matrix :math:`C` (:math:`m \ge 0`).
-
-n
-   The number of columns in the matrix :math:`C` (:math:`n \ge 0`).
-
-k
-   The number of elementary reflectors whose product defines the
-   matrix :math:`Q` (:math:`0 \le k \le n`).
-
-lda
-   The leading dimension of ``a`` :math:`(\max(1,r) \le \text{lda})`, where :math:`r = m` if ``side=side::left`` and :math:`r = n` if ``side=side::right``.
-
-ldc
-   The leading dimension of ``c`` :math:`(\max(1,m) \le \text{ldc})`.
-
-.. container:: section
-
-  .. rubric:: Return Value
-         
-The number of elements of type ``T`` the scratchpad memory to be passed to :ref:`onemkl_lapack_unmtr` function should be able to hold.
-
-**Parent topic:** :ref:`onemkl_lapack-singular-value-eigenvalue-routines`
-
diff --git a/docs/domains/matrix-storage.rst b/docs/domains/matrix-storage.rst
deleted file mode 100644
index 656262f55..000000000
--- a/docs/domains/matrix-storage.rst
+++ /dev/null
@@ -1,581 +0,0 @@
-.. _matrix-storage:
-
-Matrix Storage
-==============
-
-
-.. container::
-
-
-   The oneMKL BLAS and LAPACK routines for DPC++ use several matrix and
-   vector storage formats. These are the same formats used in
-   traditional Fortran BLAS/LAPACK.
-
-   .. container:: section
-
-      .. rubric:: General Matrix
-         :name: general-matrix
-         :class: sectiontitle
-
-      A general matrix ``A`` of ``m`` rows and ``n`` columns with
-      leading dimension ``lda`` is represented as a one dimensional
-      array ``a`` of size of at least ``lda`` \* ``n`` if column major
-      layout is used and at least ``lda`` \* ``m`` if row major layout
-      is used.  Before entry in any BLAS function using a general
-      matrix, the leading ``m`` by ``n`` part of the array ``a`` must
-      contain the matrix ``A``. For column (respectively row) major
-      layout, the elements of each column (respectively row) are
-      contiguous in memory while the elements of each row
-      (respectively column) are at distance ``lda`` from the element
-      in the same row (respectively column) and the previous column
-      (respectively row).
-
-      Visually, the matrix
-
-      .. math::
-            
-         A = \begin{bmatrix}
-             A_{11} & A_{12} & A_{13} & \ldots & A_{1n}\\
-             A_{21} & A_{22} & A_{23} & \ldots & A_{2n}\\
-             A_{31} & A_{32} & A_{33} & \ldots & A_{3n}\\
-             \vdots & \vdots & \vdots & \ddots & \vdots\\
-             A_{m1} & A_{m2} & A_{m3} & \ldots & A_{mn}
-             \end{bmatrix}
-
-      is stored in memory as an array
-
-      - For column major layout,
-
-      .. math::
-         
-         \scriptstyle a = 
-            [\underbrace{\underbrace{A_{11},A_{21},A_{31},...,A_{m1},*,...,*}_\text{lda},
-                         \underbrace{A_{12},A_{22},A_{32},...,A_{m2},*,...,*}_\text{lda},
-                         ...,
-                         \underbrace{A_{1n},A_{2n},A_{3n},...,A_{mn},*,...,*}_\text{lda}}
-                         _\text{lda x n}]
-      
-      - For row major layout,
-
-      .. math::
-         
-         \scriptstyle a = 
-            [\underbrace{\underbrace{A_{11},A_{12},A_{13},...,A_{1n},*,...,*}_\text{lda},
-                         \underbrace{A_{21},A_{22},A_{23},...,A_{2n},*,...,*}_\text{lda},
-                         ...,
-                         \underbrace{A_{m1},A_{m2},A_{m3},...,A_{mn},*,...,*}_\text{lda}}
-                         _\text{m x lda}]
-
-   .. container:: section
-
-      .. rubric:: Triangular Matrix
-         :name: triangular-matrix
-         :class: sectiontitle
-
-      A triangular matrix ``A`` of ``n`` rows and ``n`` columns with
-      leading dimension ``lda`` is represented as a one dimensional
-      array ``a``, of a size of at least ``lda`` \* ``n``. When column
-      (respectively row) major layout is used, the elements of each
-      column (respectively row) are contiguous in memory while the
-      elements of each row (respectively column) are at distance
-      ``lda`` from the element in the same row (respectively column)
-      and the previous column (respectively row).
-
-      Before entry in any BLAS function using a triangular matrix,
-
-      -  If ``upper_lower = uplo::upper``, the leading ``n`` by ``n``
-         upper triangular part of the array ``a`` must contain the upper
-         triangular part of the matrix ``A``. The strictly lower
-         triangular part of the array ``a`` is not referenced. In other
-         words, the matrix
-
-         .. math::
-
-            A = \begin{bmatrix}
-                A_{11} & A_{12} & A_{13} & \ldots & A_{1n}\\
-                *      & A_{22} & A_{23} & \ldots & A_{2n}\\
-                *      & *      & A_{33} & \ldots & A_{3n}\\
-                \vdots & \vdots & \vdots & \ddots & \vdots\\
-                *      & *      & *      & \ldots & A_{nn}
-                \end{bmatrix}
-
-         is stored in memory as the array
-
-         - For column major layout,
-
-         .. math::
-            
-            \scriptstyle a = 
-               [\underbrace{\underbrace{A_{11},*,...,*}_\text{lda},
-                            \underbrace{A_{12},A_{22},*,...,*}_\text{lda},
-                            ...,
-                            \underbrace{A_{1n},A_{2n},A_{3n},...,A_{nn},*,...,*}_\text{lda}}
-                            _\text{lda x n}]
-
-         - For row major layout,
-
-         .. math::
-            
-            \scriptstyle a = 
-               [\underbrace{\underbrace{A_{11},A_{12},A_{13},...,A_{1n},*,...,*}_\text{lda},
-                            \underbrace{*,A_{22},A_{23},...,A_{2n},*,...,*}_\text{lda},
-                            ...,
-                            \underbrace{*,...,*,A_{nn},*,...,*}_\text{lda}}
-                            _\text{lda x n}]
-
-      -  If ``upper_lower = uplo::lower``, the leading ``n`` by ``n``
-         lower triangular part of the array ``a`` must contain the lower
-         triangular part of the matrix ``A``. The strictly upper
-         triangular part of the array ``a`` is not referenced. That is,
-         the matrix
-
-         .. math::
-
-            A = \begin{bmatrix}
-                A_{11} & *      & *      & \ldots & *     \\
-                A_{21} & A_{22} & *      & \ldots & *     \\
-                A_{31} & A_{32} & A_{33} & \ldots & *     \\
-                \vdots & \vdots & \vdots & \ddots & \vdots\\
-                A_{n1} & A_{n2} & A_{n3} & \ldots & A_{nn}
-                \end{bmatrix}
-
-         is stored in memory as the array
-
-         - For column major layout,
-      
-         .. math::
-                  
-            \scriptstyle a = 
-               [\underbrace{\underbrace{A_{11},A_{21},A_{31},...,A_{n1},*,...,*}_\text{lda},
-                            \underbrace{*,A_{22},A_{32},...,A_{n2},*,...,*}_\text{lda},
-                            ...,
-                            \underbrace{*,...,*,A_{nn},*,...,*}_\text{lda}}
-                            _\text{lda x n}]
-
-         - For row major layout,
-
-         .. math::
-                  
-            \scriptstyle a = 
-               [\underbrace{\underbrace{A_{11},*,...,*}_\text{lda},
-                            \underbrace{A_{21},A_{22},*,...,*}_\text{lda},
-                            ...,
-                            \underbrace{A_{n1},A_{n2},A_{n3},...,A_{nn},*,...,*}_\text{lda}}
-                            _\text{lda x n}]
-
-   .. container:: section
-
-      .. rubric:: Band Matrix
-         :name: band-matrix
-         :class: sectiontitle
-
-      A general band matrix ``A`` of ``m`` rows and ``n`` columns with
-      ``kl`` sub-diagonals, ``ku`` super-diagonals, and leading
-      dimension ``lda`` is represented as a one dimensional array
-      ``a`` of a size of at least ``lda`` \* ``n`` (respectively
-      ``lda`` \* ``m``) if column (respectively row) major layout is
-      used.
-
-      Before entry in any BLAS function using a general band matrix,
-      the leading (``kl`` + ``ku`` + 1) by ``n`` (respectively
-      ``m``) part of the array ``a`` must contain the matrix
-      ``A``. This matrix must be supplied column-by-column
-      (respectively row-by-row), with the main diagonal of the matrix
-      in row ``ku`` (respectively column ``kl``) of the array (0-based
-      indexing), the first super-diagonal starting at position 1
-      (respectively 0) in row (``ku`` - 1) (respectively column
-      (``kl`` + 1)), the first sub-diagonal starting at position 0
-      (respectively 1) in row (``ku`` + 1) (respectively column
-      (``kl`` - 1)), and so on. Elements in the array ``a`` that do
-      not correspond to elements in the band matrix (such as the top
-      left ``ku`` by ``ku`` triangle) are not referenced.
-
-      Visually, the matrix ``A``
-
-      .. math::
-
-         A = \left[\begin{smallmatrix}
-             A_{11}     & A_{12}     & A_{13}     & \ldots & A_{1,ku+1} & *          & \ldots     & \ldots     & \ldots & \ldots    & \ldots    & *         \\
-             A_{21}     & A_{22}     & A_{23}     & A_{24} & \ldots     & A_{2,ku+2} & *          & \ldots     & \ldots & \ldots    & \ldots    & *         \\
-             A_{31}     & A_{32}     & A_{33}     & A_{34} & A_{35}     & \ldots     & A_{3,ku+3} & *          & \ldots & \ldots    & \ldots    & *         \\
-             \vdots     & A_{42}     & A_{43}     & \ddots & \ddots     & \ddots     & \ddots     & \ddots     & *      & \ldots    & \ldots    & \vdots    \\
-             A_{kl+1,1} & \vdots     & A_{53}     & \ddots & \ddots     & \ddots     & \ddots     & \ddots     & \ddots & *         & \ldots    & \vdots    \\
-             *          & A_{kl+2,2} & \vdots     & \ddots & \ddots     & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & \vdots    \\
-             \vdots     & *          & A_{kl+3,3} & \ddots & \ddots     & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & *         \\
-             \vdots     & \vdots     & *          & \ddots & \ddots     & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & A_{n-ku,n}\\
-             \vdots     & \vdots     & \vdots     & *      & \ddots     & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & \vdots    \\
-             \vdots     & \vdots     & \vdots     & \vdots & *          & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & A_{m-2,n} \\
-             \vdots     & \vdots     & \vdots     & \vdots & \vdots     & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & A_{m-1,n} \\
-             *          & *          & *          & \ldots & \ldots     & \ldots     & *          & A_{m,m-kl} & \ldots & A_{m,n-2} & A_{m,n-1} & A_{m,n} 
-             \end{smallmatrix}\right]
-
-
-      is stored in memory as an array
-
-      - For column major layout,
-        
-      .. math::
-               
-         \scriptscriptstyle a = 
-            [\underbrace{
-             \underbrace{\underbrace{*,...,*}_\text{ku},A_{11}, A_{21},...,A_{min(kl+1,m),1},*,...,*}_\text{lda},
-             \underbrace{\underbrace{*,...,*}_\text{ku-1},A_{max(1,2-ku),2},...,A_{min(kl+2,m),2},*,...*}_\text{lda},
-             ...,
-             \underbrace{\underbrace{*,...,*}_\text{max(0,ku-n+1)},A_{max(1,n-ku),n},...,A_{min(kl+n,m),n},*,...*}_\text{lda}
-             }_\text{lda x n}]
-
-
-      - For row major layout,
-
-      .. math::
-               
-         \scriptscriptstyle a = 
-            [\underbrace{
-             \underbrace{\underbrace{*,...,*}_\text{kl},A_{11}, A_{12},...,A_{1,min(ku+1,n)},*,...,*}_\text{lda},
-             \underbrace{\underbrace{*,...,*}_\text{kl-1},A_{2,max(1,2-kl)},...,A_{2,min(ku+2,n)},*,...*}_\text{lda},
-             ...,
-             \underbrace{\underbrace{*,...,*}_\text{max(0,kl-m+1)},A_{m,max(1,m-kl)},...,A_{m,min(ku+m,n)},*,...*}_\text{lda}
-             }_\text{lda x m}]
-
-      The following program segment transfers a band matrix from
-      conventional full matrix storage (variable ``matrix``, with
-      leading dimension ``ldm``) to band storage (variable ``a``, with
-      leading dimension ``lda``):
-
-
-      - Using matrices stored with column major layout,
-        
-      ::
-
-         for (j = 0; j < n; j++) {
-             k = ku - j;
-             for (i = max(0, j - ku); i < min(m, j + kl + 1); i++) {
-                 a[(k + i) + j * lda] = matrix[i + j * ldm];
-             }
-         }
-
-      - Using matrices stored with row major layout,
-
-      ::
-
-         for (i = 0; i < m; i++) {
-             k = kl - i;
-             for (j = max(0, i - kl); j < min(n, i + ku + 1); j++) {
-                 a[(k + j) + i * lda] = matrix[j + i * ldm];
-             }
-         }
-        
-
-   .. container:: section
-
-      .. rubric:: Triangular Band Matrix
-         :name: triangular-band-matrix
-         :class: sectiontitle
-
-      A triangular band matrix ``A`` of ``n`` rows and ``n`` columns
-      with ``k`` sub/super-diagonals and leading dimension ``lda`` is
-      represented as a one dimensional array ``a`` of size at least
-      ``lda`` \* ``n``.
-
-      Before entry in any BLAS function using a triangular band matrix,
-
-
-      - If ``upper_lower = uplo::upper``, the leading (``k`` + 1) by ``n``
-        part of the array ``a`` must contain the upper
-        triangular band part of the matrix ``A``. When using column
-        (respectively row) major layout, this matrix must be supplied column-by-column
-        (respectively row-by-row) with the main diagonal of the
-        matrix in row (``k``) (respectively column 0) of the array,
-        the first super-diagonal starting at position 1
-        (respectively 0) in row (``k`` - 1) (respectively column 1),
-        and so on. Elements in the array ``a`` that do not correspond
-        to elements in the triangular band matrix (such as the top
-        left ``k`` by ``k`` triangle) are not referenced.
-
-        Visually, the matrix
-
-        .. math::
-
-           A = \left[\begin{smallmatrix}
-               A_{11}     & A_{12}     & A_{13}     & \ldots & A_{1,k+1} & *          & \ldots      & \ldots     & \ldots & \ldots    & \ldots    & *         \\
-               *          & A_{22}     & A_{23}     & A_{24} & \ldots     & A_{2,k+2} & *           & \ldots     & \ldots & \ldots    & \ldots    & *         \\
-               \vdots     & *          & A_{33}     & A_{34} & A_{35}     & \ldots     & A_{3,k+3}  & *          & \ldots & \ldots    & \ldots    & *         \\
-               \vdots     & \vdots     & *          & \ddots & \ddots     & \ddots     & \ddots     & \ddots     & *      & \ldots    & \ldots    & \vdots    \\
-               \vdots     & \vdots     & \vdots     & \ddots & \ddots     & \ddots     & \ddots     & \ddots     & \ddots & *         & \ldots    & \vdots    \\
-               \vdots     & \vdots     & \vdots     & \vdots & \ddots     & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & \vdots    \\
-               \vdots     & \vdots     & \vdots     & \vdots & \vdots     & \ddots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & *         \\
-               \vdots     & \vdots     & \vdots     & \vdots & \vdots     & \vdots     & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & A_{n-k,n}\\
-               \vdots     & \vdots     & \vdots     & \vdots & \vdots     & \vdots     & \vdots     & \ddots     & \ddots & \ddots    & \ddots    & \vdots    \\
-               \vdots     & \vdots     & \vdots     & \vdots & \vdots     & \vdots     & \vdots     & \vdots     & \ddots & \ddots    & \ddots    & A_{n-2,n} \\
-               \vdots     & \vdots     & \vdots     & \vdots & \vdots     & \vdots     & \vdots     & \vdots     & \vdots & \ddots    & \ddots    & A_{n-1,n} \\
-               *          & *          & *          & \ldots & \ldots     & \ldots     & \ldots     & \ldots     & \ldots & \ldots    & *         & A_{n,n} 
-               \end{smallmatrix}\right]
-
-        is stored as an array
-
-      .. container:: fignone
-                            
-         - For column major layout,
-                
-            .. math::
-                     
-               \scriptstyle a = 
-                  [\underbrace{
-                   \underbrace{\underbrace{*,...,*}_\text{k},A_{11},*,...,*}_\text{lda},
-                   \underbrace{\underbrace{*,...,*}_\text{k-1},A_{max(1,2-k),2},...,A_{2,2},*,...*}_\text{lda},
-                   ...,
-                   \underbrace{\underbrace{*,...,*}_\text{max(0,k-n+1)},A_{max(1,n-k),n},...,A_{n,n},*,...*}_\text{lda}
-                   }_\text{lda x n}]
-
-
-         - For row major layout,
-            
-            .. math::
-                     
-               \scriptstyle a = 
-                  [\underbrace{
-                   \underbrace{A_{11},A_{12},...,A_{1,min(k+1,n)},*,...,*}_\text{lda},
-                   \underbrace{A_{2,2},...,A_{2,min(k+2,n)},*,...,*}_\text{lda},
-                   ...,
-                   \underbrace{A_{n,n},*,...*}_\text{lda}
-                   }_\text{lda x n}]
-
-         The following program segment transfers a band matrix from
-         conventional full matrix storage (variable ``matrix``, with
-         leading dimension ``ldm``) to band storage (variable ``a``,
-         with leading dimension ``lda``):
-
-         - Using matrices stored with column major layout,
-
-         ::
-
-            for (j = 0; j < n; j++) {
-                m = k - j;
-                for (i = max(0, j - k); i <= j; i++) {
-                    a[(m + i) + j * lda] = matrix[i + j * ldm];
-                }
-            }
-
-         - Using matrices stored with row major layout,
-
-         ::
-
-            for (i = 0; i < n; i++) {
-                m = -i;
-                for (j = i; j < min(n, i + k + 1); j++) {
-                    a[(m + j) + i * lda] = matrix[j + i * ldm];
-                }
-            }
-
-      - If ``upper_lower = uplo::lower``, the leading (``k`` + 1) by ``n``
-        part of the array ``a`` must contain the lower triangular
-        band part of the matrix ``A``. This matrix must be supplied
-        column-by-column with the main diagonal of the matrix in row 0
-        of the array, the first sub-diagonal starting at position 0 in
-        row 1, and so on. Elements in the array ``a`` that do not
-        correspond to elements in the triangular band matrix (such as
-        the bottom right ``k`` by ``k`` triangle) are not referenced.
-
-        That is, the matrix
-
-        .. math::
-
-           A = \left[\begin{smallmatrix}
-               A_{11}     & *          & \ldots     & \ldots & \ldots     & \ldots    & \ldots     & \ldots     & \ldots & \ldots    & \ldots    & *         \\
-               A_{21}     & A_{22}     & *          & \ldots & \ldots     & \ldots    & \ldots     & \ldots     & \ldots & \ldots    & \ldots    & *         \\
-               A_{31}     & A_{32}     & A_{33}     & *      & \ldots     & \ldots    & \ldots     & \ldots     & \ldots & \ldots    & \ldots    & *         \\
-               \vdots     & A_{42}     & A_{43}     & \ddots & \ddots     & \ldots    & \ldots     & \ldots     & \ldots & \ldots    & \ldots    & \vdots    \\
-               A_{k+1,1}  & \vdots     & A_{53}     & \ddots & \ddots     & \ddots    & \ldots     & \ldots     & \ldots & \ldots    & \ldots    & \vdots    \\
-               *          & A_{k+2,2}  & \vdots     & \ddots & \ddots     & \ddots    & \ddots     & \ldots     & \ldots & \ldots    & \ldots    & \vdots    \\
-               \vdots     & *          & A_{k+3,3}  & \ddots & \ddots     & \ddots    & \ddots     & \ddots     & \ldots & \ldots    & \ldots    & \vdots    \\
-               \vdots     & \vdots     & *          & \ddots & \ddots     & \ddots    & \ddots     & \ddots     & \ddots & \ldots    & \ldots    & \vdots    \\
-               \vdots     & \vdots     & \vdots     & *      & \ddots     & \ddots    & \ddots     & \ddots     & \ddots & \ddots    & \ldots    & \vdots    \\
-               \vdots     & \vdots     & \vdots     & \vdots & *          & \ddots    & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & \vdots    \\
-               \vdots     & \vdots     & \vdots     & \vdots & \vdots     & \ddots    & \ddots     & \ddots     & \ddots & \ddots    & \ddots    & *         \\
-               *          & *          & *          & \ldots & \ldots     & \ldots    & *          & A_{n,n-k}  & \ldots & A_{n,n-2} & A_{n,n-1} & A_{n,n} 
-               \end{smallmatrix}\right]
-
-
-        is stored as the array
-
-
-      .. container:: fignone
-
-         - For column major layout,
-
-           .. math::
-                    
-              \scriptstyle a = 
-                 [\underbrace{
-                  \underbrace{A_{11},A_{21},...,A_{min(k+1,n),1},*,...,*}_\text{lda},
-                  \underbrace{A_{2,2},...,A_{min(k+2,n),2},*,...,*}_\text{lda},
-                  ...,
-                  \underbrace{A_{n,n},*,...*}_\text{lda}
-                  }_\text{lda x n}]
-
-         - For row major layout,
-        
-            .. math::
-                     
-               \scriptstyle a = 
-                  [\underbrace{
-                   \underbrace{\underbrace{*,...,*}_\text{k},A_{11},*,...,*}_\text{lda},
-                   \underbrace{\underbrace{*,...,*}_\text{k-1},A_{max(1,2-k),2},...,A_{2,2},*,...*}_\text{lda},
-                   ...,
-                   \underbrace{\underbrace{*,...,*}_\text{max(0,k-n+1)},A_{max(1,n-k),n},...,A_{n,n},*,...*}_\text{lda}
-                   }_\text{lda x n}]
-
-
-         The following program segment transfers a band matrix from
-         conventional full matrix storage (variable ``matrix``, with
-         leading dimension ``ldm``) to band storage (variable ``a``,
-         with leading dimension ``lda``):
-
-         - Using matrices stored with column major layout,
-           
-         ::
-
-            for (j = 0; j < n; j++) {
-                m = -j;
-                for (i = j; i < min(n, j + k + 1); i++) {
-                    a[(m + i) + j * lda] = matrix[i + j * ldm];
-                }
-            }
-
-         - Using matrices stored with row major layout,
-
-         ::
-
-            for (i = 0; i < n; i++) {
-                m = k - i;
-                for (j = max(0, i - k); j <= i; j++) {
-                    a[(m + j) + i * lda] = matrix[j + i * ldm];
-                }
-            }
-
-
-   .. container:: section
-
-      .. rubric:: Packed Triangular Matrix
-         :name: packed-triangular-matrix
-         :class: sectiontitle
-
-      A triangular matrix ``A`` of ``n`` rows and ``n`` columns is
-      represented in packed format as a one dimensional array ``a`` of
-      size at least (``n``\ \*(``n`` + 1))/2. All elements in the upper
-      or lower part of the matrix ``A`` are stored contiguously in the
-      array ``a``.
-
-      Before entry in any BLAS function using a triangular packed
-      matrix,
-
-      - If ``upper_lower = uplo::upper``, if column (respectively row)
-        major layout is used, the first (``n``\ \*(``n`` + 1))/2
-        elements in the array ``a`` must contain the upper triangular
-        part of the matrix ``A`` packed sequentially, column by column
-        (respectively row by row) so that ``a``\ [0] contains ``A``\
-        :sub:`11`, ``a``\ [1] and ``a``\ [2] contain ``A``\ :sub:`12`
-        and ``A``\ :sub:`22` (respectively ``A``\ :sub:`13`)
-        respectively, and so on. Hence, the matrix
-
-        .. math::
-              
-           A = \begin{bmatrix}
-               A_{11} & A_{12} & A_{13} & \ldots & A_{1n}\\
-               *      & A_{22} & A_{23} & \ldots & A_{2n}\\
-               *      & *      & A_{33} & \ldots & A_{3n}\\
-               \vdots & \vdots & \vdots & \ddots & \vdots\\
-               *      & *      & *      & \ldots & A_{nn}
-               \end{bmatrix}
-
-        is stored as the array
-
-        - For column major layout,
-
-          .. math::
-             
-             \scriptstyle a = [A_{11},A_{12},A_{22},A_{13},A_{23},A_{33},...,A_{(n-1),n},A_{nn}]
-
-        - For row major layout,
-
-          .. math::
-             
-             \scriptstyle a = [A_{11},A_{12},A_{13},...,A_{1n},
-                  A_{22},A_{23},...,A_{2n},...,
-                  A_{(n-1),(n-1)},A_{(n-1),n},A_{nn}]
-
-      - If ``upper_lower = uplo::lower``, if column (respectively row)
-        major layout is used, the first (``n``\ \*(``n`` + 1))/2
-        elements in the array ``a`` must contain the lower triangular
-        part of the matrix ``A`` packed sequentially, column by column
-        (respectively row by row) so that ``a``\ [0] contains ``A``\ :sub:`11`,
-        ``a``\ [1] and ``a``\ [2] contain ``A``\ :sub:`21` and ``A``\
-        :sub:`31` (respectively ``A``\ :sub:`22`) respectively, and so
-        on. The matrix
-
-         .. math::
-               
-            A = \begin{bmatrix}
-                A_{11} & *      & *      & \ldots & *     \\
-                A_{21} & A_{22} & *      & \ldots & *     \\
-                A_{31} & A_{32} & A_{33} & \ldots & *     \\
-                \vdots & \vdots & \vdots & \ddots & \vdots\\
-                A_{n1} & A_{n2} & A_{n3} & \ldots & A_{nn}
-                \end{bmatrix}
-
-         is stored as the array
-
-         - For column major layout,
-
-          .. math::
-             
-             \scriptstyle a = [A_{11},A_{21},A_{31},...,A_{n1},
-                  A_{22},A_{32},...,A_{n2},...,
-                  A_{(n-1),(n-1)},A_{n,(n-1)},A_{nn}]
-
-         - For row major layout,
-
-          .. math::
-             
-             \scriptstyle a = [A_{11},A_{21},A_{22},A_{31},A_{32},A_{33},...,A_{n,(n-1)},A_{nn}]
-
-   .. container:: section
-
-      .. rubric:: Vector
-         :name: vector
-         :class: sectiontitle
-
-      A vector ``X`` of ``n`` elements with increment ``incx`` is
-      represented as a one dimensional array ``x`` of size at least (1 +
-      (``n`` - 1) \* abs(``incx``)).
-
-      Visually, the vector
-
-      .. math::
-            
-            X = (X_{1},X_{2}, X_{3},...,X_{n})
-
-      is stored in memory as an array
-
-
-      .. math::
-               
-         \scriptstyle x = [\underbrace{
-             \underbrace{X_{1},*,...,*}_\text{incx},
-             \underbrace{X_{2},*,...,*}_\text{incx},
-             ...,
-             \underbrace{X_{n-1},*,...,*}_\text{incx},X_{n}
-             }_\text{1 + (n-1) x incx}] \quad if \:incx \:> \:0 
-
-      .. math::
-               
-         \scriptstyle x = [\underbrace{
-             \underbrace{X_{n},*,...,*}_\text{|incx|},
-             \underbrace{X_{n-1},*,...,*}_\text{|incx|},
-             ...,
-             \underbrace{X_{2},*,...,*}_\text{|incx|},X_{1}
-             }_\text{1 + (1-n) x incx}] \quad if \:incx \:< \:0 
-
-
-
-
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index 51e4216ee..000000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,36 +0,0 @@
-..
-  Copyright 2020-2024 Intel Corporation
-
-.. _onemkl:
-
-*****************
-oneMKL Interfaces
-*****************
-
-Contents
-========
-
-.. toctree::
-   :caption: About
-   :maxdepth: 2
-
-   introduction.rst
-
-.. toctree::
-   :caption: Get Started
-   :maxdepth: 2
-
-   selecting_a_compiler.rst
-   building_the_project_with_dpcpp.rst
-   building_the_project_with_adaptivecpp.rst
-   building_and_running_tests.rst
-   using_onemkl_with_cmake.rst
-
-.. toctree::
-   :caption: Developer Reference
-   :maxdepth: 2
-   :includehidden:
-
-   onemkl-datatypes.rst
-   domains/dense_linear_algebra.rst
-   create_new_backend.rst
diff --git a/docs/introduction.rst b/docs/introduction.rst
deleted file mode 100644
index c3df7aa8a..000000000
--- a/docs/introduction.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-.. _introduction:
-
-Introduction
-============
-
-oneMKL Interfaces is an open-source implementation of oneMKL Data Parallel C++
-(DPC++) interfaces according to the `oneMKL specification <https://spec.oneapi.com/versions/latest/elements/oneMKL/source/index.html>`_
-that can work with multiple devices (backends) using device-specific
-libraries underneath.
diff --git a/docs/onemkl-datatypes.rst b/docs/onemkl-datatypes.rst
deleted file mode 100644
index 33a08e834..000000000
--- a/docs/onemkl-datatypes.rst
+++ /dev/null
@@ -1,140 +0,0 @@
-.. _onemkl_datatypes:
-
-oneMKL Defined Datatypes
-========================
-
-
-oneMKL BLAS and LAPACK for Data Parallel C++ (DPC++) introduces
-several new enumeration data types, which are type-safe versions of
-the traditional Fortran characters in BLAS and LAPACK. They are
-declared in ``types.hpp``, which is included automatically when
-you include ``mkl.hpp``. Like all oneMKL DPC++ functionality, they belong to the namespace ``oneapi::mkl``.
-
-
-Each enumeration value comes with two names: A single-character name
-(the traditional BLAS/LAPACK character) and a longer, descriptive
-name. The two names are exactly equivalent and may be used
-interchangeably.
-
-
-transpose
----------
-
-The ``transpose`` type specifies whether an input matrix should be
-transposed and/or conjugated. It can take the following values:
-
-
-.. list-table::
-   :header-rows: 1
-
-   * -  Short Name
-     -  Long Name
-     -  Description
-   * -  ``transpose::N``
-     -  ``transpose::nontrans``
-     -  Do not transpose or conjugate the matrix.
-   * -  ``transpose::T``
-     -  ``transpose::trans``
-     -  Transpose the matrix.
-   * -  ``transpose::C``
-     -  ``transpose::conjtrans``
-     -  Perform Hermitian transpose (transpose and conjugate). Only applicable to complex matrices.
-
-
-
-
-uplo
-----
-
-The ``uplo`` type specifies whether the lower or upper triangle of a triangular, symmetric, or Hermitian matrix should be accessed.
-
-It can take the following values:
-
-
-.. list-table::
-   :header-rows: 1
-
-   * -  Short Name
-     -  Long Name
-     -  Description
-   * -  ``uplo::U``
-     -  ``uplo::upper``
-     -  Access the upper triangle of the matrix.
-   * -  ``uplo::L``
-     -  ``uplo::lower``
-     -  Access the lower triangle of the matrix.
-
-
-
-
-In both cases, elements that are not in the selected triangle are not accessed or updated.
-
-
-diag
-----
-
-
-The ``diag`` type specifies the values on the diagonal of a triangular matrix. It can take the following values:
-
-
-.. list-table::
-   :header-rows: 1
-
-   * -  Short Name
-     -  Long Name
-     -  Description
-   * -  ``diag::N``
-     -  ``diag::nonunit``
-     -  The matrix is not unit triangular. The diagonal entries are stored with the matrix data.
-   * -  ``diag::U``
-     -  ``diag::unit``
-     -  The matrix is unit triangular (the diagonal entries are all 1s). The diagonal entries in the matrix data are not accessed.
-
-
-
-
-side
-----
-
-
-The ``side`` type specifies the order of matrix multiplication when one matrix has a special form (triangular, symmetric, or Hermitian):
-
-
-.. list-table::
-   :header-rows: 1
-
-   * -  Short Name
-     -  Long Name
-     -  Description
-   * -  ``side::L``
-     -  ``side::left``
-     -  The special form matrix is on the left in the multiplication.
-   * -  ``side::R``
-     -  ``side::right``
-     -  The special form matrix is on the right in the multiplication.
-
-
-offset
-------
-
-
-The ``offset`` type specifies whether the offset to apply to an output matrix is a fixed offset, a column offset, or a row offset. It can take the following values:
-
-
-.. list-table::
-   :header-rows: 1
-
-   * -  Short Name
-     -  Long Name
-     -  Description
-   * -  ``offset::F``
-     -  ``offset::fix``
-     -  The offset to apply to the output matrix is fixed; that is, all the entries in the ``C_offset`` matrix have the same value, given by the first element in the ``co`` array.
-   * -  ``offset::C``
-     -  ``offset::column``
-     -  The offset to apply to the output matrix is a column offset, that is to say all the columns in the ``C_offset`` matrix are the same and given by the elements in the ``co`` array.
-   * -  ``offset::R``
-     -  ``offset::row``
-     -  The offset to apply to the output matrix is a row offset, that is to say all the rows in the ``C_offset`` matrix are the same and given by the elements in the ``co`` array.
-
-**Parent topic:** :ref:`onemkl`
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index 8365d7241..000000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-accessible-pygments==0.0.5
-alabaster==0.7.16
-Babel==2.15.0
-beautifulsoup4==4.12.3
-certifi==2024.7.4
-charset-normalizer==3.3.2
-docutils==0.21.2
-idna==3.7
-imagesize==1.4.1
-Jinja2==3.1.4
-MarkupSafe==2.1.5
-packaging==24.0
-pydata-sphinx-theme==0.15.2
-Pygments==2.18.0
-requests==2.32.1
-snowballstemmer==2.2.0
-soupsieve==2.5
-Sphinx==7.3.7
-sphinx-book-theme==1.1.2
-sphinxcontrib-applehelp==1.0.8
-sphinxcontrib-devhelp==1.0.6
-sphinxcontrib-htmlhelp==2.0.5
-sphinxcontrib-jsmath==1.0.1
-sphinxcontrib-qthelp==1.0.7
-sphinxcontrib-serializinghtml==1.1.10
-tomli==2.0.1
-typing_extensions==4.11.0
-urllib3==2.2.2
diff --git a/docs/selecting_a_compiler.rst b/docs/selecting_a_compiler.rst
deleted file mode 100644
index 8c09e60b4..000000000
--- a/docs/selecting_a_compiler.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-.. _selecting_a_compiler:
-
-Selecting a Compiler
-====================
-
-You must choose a compiler according to the required backend and the operating system of your
-application.
-
-* If your application requires Intel GPU, use
-  `Intel(R) oneAPI DPC++ Compiler <https://software.intel.com/en-us/oneapi/dpc-compiler>`_ ``icpx`` on Linux or ``icx`` on Windows.
-* If your Linux application requires NVIDIA GPU, build ``clang++`` from the latest source of
-  `oneAPI DPC++ Compiler <https://github.com/intel/llvm>`_ with `support for NVIDIA CUDA <https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda>`_ or use ``hipSYCL`` from the `hipSYCL repository <https://github.com/illuhad/hipSYCL>`_ (except for LAPACK domain).
-* If your Linux application requires AMD GPU, build ``clang++`` from the latest source of `oneAPI DPC++ Compiler <https://github.com/intel/llvm>`_ with `support for HIP AMD <https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-hip-amd>`_ or use ``hipSYCL``.
-* If no Intel GPU, NVIDIA GPU, or AMD GPU is required, on Linux you can use
-  `Intel(R) oneAPI DPC++ Compiler <https://software.intel.com/en-us/oneapi/dpc-compiler>`_
-  ``icpx``, `oneAPI DPC++ Compiler <https://github.com/intel/llvm/releases>`_ ``clang++``, or ``hipSYCL``,
-  and on Windows you can use either
-  `Intel(R) oneAPI DPC++ Compiler <https://software.intel.com/en-us/oneapi/dpc-compiler>`_
-  ``icx`` or `oneAPI DPC++ Compiler <https://github.com/intel/llvm/releases>`_ ``clang-cl``.
diff --git a/docs/using_onemkl_with_cmake.rst b/docs/using_onemkl_with_cmake.rst
deleted file mode 100644
index 5fb497362..000000000
--- a/docs/using_onemkl_with_cmake.rst
+++ /dev/null
@@ -1,102 +0,0 @@
-.. _using_onemkl_interface_library_with_cmake:
-
-Using the oneMKL Interfaces in your project with CMake
-=============================================================
-
-The CMake build tool can help you use oneMKL Interfaces in your own project.
-Instead of manually linking and including directories, you can use the CMake targets
-exported by the oneMKL Interfaces project. You can use oneMKL in one of two
-forms, with the target names depending on the approach taken: 
-
-* you can use a previously installed copy, either from a binary distribution or
-  built from source. This can be imported using CMake's ``find_package``
-  command. See the section `using_from_installed_binary`_.
-* or you can have CMake automatically download and build oneMKL as part of the
-  build process using CMake's FetchContent_ functionality.
-  See the section `using_with_fetchcontent`_.
-
-
-.. _using_from_installed_binary:
-
-Using an installed oneMKL Interfaces
-####################################
-
-If the oneMKL Interfaces have been previously installed, either by building from
-source or as a distributed binary, they can be consumed from CMake with
-``find_package(oneMKL REQUIRED)``. The compiler used for the target library or
-application should match that used to build oneMKL Interfaces.
-
-For example:
-
-.. code-block:: cmake
-
-    find_package(oneMKL REQUIRED)
-    target_link_libraries(myTarget PRIVATE MKL::onemkl)
-
-Different targets can be used depending on the requirements of oneMKL. 
-To link against the entire library, the ``MKL::onemkl`` target should be used.
-For specific domains, ``MKL::onemkl_<domain>`` should be used.
-And for specific backends, ``MKL::onemkl_<domain>_<backend>`` should be used.
-
-When using a binary, it may be useful to know the backends that were enabled
-during the build. To check for the existence of backends, CMake's ``if(TARGET
-<target>)`` construct can be used. For example, with the ``cufft`` backend:
-
-.. code-block:: cmake
-
-    if(TARGET MKL::onemkl_dft_cufft)
-        target_link_libraries(myTarget PRIVATE MKL::onemkl_dft_cufft)
-    else()
-        message(FATAL_ERROR "oneMKL Interfaces was not built with CuFFT backend")
-    endif()
-
-
-If oneMKL Interfaces has been installed to a non-standard location, the
-operating system may not find the backend libraries when they're lazily loaded
-at runtime. To make sure they're found you may need to set
-``LD_LIBRARY_PATH=<onemkl_install_dir>/lib:$LD_LIBRARY_PATH`` on Linux.
-
-.. _using_with_fetchcontent:
-
-Using CMake's FetchContent
-##########################
-
-
-The FetchContent_ functionality of CMake can be used to download, build and
-install oneMKL Interfaces as part of the build.
-
-For example:
-
-.. code-block:: cmake
-
-    include(FetchContent)
-    set(BUILD_FUNCTIONAL_TESTS False)
-    set(BUILD_EXAMPLES False)
-    set(ENABLE_<BACKEND_NAME>_BACKEND True)
-    FetchContent_Declare(
-            onemkl_interface_library
-            GIT_REPOSITORY https://github.com/oneapi-src/oneMKL.git
-            GIT_TAG develop
-    )
-    FetchContent_MakeAvailable(onemkl_interface_library)
-
-    target_link_libraries(myTarget PRIVATE onemkl)
-
-The build parameters should be appropriately set before
-``FetchContent_Declare``. See :ref:`building_the_project_with_dpcpp` or
-:ref:`building_the_project_with_adaptivecpp`.
-
-To link against the main library with run-time dispatching, use the target
-``onemkl``. To link against particular domains, use the target
-``onemkl_<domain>``. For example, ``onemkl_blas`` or ``onemkl_dft``. To link
-against particular backends (as required for static dispatch of oneAPI calls to
-a particular backend), use the target ``onemkl_<domain>_<backend>``. For
-example, ``onemkl_dft_cufft``.
-
-When using the run-time dispatch mechanism, it is likely that the operating
-system will not find the backend libraries when they're loaded at runtime. To
-make sure they're found you may need to set
-``LD_LIBRARY_PATH=<onemkl_install_dir>/lib:$LD_LIBRARY_PATH`` on Linux.
-
-
-.. _FetchContent: https://cmake.org/cmake/help/latest/module/FetchContent.html
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
deleted file mode 100644
index d175c5dfd..000000000
--- a/examples/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# builds examples in specified domain
-
-foreach(domain ${TARGET_DOMAINS})
-  add_subdirectory(${domain})
-endforeach()
diff --git a/examples/README.md b/examples/README.md
deleted file mode 100644
index 0dad8772d..000000000
--- a/examples/README.md
+++ /dev/null
@@ -1,597 +0,0 @@
-# oneAPI Math Kernel Library (oneMKL) Interfaces Examples 
-oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the following routines: 
-- blas: level3/gemm_usm  
-- rng: uniform_usm  
-- lapack: getrs_usm
-- dft: complex_fwd_usm, real_fwd_usm
-- sparse_blas: sparse_gemv_usm
-
-Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in the `examples/<$domain>/run_time_dispatching` and `examples/<$domain>/compile_time_dispatching` subfolders, respectively.
-
-To build examples, use cmake build option `-DBUILD_EXAMPLES=true`.  
-Compile_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and cuda backend is enabled, because the compile-time dispatching example runs on both mklcpu and cuda backends.
-Run_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and `-DBUILD_SHARED_LIBS=true`.
-
-The example executable naming convention follows `example_<$domain>_<$routine>_<$backend>` for compile-time dispatching examples 
-  or `example_<$domain>_<$routine>` for run-time dispatching examples. 
-  E.g. `example_blas_gemm_usm_mklcpu_cublas` and `example_blas_gemm_usm`.
-
-## Example outputs (blas, rng, lapack, dft, sparse_blas)
-  
-## blas
-
-Run-time dispatching examples with mklcpu backend
-```
-$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu"
-$ ./bin/example_blas_gemm_usm
-
-########################################################################
-# General Matrix-Matrix Multiplication using Unified Shared Memory Example:
-#
-# C = alpha * A * B + beta * C
-#
-# where A, B and C are general dense matrices and alpha, beta are
-# floating point type precision scalars.
-#
-# Using apis:
-#   gemm
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running BLAS GEMM USM example on CPU device.
-Device name is: Intel(R) Core(TM) i7-6770HQ CPU @ 2.60GHz
-Running with single precision real data type:
-
-                GEMM parameters:
-                        transA = trans, transB = nontrans
-                        m = 45, n = 98, k = 67
-                        lda = 103, ldB = 105, ldC = 106
-                        alpha = 2, beta = 3
-
-                Outputting 2x2 block of A,B,C matrices:
-
-                        A = [ 0.340188, 0.260249, ...
-                            [ -0.105617, 0.0125354, ...
-                            [ ...
-
-
-                        B = [ -0.326421, -0.192968, ...
-                            [ 0.363891, 0.251295, ...
-                            [ ...
-
-
-                        C = [ 0.00698781, 0.525862, ...
-                            [ 0.585167, 1.59017, ...
-                            [ ...
-
-BLAS GEMM USM example ran OK.
-
-```
-Run-time dispatching examples with mklgpu backend
-```
-$ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu"
-$ ./bin/example_blas_gemm_usm
-
-########################################################################
-# General Matrix-Matrix Multiplication using Unified Shared Memory Example:
-#
-# C = alpha * A * B + beta * C
-#
-# where A, B and C are general dense matrices and alpha, beta are
-# floating point type precision scalars.
-#
-# Using apis:
-#   gemm
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running BLAS GEMM USM example on GPU device.
-Device name is: Intel(R) Iris(R) Pro Graphics 580 [0x193b]
-Running with single precision real data type:
-
-                GEMM parameters:
-                        transA = trans, transB = nontrans
-                        m = 45, n = 98, k = 67
-                        lda = 103, ldB = 105, ldC = 106
-                        alpha = 2, beta = 3
-
-                Outputting 2x2 block of A,B,C matrices:
-
-                        A = [ 0.340188, 0.260249, ...
-                            [ -0.105617, 0.0125354, ...
-                            [ ...
-
-
-                        B = [ -0.326421, -0.192968, ...
-                            [ 0.363891, 0.251295, ...
-                            [ ...
-
-
-                        C = [ 0.00698781, 0.525862, ...
-                            [ 0.585167, 1.59017, ...
-                            [ ...
-
-BLAS GEMM USM example ran OK.
-```
-Compile-time dispatching example with both mklcpu and cublas backend
-
-(Note that the mklcpu and cublas result matrices have a small difference. This is expected due to precision limitation of `float`)
-```
-./bin/example_blas_gemm_usm_mklcpu_cublas
-
-########################################################################
-# General Matrix-Matrix Multiplication using Unified Shared Memory Example:
-#
-# C = alpha * A * B + beta * C
-#
-# where A, B and C are general dense matrices and alpha, beta are
-# floating point type precision scalars.
-#
-# Using apis:
-#   gemm
-#
-# Using single precision (float) data type
-#
-# Running on both Intel CPU and Nvidia GPU devices
-#
-########################################################################
-
-Running BLAS GEMM USM example
-Running with single precision real data type on:
-        CPU device: Intel(R) Core(TM) i9-7920X CPU @ 2.90GHz
-        GPU device: TITAN RTX
-
-                GEMM parameters:
-                        transA = trans, transB = nontrans
-                        m = 45, n = 98, k = 67
-                        lda = 103, ldB = 105, ldC = 106
-                        alpha = 2, beta = 3
-
-                Outputting 2x2 block of A,B,C matrices:
-
-                        A = [ 0.340188, 0.260249, ...
-                            [ -0.105617, 0.0125354, ...
-                            [ ...
-
-
-                        B = [ -0.326421, -0.192968, ...
-                            [ 0.363891, 0.251295, ...
-                            [ ...
-
-
-                        (CPU) C = [ 0.00698781, 0.525862, ...
-                            [ 0.585167, 1.59017, ...
-                            [ ...
-
-
-                        (GPU) C = [ 0.00698793, 0.525862, ...
-                            [ 0.585168, 1.59017, ...
-                            [ ...
-
-BLAS GEMM USM example ran OK on MKLCPU and CUBLAS
-
-```
- 
-## lapack 
-Run-time dispatching example with mklgpu backend:
-```
-$ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu"
-$ ./bin/example_lapack_getrs_usm
-
-########################################################################
-# LU Factorization and Solve Example:
-#
-# Computes LU Factorization A = P * L * U
-# and uses it to solve for X in a system of linear equations:
-#   AX = B
-# where A is a general dense matrix and B is a matrix whose columns
-# are the right-hand sides for the systems of equations.
-#
-# Using apis:
-#   getrf and getrs
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running LAPACK getrs example on GPU device.
-Device name is: Intel(R) Iris(R) Pro Graphics 580 [0x193b]
-Running with single precision real data type:
-
-                GETRF and GETRS parameters:
-                        trans = nontrans
-                        m = 23, n = 23, nrhs = 23
-                        lda = 32, ldb = 32
-
-                Outputting 2x2 block of A and X matrices:
-
-                        A = [ 0.340188, 0.304177, ...
-                            [ -0.105617, -0.343321, ...
-                            [ ...
-
-
-                        X = [ -1.1748, 1.84793, ...
-                            [ 1.47856, 0.189481, ...
-                            [ ...
-
-LAPACK GETRS USM example ran OK
-```
-
-Compile-time dispatching example with both mklcpu and cusolver backend
-```
-$ ./bin/example_lapack_getrs_usm_mklcpu_cusolver
-
-########################################################################
-# LU Factorization and Solve Example:
-#
-# Computes LU Factorization A = P * L * U
-# and uses it to solve for X in a system of linear equations:
-#   AX = B
-# where A is a general dense matrix and B is a matrix whose columns
-# are the right-hand sides for the systems of equations.
-#
-# Using apis:
-#   getrf and getrs
-#
-# Using single precision (float) data type
-#
-# Running on both Intel CPU and NVIDIA GPU devices
-#
-########################################################################
-
-Running LAPACK GETRS USM example
-Running with single precision real data type on:
-        CPU device :Intel(R) Core(TM) i9-7920X CPU @ 2.90GHz
-        GPU device :TITAN RTX
-
-                GETRF and GETRS parameters:
-                        trans = nontrans
-                        m = 23, n = 23, nrhs = 23
-                        lda = 32, ldb = 32
-
-                Outputting 2x2 block of A,B,X matrices:
-
-                        A = [ 0.340188, 0.304177, ...
-                            [ -0.105617, -0.343321, ...
-                            [ ...
-
-
-                        (CPU) X = [ -1.1748, 1.84793, ...
-                            [ 1.47856, 0.189481, ...
-                            [ ...
-
-
-                        (GPU) X = [ -1.1748, 1.84793, ...
-                            [ 1.47856, 0.189481, ...
-                            [ ...
-
-LAPACK GETRS USM example ran OK on MKLCPU and CUSOLVER
-
-```
-
-## rng
-Run-time dispatching example with mklgpu backend:
-```
-$ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu"
-$ ./bin/example_rng_uniform_usm
-
-########################################################################
-# Generate uniformly distributed random numbers with philox4x32x10
-# generator example:
-#
-# Using APIs:
-#   default_engine uniform
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running RNG uniform usm example on GPU device
-Device name is: Intel(R) Iris(R) Pro Graphics 580 [0x193b]
-Running with single precision real data type:
-                generation parameters:
-                        seed = 777, a = 0, b = 10
-                Output of generator:
-                        first 10 numbers of 1000:
-8.52971 1.76033 6.04753 3.68079 9.04039 2.61014 3.75788 3.94859 7.93444 8.60436
-Random number generator with uniform distribution ran OK
-
-```
-
-Compile-time dispatching example with both mklcpu and curand backend
-```
-$ ./bin/example_rng_uniform_usm_mklcpu_curand
-
-########################################################################
-# Generate uniformly distributed random numbers with philox4x32x10
-# generator example:
-#
-# Using APIs:
-#   default_engine uniform
-#
-# Using single precision (float) data type
-#
-# Running on both Intel CPU and Nvidia GPU devices
-#
-########################################################################
-
-Running RNG uniform usm example
-Running with single precision real data type:
-        CPU device: Intel(R) Core(TM) i9-7920X CPU @ 2.90GHz
-        GPU device: TITAN RTX
-                generation parameters:
-                        seed = 777, a = 0, b = 10
-                Output of generator on CPU device:
-                        first 10 numbers of 1000:
-8.52971 1.76033 6.04753 3.68079 9.04039 2.61014 3.75788 3.94859 7.93444 8.60436
-                Output of generator on GPU device:
-                        first 10 numbers of 1000:
-3.52971 6.76033 1.04753 8.68079 4.48229 0.501966 6.78265 8.99091 6.39516 9.67955
-Random number generator example with uniform distribution ran OK on MKLCPU and CURAND
-
-```
-
-## dft
-
-Compile-time dispatching example with MKLGPU backend
-
-```none
-$ ONEAPI_DEVICE_SELECTOR="level_zero:gpu" ./bin/example_dft_complex_fwd_buffer_mklgpu
-
-########################################################################
-# Complex out-of-place forward transform for Buffer API's example:
-#
-# Using APIs:
-#   Compile-time dispatch API
-#   Buffer forward complex out-of-place
-#
-# Using single precision (float) data type
-#
-# For Intel GPU with Intel MKLGPU backend.
-#
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-########################################################################
-
-Running DFT Complex forward out-of-place buffer example
-Using compile-time dispatch API with MKLGPU.
-Running with single precision real data type on:
-	GPU device :Intel(R) UHD Graphics 750 [0x4c8a]
-DFT Complex USM example ran OK on MKLGPU
-```
-
-Runtime dispatching example with MKLGPU, cuFFT, rocFFT and portFFT backends:
-
-```none
-$ ONEAPI_DEVICE_SELECTOR="level_zero:gpu" ./bin/example_dft_real_fwd_usm
-
-########################################################################
-# DFT complex in-place forward transform with USM API example:
-#
-# Using APIs:
-#   USM forward complex in-place
-#   Run-time dispatch
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running DFT complex forward example on GPU device
-Device name is: Intel(R) UHD Graphics 750 [0x4c8a]
-Running with single precision real data type:
-DFT example run_time dispatch
-DFT example ran OK
-```
-
-```none
-$ ONEAPI_DEVICE_SELECTOR="level_zero:gpu" ./bin/example_dft_real_fwd_usm
-
-########################################################################
-# DFT complex in-place forward transform with USM API example:
-#
-# Using APIs:
-#   USM forward complex in-place
-#   Run-time dispatch
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running DFT complex forward example on GPU device
-Device name is: NVIDIA A100-PCIE-40GB
-Running with single precision real data type:
-DFT example run_time dispatch
-DFT example ran OK
-```
-
-```none
-$ ./bin/example_dft_real_fwd_usm
-
-########################################################################
-# DFT complex in-place forward transform with USM API example:
-#
-# Using APIs:
-#   USM forward complex in-place
-#   Run-time dispatch
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running DFT complex forward example on GPU device
-Device name is: AMD Radeon PRO W6800
-Running with single precision real data type:
-DFT example run_time dispatch
-DFT example ran OK
-```
-
-```none
-$ LD_LIBRARY_PATH=lib/:$LD_LIBRARY_PATH ./bin/example_dft_real_fwd_usm
-########################################################################
-# DFT complex in-place forward transform with USM API example:
-#
-# Using APIs:
-#   USM forward complex in-place
-#   Run-time dispatch
-#
-# Using single precision (float) data type
-#
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-#
-########################################################################
-
-Running DFT complex forward example on GPU device
-Device name is: Intel(R) UHD Graphics 750
-Running with single precision real data type:
-DFT example run_time dispatch
-Unsupported Configuration:
-	oneMKL: dft/backends/portfft/commit: function is not implemented portFFT only supports complex to complex transforms
-```
-
-## sparse_blas
-
-Run-time dispatching examples with mklcpu backend
-```
-$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu"
-$ ./bin/example_sparse_blas_gemv_usm
-
-########################################################################
-# Sparse Matrix-Vector Multiply Example: 
-# 
-# y = alpha * op(A) * x + beta * y
-# 
-# where A is a sparse matrix in CSR format, x and y are dense vectors
-# and alpha, beta are floating point type precision scalars.
-# 
-# Using apis:
-#   sparse::gemv
-# 
-# Using single precision (float) data type
-# 
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-# 
-########################################################################
-
-Running Sparse BLAS GEMV USM example on CPU device.
-Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
-Running with single precision real data type:
-
-		sparse::gemv parameters:
-			transA = nontrans
-			nrows = 64
-			alpha = 1, beta = 0
-
-		 sparse::gemv example passed
-	Finished
-Sparse BLAS GEMV USM example ran OK.
-```
-
-Run-time dispatching examples with mklgpu backend
-```
-$ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu"
-$ ./bin/example_sparse_blas_gemv_usm
-
-########################################################################
-# Sparse Matrix-Vector Multiply Example: 
-# 
-# y = alpha * op(A) * x + beta * y
-# 
-# where A is a sparse matrix in CSR format, x and y are dense vectors
-# and alpha, beta are floating point type precision scalars.
-# 
-# Using apis:
-#   sparse::gemv
-# 
-# Using single precision (float) data type
-# 
-# Device will be selected during runtime.
-# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify
-# available devices
-# 
-########################################################################
-
-Running Sparse BLAS GEMV USM example on GPU device.
-Device name is: Intel(R) HD Graphics 530 [0x1912]
-Running with single precision real data type:
-
-		sparse::gemv parameters:
-			transA = nontrans
-			nrows = 64
-			alpha = 1, beta = 0
-
-		 sparse::gemv example passed
-	Finished
-Sparse BLAS GEMV USM example ran OK.
-```
-
-Compile-time dispatching example with mklcpu backend
-```
-$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu"
-$ ./bin/example_sparse_blas_gemv_usm_mklcpu
-
-########################################################################
-# Sparse Matrix-Vector Multiply Example: 
-# 
-# y = alpha * op(A) * x + beta * y
-# 
-# where A is a sparse matrix in CSR format, x and y are dense vectors
-# and alpha, beta are floating point type precision scalars.
-# 
-# Using apis:
-#   sparse::gemv
-# 
-# Using single precision (float) data type
-# 
-# Running on Intel CPU device
-# 
-########################################################################
-
-Running Sparse BLAS GEMV USM example on CPU device.
-Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz
-Running with single precision real data type:
-
-		sparse::gemv parameters:
-			transA = nontrans
-			nrows = 64
-			alpha = 1, beta = 0
-
-		 sparse::gemv example passed
-	Finished
-Sparse BLAS GEMV USM example ran OK.
-```
diff --git a/examples/blas/CMakeLists.txt b/examples/blas/CMakeLists.txt
deleted file mode 100644
index 7d692a2cb..000000000
--- a/examples/blas/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Note: compile-time example uses both MKLCPU and CUBLAS backends, therefore
-# cmake in the sub-directory will only build it if CUBLAS backend is enabled
-add_subdirectory(compile_time_dispatching)
-
-# run-time dispatching example compilation is only possible
-# with dynamic libraries
-if (BUILD_SHARED_LIBS)
-  add_subdirectory(run_time_dispatching)
-endif()
diff --git a/examples/blas/compile_time_dispatching/CMakeLists.txt b/examples/blas/compile_time_dispatching/CMakeLists.txt
deleted file mode 100644
index e3a7a4738..000000000
--- a/examples/blas/compile_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(level3)
diff --git a/examples/blas/compile_time_dispatching/level3/CMakeLists.txt b/examples/blas/compile_time_dispatching/level3/CMakeLists.txt
deleted file mode 100644
index 294e000de..000000000
--- a/examples/blas/compile_time_dispatching/level3/CMakeLists.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-#Build object from all sources
-set(BLAS_CT_SOURCES "")
-if(ENABLE_MKLCPU_BACKEND AND ENABLE_CUBLAS_BACKEND)
-  list(APPEND BLAS_CT_SOURCES "gemm_usm_mklcpu_cublas")
-endif()
-
-foreach(blas_ct_source ${BLAS_CT_SOURCES})
-  add_executable(example_${domain}_${blas_ct_source} ${blas_ct_source}.cpp)
-  target_include_directories(example_${domain}_${blas_ct_source}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  if(domain STREQUAL "blas" AND ENABLE_MKLCPU_BACKEND AND ENABLE_CUBLAS_BACKEND)
-    add_dependencies(example_${domain}_${blas_ct_source} onemkl_${domain}_mklcpu onemkl_${domain}_cublas)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_mklcpu onemkl_${domain}_cublas)
-  endif()
-
-  target_link_libraries(example_${domain}_${blas_ct_source} PUBLIC
-      ${ONEMKL_LIBRARIES_${domain}}
-      ONEMKL::SYCL::SYCL
-  )
-
-  # Register example as ctest
-  add_test(NAME ${domain}/EXAMPLE/CT/${blas_ct_source} COMMAND example_${domain}_${blas_ct_source})
-
-endforeach(blas_ct_source)
diff --git a/examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp b/examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp
deleted file mode 100644
index 358c0b768..000000000
--- a/examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of DPCPP API oneapi::mkl::blas::gemm
-*       using unified shared memory to perform General Matrix-Matrix
-*       Multiplication on a INTEL CPU SYCL device and an NVIDIA GPU SYCL device
-*
-*       C = alpha * op(A) * op(B) + beta * C
-*
-*       where op() is defined by one of oneapi::mkl::transpose::{nontrans,trans,conjtrans}
-*
-*
-*       This example demonstrates only single precision (float) data type for
-*       gemm matrix data
-*
-*
-*******************************************************************************/
-
-// stl includes
-#include <algorithm>
-#include <cstdlib>
-#include <iostream>
-#include <vector>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-// local includes
-#include "example_helper.hpp"
-
-//
-// Main example for Gemm consisting of
-// initialization of A, B and C matrices as well as
-// scalars alpha and beta.  Then the product
-//
-// C = alpha * op(A) * op(B) + beta * C
-//
-// is performed and finally the results are post processed.
-//
-void run_gemm_example(const sycl::device &cpu_dev, const sycl::device &gpu_dev) {
-    //
-    // Initialize data for Gemm
-    //
-    // C = alpha * op(A) * op(B)  + beta * C
-    //
-    oneapi::mkl::transpose transA = oneapi::mkl::transpose::trans;
-    oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
-
-    // matrix data sizes
-    int m = 45;
-    int n = 98;
-    int k = 67;
-
-    // leading dimensions of data
-    int ldA = 103;
-    int ldB = 105;
-    int ldC = 106;
-    int sizea = (transA == oneapi::mkl::transpose::nontrans) ? ldA * k : ldA * m;
-    int sizeb = (transB == oneapi::mkl::transpose::nontrans) ? ldB * n : ldB * k;
-    int sizec = ldC * n;
-
-    // set scalar fp values
-    float alpha = set_fp_value(float(2.0), float(-0.5));
-    float beta = set_fp_value(float(3.0), float(-1.5));
-
-    // Catch asynchronous exceptions for CPU and GPU
-    auto cpu_exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e) {
-                std::cerr << "Caught asynchronous SYCL exception on CPU device during GEMM:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-    auto gpu_exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e) {
-                std::cerr << "Caught asynchronous SYCL exception on GPU device during GEMM:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    //
-    // Data Preparation on host
-    //
-    std::vector<float> A(sizea);
-    std::vector<float> B(sizeb);
-    std::vector<float> C(sizec);
-    std::vector<float> result_cpu(sizec);
-    std::vector<float> result_gpu(sizec);
-    std::fill(A.begin(), A.end(), 0);
-    std::fill(B.begin(), B.end(), 0);
-    std::fill(C.begin(), C.end(), 0);
-    std::fill(result_cpu.begin(), result_cpu.end(), 0);
-    std::fill(result_gpu.begin(), result_gpu.end(), 0);
-
-    rand_matrix(A, transA, m, k, ldA);
-    rand_matrix(B, transB, k, n, ldB);
-    rand_matrix(C, oneapi::mkl::transpose::nontrans, m, n, ldC);
-
-    //
-    // Preparation on CPU
-    //
-    sycl::queue cpu_queue(cpu_dev, cpu_exception_handler);
-    sycl::event cpu_gemm_done;
-    sycl::context cpu_cxt = cpu_queue.get_context();
-
-    // allocate on CPU device and copy data from host to SYCL CPU device
-    float *cpu_A = sycl::malloc_device<float>(sizea * sizeof(float), cpu_queue);
-    float *cpu_B = sycl::malloc_device<float>(sizeb * sizeof(float), cpu_queue);
-    float *cpu_C = sycl::malloc_device<float>(sizec * sizeof(float), cpu_queue);
-    if (!cpu_A || !cpu_B || !cpu_C) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-    cpu_queue.memcpy(cpu_A, A.data(), sizea * sizeof(float)).wait();
-    cpu_queue.memcpy(cpu_B, B.data(), sizeb * sizeof(float)).wait();
-    cpu_queue.memcpy(cpu_C, C.data(), sizec * sizeof(float)).wait();
-
-    //
-    // Preparation on GPU
-    //
-    sycl::queue gpu_queue(gpu_dev, gpu_exception_handler);
-    sycl::event gpu_gemm_done;
-    sycl::context gpu_cxt = gpu_queue.get_context();
-
-    // allocate on GPU device and copy data from host to SYCL GPU device
-    float *gpu_A = sycl::malloc_device<float>(sizea * sizeof(float), gpu_queue);
-    float *gpu_B = sycl::malloc_device<float>(sizeb * sizeof(float), gpu_queue);
-    float *gpu_C = sycl::malloc_device<float>(sizec * sizeof(float), gpu_queue);
-    if (!gpu_A || !gpu_B || !gpu_C) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-    gpu_queue.memcpy(gpu_A, A.data(), sizea * sizeof(float)).wait();
-    gpu_queue.memcpy(gpu_B, B.data(), sizeb * sizeof(float)).wait();
-    gpu_queue.memcpy(gpu_C, C.data(), sizec * sizeof(float)).wait();
-
-    //
-    // Execute Gemm on CPU and GPU device
-    //
-    // add oneapi::mkl::blas::gemm to execution queue
-    cpu_gemm_done = oneapi::mkl::blas::column_major::gemm(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue }, transA, transB, m,
-        n, k, alpha, cpu_A, ldA, cpu_B, ldB, beta, cpu_C, ldC);
-    gpu_gemm_done = oneapi::mkl::blas::column_major::gemm(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ gpu_queue }, transA, transB, m,
-        n, k, alpha, gpu_A, ldA, gpu_B, ldB, beta, gpu_C, ldC);
-
-    // Wait until calculations are done
-    cpu_gemm_done.wait_and_throw();
-    gpu_gemm_done.wait_and_throw();
-
-    //
-    // Post Processing
-    //
-    // copy data from CPU back to host
-    cpu_queue.memcpy(result_cpu.data(), cpu_C, sizec * sizeof(float)).wait_and_throw();
-
-    // copy data from GPU back to host
-    gpu_queue.memcpy(result_gpu.data(), gpu_C, sizec * sizeof(float)).wait_and_throw();
-
-    // print results
-    std::cout << "\n\t\tGEMM parameters:" << std::endl;
-    std::cout << "\t\t\ttransA = "
-              << (transA == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << ", transB = "
-              << (transB == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (transB == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << std::endl;
-    std::cout << "\t\t\tm = " << m << ", n = " << n << ", k = " << k << std::endl;
-    std::cout << "\t\t\tlda = " << ldA << ", ldB = " << ldB << ", ldC = " << ldC << std::endl;
-    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
-
-    std::cout << "\n\t\tOutputting 2x2 block of A,B,C matrices:" << std::endl;
-
-    // output the top 2x2 block of A matrix
-    print_2x2_matrix_values(A.data(), ldA, "A");
-
-    // output the top 2x2 block of B matrix
-    print_2x2_matrix_values(B.data(), ldB, "B");
-
-    // output the top 2x2 block of C matrix from CPU
-    print_2x2_matrix_values(result_cpu.data(), ldC, "(CPU) C");
-
-    // output the top 2x2 block of C matrix from GPU
-    print_2x2_matrix_values(result_gpu.data(), ldC, "(GPU) C");
-
-    sycl::free(gpu_C, gpu_queue);
-    sycl::free(gpu_B, gpu_queue);
-    sycl::free(gpu_A, gpu_queue);
-    sycl::free(cpu_C, cpu_queue);
-    sycl::free(cpu_B, cpu_queue);
-    sycl::free(cpu_A, cpu_queue);
-}
-
-//
-// Description of example setup, apis used and supported floating point type precisions
-//
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << "# General Matrix-Matrix Multiplication using Unified Shared Memory Example: "
-              << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# C = alpha * A * B + beta * C" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# where A, B and C are general dense matrices and alpha, beta are" << std::endl;
-    std::cout << "# floating point type precision scalars." << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using apis:" << std::endl;
-    std::cout << "#   gemm" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Running on both Intel CPU and Nvidia GPU devices" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-int main(int argc, char **argv) {
-    print_example_banner();
-
-    try {
-        sycl::device cpu_dev((sycl::cpu_selector()));
-        sycl::device gpu_dev((sycl::gpu_selector()));
-
-        unsigned int vendor_id = gpu_dev.get_info<sycl::info::device::vendor_id>();
-        if (vendor_id != NVIDIA_ID) {
-            std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
-            return 1;
-        }
-        std::cout << "Running BLAS GEMM USM example" << std::endl;
-        std::cout << "Running with single precision real data type on:" << std::endl;
-        std::cout << "\tCPU device: " << cpu_dev.get_info<sycl::info::device::name>() << std::endl;
-        std::cout << "\tGPU device: " << gpu_dev.get_info<sycl::info::device::name>() << std::endl;
-        run_gemm_example(cpu_dev, gpu_dev);
-        std::cout << "BLAS GEMM USM example ran OK on MKLCPU and CUBLAS" << std::endl;
-    }
-    catch (sycl::exception const &e) {
-        std::cerr << "Caught synchronous SYCL exception during GEMM:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const &e) {
-        std::cerr << "Caught std::exception during GEMM:";
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-    return 0;
-}
diff --git a/examples/blas/run_time_dispatching/CMakeLists.txt b/examples/blas/run_time_dispatching/CMakeLists.txt
deleted file mode 100644
index e3a7a4738..000000000
--- a/examples/blas/run_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(level3)
diff --git a/examples/blas/run_time_dispatching/level3/CMakeLists.txt b/examples/blas/run_time_dispatching/level3/CMakeLists.txt
deleted file mode 100644
index d0d35fc0d..000000000
--- a/examples/blas/run_time_dispatching/level3/CMakeLists.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# NOTE: user needs to set env var ONEAPI_DEVICE_SELECTOR to use runtime example without specifying backend in CMake
-# $ENV{ONEAPI_DEVICE_SELECTOR}
-
-
-# Build object from all example sources
-set(BLAS_RT_SOURCES "gemm_usm")
-
-# Set up for the right backend for run-time dispatching examples
-# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
-# overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
-set(DEVICE_FILTERS "")
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "opencl:cpu")
-endif()
-if(ENABLE_MKLGPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "level_zero:gpu")
-endif()
-if(ENABLE_CUBLAS_BACKEND)
-  list(APPEND DEVICE_FILTERS "cuda:gpu")
-endif()
-if(ENABLE_ROCBLAS_BACKEND)
-  list(APPEND DEVICE_FILTERS "hip:gpu")
-endif()
-if(ENABLE_PORTBLAS_BACKEND)
-  if(PORTBLAS_TUNING_TARGET)
-    if(PORTBLAS_TUNING_TARGET MATCHES "INTEL_CPU")
-      list(APPEND DEVICE_FILTERS "opencl:cpu")
-    elseif(PORTBLAS_TUNING_TARGET MATCHES "_GPU")
-      list(APPEND DEVICE_FILTERS "*:gpu")
-    endif()
-  else()
-    # portBLAS default sycl-target is spir64, testing runtime on both supported
-    # devices.
-    list(APPEND DEVICE_FILTERS "opencl:cpu;level_zero:gpu")
-  endif()
-endif()
-
-message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
-
-foreach(blas_rt_source ${BLAS_RT_SOURCES})
-  add_executable(example_${domain}_${blas_rt_source} ${blas_rt_source}.cpp)
-  target_include_directories(example_${domain}_${blas_rt_source}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  add_dependencies(example_${domain}_${blas_rt_source} onemkl)
-
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET example_${domain}_${blas_rt_source} SOURCES ${BLAS_RT_SOURCES})
-  endif()
-
-  target_link_libraries(example_${domain}_${blas_rt_source} PUBLIC
-      onemkl
-      ONEMKL::SYCL::SYCL
-      ${CMAKE_DL_LIBS}
-  )
-
-  # Register example as ctest
-  foreach(device_filter ${DEVICE_FILTERS})
-    add_test(NAME ${domain}/EXAMPLE/RT/${blas_rt_source}/${device_filter} COMMAND example_${domain}_${blas_rt_source})
-    set_property(TEST ${domain}/EXAMPLE/RT/${blas_rt_source}/${device_filter} PROPERTY
-      ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
-      ENVIRONMENT ONEAPI_DEVICE_SELECTOR=${device_filter})
-  endforeach(device_filter)
-
-endforeach(blas_rt_source)
diff --git a/examples/blas/run_time_dispatching/level3/gemm_usm.cpp b/examples/blas/run_time_dispatching/level3/gemm_usm.cpp
deleted file mode 100644
index cd59e7b7f..000000000
--- a/examples/blas/run_time_dispatching/level3/gemm_usm.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of DPCPP API oneapi::mkl::blas::gemm
-*       using unified shared memory to perform General Matrix-Matrix
-*       Multiplication on a SYCL device (HOST, CPU, GPU) that is selected
-*       during runtime.
-*
-*       C = alpha * op(A) * op(B) + beta * C
-*
-*       where op() is defined by one of oneapi::mkl::transpose::{nontrans,trans,conjtrans}
-*
-*
-*       This example demonstrates only single precision (float) data type for
-*       gemm matrix data
-*
-*
-*******************************************************************************/
-
-// stl includes
-#include <algorithm>
-#include <cstdlib>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-#include "example_helper.hpp"
-
-//
-// Main example for Gemm consisting of
-// initialization of A, B and C matrices as well as
-// scalars alpha and beta.  Then the product
-//
-// C = alpha * op(A) * op(B) + beta * C
-//
-// is performed and finally the results are post processed.
-//
-void run_gemm_example(const sycl::device& dev) {
-    //
-    // Initialize data for Gemm
-    //
-    // C = alpha * op(A) * op(B)  + beta * C
-    //
-
-    oneapi::mkl::transpose transA = oneapi::mkl::transpose::trans;
-    oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
-
-    // matrix data sizes
-    int m = 45;
-    int n = 98;
-    int k = 67;
-
-    // leading dimensions of data
-    int ldA = 103;
-    int ldB = 105;
-    int ldC = 106;
-    int sizea = (transA == oneapi::mkl::transpose::nontrans) ? ldA * k : ldA * m;
-    int sizeb = (transB == oneapi::mkl::transpose::nontrans) ? ldB * n : ldB * k;
-    int sizec = ldC * n;
-
-    // set scalar fp values
-    float alpha = set_fp_value(float(2.0), float(-0.5));
-    float beta = set_fp_value(float(3.0), float(-1.5));
-
-    // Catch asynchronous exceptions
-    auto exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                std::cerr << "Caught asynchronous SYCL exception during GEMM:" << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    // create execution queue
-    sycl::queue main_queue(dev, exception_handler);
-    sycl::event gemm_done;
-    sycl::context cxt = main_queue.get_context();
-
-    // allocate matrix on host
-    std::vector<float> A(sizea);
-    std::vector<float> B(sizeb);
-    std::vector<float> C(sizec);
-    std::fill(A.begin(), A.end(), 0);
-    std::fill(B.begin(), B.end(), 0);
-    std::fill(C.begin(), C.end(), 0);
-
-    rand_matrix(A, transA, m, k, ldA);
-    rand_matrix(B, transB, k, n, ldB);
-    rand_matrix(C, oneapi::mkl::transpose::nontrans, m, n, ldC);
-
-    // allocate memory on device
-    auto dev_A = sycl::malloc_device<float>(sizea * sizeof(float), main_queue);
-    auto dev_B = sycl::malloc_device<float>(sizeb * sizeof(float), main_queue);
-    auto dev_C = sycl::malloc_device<float>(sizec * sizeof(float), main_queue);
-    if (!dev_A || !dev_B || !dev_C) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-
-    // copy data from host to device
-    main_queue.memcpy(dev_A, A.data(), sizea * sizeof(float)).wait();
-    main_queue.memcpy(dev_B, B.data(), sizeb * sizeof(float)).wait();
-    main_queue.memcpy(dev_C, C.data(), sizec * sizeof(float)).wait();
-
-    //
-    // Execute Gemm
-    //
-    // add oneapi::mkl::blas::gemm to execution queue
-    gemm_done = oneapi::mkl::blas::column_major::gemm(main_queue, transA, transB, m, n, k, alpha,
-                                                      dev_A, ldA, dev_B, ldB, beta, dev_C, ldC);
-
-    // Wait until calculations are done
-    main_queue.wait_and_throw();
-
-    //
-    // Post Processing
-    //
-    // copy data from device back to host
-    main_queue.memcpy(C.data(), dev_C, sizec * sizeof(float)).wait_and_throw();
-
-    std::cout << "\n\t\tGEMM parameters:" << std::endl;
-    std::cout << "\t\t\ttransA = "
-              << (transA == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << ", transB = "
-              << (transB == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (transB == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << std::endl;
-    std::cout << "\t\t\tm = " << m << ", n = " << n << ", k = " << k << std::endl;
-    std::cout << "\t\t\tlda = " << ldA << ", ldB = " << ldB << ", ldC = " << ldC << std::endl;
-    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
-
-    std::cout << "\n\t\tOutputting 2x2 block of A,B,C matrices:" << std::endl;
-
-    // output the top 2x2 block of A matrix
-    print_2x2_matrix_values(A.data(), ldA, "A");
-
-    // output the top 2x2 block of B matrix
-    print_2x2_matrix_values(B.data(), ldB, "B");
-
-    // output the top 2x2 block of C matrix
-    print_2x2_matrix_values(C.data(), ldC, "C");
-
-    sycl::free(dev_C, main_queue);
-    sycl::free(dev_B, main_queue);
-    sycl::free(dev_A, main_queue);
-}
-
-//
-// Description of example setup, apis used and supported floating point type precisions
-//
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << "# General Matrix-Matrix Multiplication using Unified Shared Memory Example: "
-              << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# C = alpha * A * B + beta * C" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# where A, B and C are general dense matrices and alpha, beta are" << std::endl;
-    std::cout << "# floating point type precision scalars." << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using apis:" << std::endl;
-    std::cout << "#   gemm" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Device will be selected during runtime." << std::endl;
-    std::cout << "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify"
-              << std::endl;
-    std::cout << "# available devices" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example
-//
-int main(int argc, char** argv) {
-    print_example_banner();
-
-    try {
-        sycl::device dev = sycl::device();
-
-        if (dev.is_gpu()) {
-            std::cout << "Running BLAS GEMM USM example on GPU device." << std::endl;
-            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        else {
-            std::cout << "Running BLAS GEMM USM example on CPU device." << std::endl;
-            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        std::cout << "Running with single precision real data type:" << std::endl;
-
-        run_gemm_example(dev);
-        std::cout << "BLAS GEMM USM example ran OK." << std::endl;
-    }
-    catch (sycl::exception const& e) {
-        std::cerr << "Caught synchronous SYCL exception during GEMM:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        std::cerr << "Caught std::exception during GEMM:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/examples/dft/CMakeLists.txt b/examples/dft/CMakeLists.txt
deleted file mode 100644
index 721512429..000000000
--- a/examples/dft/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(compile_time_dispatching)
-
-# runtime compilation is only possible with dynamic libraries
-if (BUILD_SHARED_LIBS)
-  add_subdirectory(run_time_dispatching)
-endif()
diff --git a/examples/dft/compile_time_dispatching/CMakeLists.txt b/examples/dft/compile_time_dispatching/CMakeLists.txt
deleted file mode 100644
index ed0ca2922..000000000
--- a/examples/dft/compile_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-#Build object from all sources
-set(DFT_CT_SOURCES "")
-if (ENABLE_MKLCPU_BACKEND AND ENABLE_CUFFT_BACKEND)
-  list(APPEND DFT_CT_SOURCES "complex_fwd_usm_mklcpu_cufft")
-endif()
-
-include(WarningsUtils)
-
-foreach(dft_ct_source ${DFT_CT_SOURCES})
-  set(EXAMPLE_NAME example_${domain}_${dft_ct_source})
-  add_executable(${EXAMPLE_NAME} ${dft_ct_source}.cpp)
-  target_include_directories(${EXAMPLE_NAME}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  if(domain STREQUAL "dft" AND ENABLE_MKLCPU_BACKEND AND ENABLE_CUFFT_BACKEND)
-    add_dependencies(${EXAMPLE_NAME} onemkl_${domain}_mklcpu onemkl_${domain}_cufft)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_mklcpu onemkl_${domain}_cufft)
-  endif()
-
-  target_link_libraries(${EXAMPLE_NAME} PUBLIC
-    ${ONEMKL_LIBRARIES_${domain}}
-    onemkl_warnings
-  )
-
-  # Register example as ctest
-  add_test(NAME dft/EXAMPLE/CT/${dft_ct_source} COMMAND ${EXAMPLE_NAME})
-
-endforeach(dft_ct_source)
-
diff --git a/examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp b/examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp
deleted file mode 100644
index 59c810f3f..000000000
--- a/examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*******************************************************************************
-* Copyright 2024 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// STL includes
-#include <iostream>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-#include <complex>
-
-void run_example(const sycl::device& cpu_device, const sycl::device& gpu_device) {
-    constexpr std::size_t N = 10;
-
-    // Catch asynchronous exceptions for cpu
-    auto cpu_error_handler = [&](sycl::exception_list exceptions) {
-        for (auto const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                // Handle not dft related exceptions that happened during asynchronous call
-                std::cerr << "Caught asynchronous SYCL exception on CPU device during execution:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-    // Catch asynchronous exceptions for gpu
-    auto gpu_error_handler = [&](sycl::exception_list exceptions) {
-        for (auto const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                // Handle not dft related exceptions that happened during asynchronous call
-                std::cerr << "Caught asynchronous SYCL exception on GPU device during execution:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    // Preparation CPU device and GPU device
-    sycl::queue cpu_queue(cpu_device, cpu_error_handler);
-    sycl::queue gpu_queue(gpu_device, gpu_error_handler);
-
-    // allocate on CPU device and GPU device
-    auto cpu_input_data = sycl::malloc_shared<std::complex<float>>(N, cpu_queue);
-    auto cpu_output_data = sycl::malloc_shared<std::complex<float>>(N, cpu_queue);
-
-    auto gpu_input_data = sycl::malloc_shared<std::complex<float>>(N, gpu_queue);
-    auto gpu_output_data = sycl::malloc_shared<std::complex<float>>(N, gpu_queue);
-
-    // Initialize input data
-    for (std::size_t i = 0; i < N; ++i) {
-        cpu_input_data[i] = { static_cast<float>(i), static_cast<float>(-i) };
-        gpu_input_data[i] = { static_cast<float>(i), static_cast<float>(-i) };
-    }
-
-    // enabling
-    // 1. create descriptors
-    oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE,
-                                 oneapi::mkl::dft::domain::COMPLEX>
-        desc(static_cast<std::int64_t>(N));
-
-    // 2. variadic set_value
-    desc.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                   oneapi::mkl::dft::config_value::NOT_INPLACE);
-    desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                   static_cast<std::int64_t>(1));
-
-    // 3a. commit_descriptor (compile_time MKLCPU)
-    desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue });
-
-    // 4a. compute_forward / compute_backward (MKLCPU)
-    oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
-        desc, cpu_input_data, cpu_output_data);
-
-    // 3b. commit_descriptor (compile_time cuFFT)
-    desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::cufft>{ gpu_queue });
-
-    // 4b. compute_forward / compute_backward (cuFFT)
-    oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
-        desc, gpu_input_data, gpu_output_data);
-
-    cpu_queue.wait_and_throw();
-    gpu_queue.wait_and_throw();
-
-    sycl::free(cpu_input_data, cpu_queue);
-    sycl::free(gpu_input_data, gpu_queue);
-    sycl::free(cpu_output_data, cpu_queue);
-    sycl::free(gpu_output_data, gpu_queue);
-}
-
-//
-// Description of example setup, apis used and supported floating point type precisions
-//
-void print_example_banner() {
-    std::cout << "\n"
-                 "########################################################################\n"
-                 "# Complex out-of-place forward transform for USM API's example:\n"
-                 "#\n"
-                 "# Using APIs:\n"
-                 "#   Compile-time dispatch API\n"
-                 "#   USM forward complex out-of-place\n"
-                 "#\n"
-                 "# Using single precision (float) data type\n"
-                 "#\n"
-                 "# Running on both Intel CPU and NVIDIA GPU devices.\n"
-                 "#\n"
-                 "########################################################################\n"
-              << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-int main(int /*argc*/, char** /*argv*/) {
-    print_example_banner();
-
-    try {
-        sycl::device cpu_device((sycl::cpu_selector_v));
-        sycl::device gpu_device((sycl::gpu_selector_v));
-
-        unsigned int vendor_id = gpu_device.get_info<sycl::info::device::vendor_id>();
-        if (vendor_id != NVIDIA_ID) {
-            std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
-            return 1;
-        }
-
-        std::cout << "Running DFT Complex forward out-of-place usm example" << std::endl;
-        std::cout << "Using compile-time dispatch API with MKLCPU and cuFFT." << std::endl;
-        std::cout << "Running with single precision real data type on:" << std::endl;
-        std::cout << "\tCPU device: " << cpu_device.get_info<sycl::info::device::name>()
-                  << std::endl;
-        std::cout << "\tGPU device :" << gpu_device.get_info<sycl::info::device::name>()
-                  << std::endl;
-
-        run_example(cpu_device, gpu_device);
-        std::cout << "DFT Complex USM example ran OK on MKLCPU and CUFFT" << std::endl;
-    }
-    catch (sycl::exception const& e) {
-        // Handle not dft related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        // Handle not SYCL related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous std::exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/examples/dft/run_time_dispatching/CMakeLists.txt b/examples/dft/run_time_dispatching/CMakeLists.txt
deleted file mode 100644
index e221c7950..000000000
--- a/examples/dft/run_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,81 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# NOTE: user needs to set env var ONEAPI_DEVICE_SELECTOR to use runtime example (no need to specify backend when building with CMake)
-include(WarningsUtils)
-
-
-# Build object from all example sources
-set(DFT_RT_SOURCES "")
-# Set up for the right backend for run-time dispatching examples
-# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
-# overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
-set(DEVICE_FILTERS "")
-if(ENABLE_MKLGPU_BACKEND OR ENABLE_MKLCPU_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_ROCFFT_BACKEND OR ENABLE_PORTFFT_BACKEND)
-  list(APPEND DFT_RT_SOURCES "real_fwd_usm")
-endif()
-
-if(ENABLE_MKLGPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "level_zero:gpu")
-endif()
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "opencl:cpu")
-endif()
-if(ENABLE_PORTFFT_BACKEND)
-  list(APPEND DEVICE_FILTERS "*:gpu")
-endif()
-if(ENABLE_CUFFT_BACKEND)
-  list(APPEND DEVICE_FILTERS "cuda:gpu")
-endif()
-if(ENABLE_ROCFFT_BACKEND)
-  list(APPEND DEVICE_FILTERS "hip:gpu")
-endif()
-
-message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
-
-foreach(dft_rt_sources ${DFT_RT_SOURCES})
-  add_executable(example_${domain}_${dft_rt_sources} ${dft_rt_sources}.cpp)
-  target_include_directories(example_${domain}_${dft_rt_sources}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  add_dependencies(example_${domain}_${dft_rt_sources} onemkl)
-
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET example_${domain}_${dft_rt_sources} SOURCES ${DFT_RT_SOURCES})
-  endif()
-
-  target_link_libraries(example_${domain}_${dft_rt_sources}
-      PUBLIC onemkl
-      PUBLIC ONEMKL::SYCL::SYCL
-      PUBLIC ${CMAKE_DL_LIBS}
-      PRIVATE onemkl_warnings
-  )
-
-  # Register example as ctest
-  foreach(device_filter ${DEVICE_FILTERS})
-    add_test(NAME ${domain}/EXAMPLE/RT/${dft_rt_sources}/${device_filter} COMMAND example_${domain}_${dft_rt_sources})
-    set_property(TEST ${domain}/EXAMPLE/RT/${dft_rt_sources}/${device_filter} PROPERTY
-      ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
-      ENVIRONMENT ONEAPI_DEVICE_SELECTOR=${device_filter})
-  endforeach(device_filter)
-
-endforeach()
diff --git a/examples/dft/run_time_dispatching/real_fwd_usm.cpp b/examples/dft/run_time_dispatching/real_fwd_usm.cpp
deleted file mode 100644
index c220b0ee7..000000000
--- a/examples/dft/run_time_dispatching/real_fwd_usm.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// stl includes
-#include <iostream>
-#include <cstdint>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-
-void run_example(const sycl::device& dev) {
-    constexpr std::size_t N = 16;
-
-    // Catch asynchronous exceptions
-    auto exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                std::cerr << "Caught asynchronous SYCL exception:" << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    std::cout << "DFT example run_time dispatch" << std::endl;
-
-    sycl::queue sycl_queue(dev, exception_handler);
-    auto x_usm = sycl::malloc_shared<float>(N * 2, sycl_queue);
-
-    // 1. create descriptors
-    oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE,
-                                 oneapi::mkl::dft::domain::REAL>
-        desc(static_cast<std::int64_t>(N));
-
-    // 2. variadic set_value
-    desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                   static_cast<std::int64_t>(1));
-    desc.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                   oneapi::mkl::dft::config_value::INPLACE);
-
-    // 3. commit_descriptor (runtime dispatch)
-    desc.commit(sycl_queue);
-
-    // 4. compute_forward / compute_backward (runtime dispatch)
-    auto compute_event = oneapi::mkl::dft::compute_forward(desc, x_usm);
-
-    // Do something with transformed data.
-    compute_event.wait();
-
-    // 5. Free USM allocation.
-    sycl::free(x_usm, sycl_queue);
-}
-
-//
-// Description of example setup, APIs used and supported floating point type precisions
-//
-void print_example_banner() {
-    std::cout << "########################################################################\n"
-                 "# DFT complex in-place forward transform with USM API example:\n"
-                 "#\n"
-                 "# Using APIs:\n"
-                 "#   USM forward complex in-place\n"
-                 "#   Run-time dispatch\n"
-                 "#\n"
-                 "# Using single precision (float) data type\n"
-                 "#\n"
-                 "# Device will be selected during runtime.\n"
-                 "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify\n"
-                 "# available devices\n"
-                 "#\n"
-                 "########################################################################\n"
-              << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-
-int main(int /*argc*/, char** /*argv*/) {
-    print_example_banner();
-
-    try {
-        sycl::device my_dev((sycl::default_selector_v));
-
-        if (my_dev.is_gpu()) {
-            std::cout << "Running DFT complex forward example on GPU device" << std::endl;
-            std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        else {
-            std::cout << "Running DFT complex forward example on CPU device" << std::endl;
-            std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        std::cout << "Running with single precision real data type:" << std::endl;
-
-        run_example(my_dev);
-        std::cout << "DFT example ran OK" << std::endl;
-    }
-    catch (oneapi::mkl::unimplemented const& e) {
-        std::cerr << "Unsupported Configuration:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 0;
-    }
-    catch (sycl::exception const& e) {
-        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        std::cerr << "Caught std::exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-    return 0;
-}
diff --git a/examples/include/example_helper.hpp b/examples/include/example_helper.hpp
deleted file mode 100644
index 4a89e6fae..000000000
--- a/examples/include/example_helper.hpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef __EXAMPLE_HELPER_HPP__
-#define __EXAMPLE_HELPER_HPP__
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <complex>
-#include <iostream>
-#include <limits>
-#include <type_traits>
-#include <vector>
-
-// Complex helpers.
-template <typename T>
-struct complex_info {
-    using real_type = T;
-    static const bool is_complex = false;
-};
-
-template <typename T>
-struct complex_info<std::complex<T>> {
-    using real_type = T;
-    static const bool is_complex = true;
-};
-
-template <class T>
-struct is_complex : std::false_type {};
-template <class T>
-struct is_complex<std::complex<T>> : std::true_type {};
-
-//
-// helpers for initializing templated scalar data type values.
-//
-template <typename fp>
-fp set_fp_value(fp arg1, fp /*arg2*/ = fp(0.0)) {
-    return arg1;
-}
-
-template <typename fp>
-std::complex<fp> set_fp_value(std::complex<fp> arg1,
-                              std::complex<fp> arg2 = std::complex<fp>(0.0)) {
-    return std::complex<fp>(arg1.real(), arg2.real());
-}
-
-//
-// print a 2x2 block of data from matrix M using the sycl accessor
-//
-// M = [ M_00, M_01 ...
-//     [ M_10, M_11 ...
-//     [ ...
-//
-template <typename T>
-void print_2x2_matrix_values(T M, int ldM, std::string M_name) {
-    std::cout << std::endl;
-    std::cout << "\t\t\t" << M_name << " = [ " << M[0 * ldM + 0] << ", " << M[1 * ldM + 0]
-              << ", ...\n";
-    std::cout << "\t\t\t    [ " << M[0 * ldM + 1] << ", " << M[1 * ldM + 1] << ", ...\n";
-    std::cout << "\t\t\t    [ "
-              << "...\n";
-    std::cout << std::endl;
-}
-
-template <typename fp>
-fp rand_scalar() {
-    return fp(std::rand()) / fp(RAND_MAX) - fp(0.5);
-}
-
-template <typename vec>
-void rand_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld) {
-    using fp = typename vec::value_type;
-
-    if (trans == oneapi::mkl::transpose::nontrans) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++)
-                M.at(i + j * ld) = rand_scalar<fp>();
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++)
-                M.at(j + i * ld) = rand_scalar<fp>();
-    }
-}
-
-template <typename fp, typename intType>
-intType generate_sparse_matrix(const intType nx, intType *ia, intType *ja, fp *a,
-                               const intType index = 0) {
-    intType nz = nx, ny = nx;
-    intType nnz = 0;
-    intType current_row;
-
-    ia[0] = index;
-
-    for (intType iz = 0; iz < nz; iz++) {
-        for (intType iy = 0; iy < ny; iy++) {
-            for (intType ix = 0; ix < nx; ix++) {
-                current_row = iz * nx * ny + iy * nx + ix;
-
-                for (intType sz = -1; sz <= 1; sz++) {
-                    if (iz + sz > -1 && iz + sz < nz) {
-                        for (intType sy = -1; sy <= 1; sy++) {
-                            if (iy + sy > -1 && iy + sy < ny) {
-                                for (intType sx = -1; sx <= 1; sx++) {
-                                    if (ix + sx > -1 && ix + sx < nx) {
-                                        intType current_column =
-                                            current_row + sz * nx * ny + sy * nx + sx;
-                                        ja[nnz] = current_column + index;
-                                        if (current_column == current_row) {
-                                            a[nnz++] = set_fp_value(fp(26.0));
-                                        }
-                                        else {
-                                            a[nnz++] = set_fp_value(fp(-1.0));
-                                        }
-                                    } // end
-                                    // x
-                                    // bounds
-                                    // test
-                                } // end sx loop
-                            } // end y bounds test
-                        } // end sy loop
-                    } // end z bounds test
-                } // end sz loop
-                ia[current_row + 1] = nnz + index;
-
-            } // end ix loop
-        } // end iy loop
-    } // end iz loop
-    return nnz;
-}
-
-template <typename fp, typename fp_real>
-bool check_errors(fp x, fp x_ref, fp_real bound) {
-    fp_real aerr = std::abs(x - x_ref);
-    fp_real rerr = aerr / (std::abs(x_ref) + std::numeric_limits<fp_real>::epsilon());
-    bool ok = (rerr <= bound) || (aerr <= bound);
-    if (!ok)
-        std::cout << "relative error = " << rerr << " absolute error = " << aerr
-                  << " limit = " << bound;
-    return ok;
-}
-
-template <typename fp, typename intType>
-bool check_result(fp res, fp ref, intType nFlops, intType index) {
-    bool check;
-    using fp_real = typename complex_info<fp>::real_type;
-    fp_real bound = std::numeric_limits<fp_real>::epsilon() * static_cast<fp_real>(nFlops);
-    check = check_errors<fp, fp_real>(res, ref, bound);
-    if (!check)
-        std::cout << " in index: " << index << std::endl;
-    return check;
-}
-
-template <typename T>
-void free_vec(std::vector<T *> &ptr_vec, sycl::queue queue) {
-    for (auto ptr : ptr_vec) {
-        sycl::free(ptr, queue);
-    }
-    ptr_vec.clear();
-}
-
-#endif //__EXAMPLE_HELPER_HPP__
diff --git a/examples/lapack/CMakeLists.txt b/examples/lapack/CMakeLists.txt
deleted file mode 100644
index 06bd70859..000000000
--- a/examples/lapack/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Note: compile-time example uses both MKLCPU and CUSOLVER backends, therefore
-# cmake in the sub-directory will only build it if CUSOLVER backend is enabled
-add_subdirectory(compile_time_dispatching)
-
-# runtime compilation is only possible with dynamic libraries
-if (BUILD_SHARED_LIBS)
-  add_subdirectory(run_time_dispatching)
-endif()
diff --git a/examples/lapack/compile_time_dispatching/CMakeLists.txt b/examples/lapack/compile_time_dispatching/CMakeLists.txt
deleted file mode 100644
index cc126674f..000000000
--- a/examples/lapack/compile_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-#Build object from all sources
-set(LAPACK_CT_SOURCES "")
-if(ENABLE_MKLCPU_BACKEND AND ENABLE_CUSOLVER_BACKEND)
-  list(APPEND LAPACK_CT_SOURCES "getrs_usm_mklcpu_cusolver")
-endif()
-
-if(domain STREQUAL "lapack" AND ENABLE_MKLCPU_BACKEND)
-  find_library(OPENCL_LIBRARY NAMES OpenCL)
-  message(STATUS "Found OpenCL: ${OPENCL_LIBRARY}")
-endif()
-
-foreach(lapack_ct_source ${LAPACK_CT_SOURCES})
-  add_executable(example_${domain}_${lapack_ct_source} ${lapack_ct_source}.cpp)
-  target_include_directories(example_${domain}_${lapack_ct_source}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-  if(domain STREQUAL "lapack" AND ENABLE_MKLCPU_BACKEND AND ENABLE_CUSOLVER_BACKEND)
-    add_dependencies(example_${domain}_${lapack_ct_source} onemkl_${domain}_mklcpu onemkl_${domain}_cusolver)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_mklcpu onemkl_${domain}_cusolver)
-    target_link_libraries(example_${domain}_${lapack_ct_source} PUBLIC ${OPENCL_LIBRARY})
-  endif()
-  target_link_libraries(example_${domain}_${lapack_ct_source} PUBLIC
-      ${ONEMKL_LIBRARIES_${domain}}
-      ONEMKL::SYCL::SYCL
-  )
-  # Register example as ctest
- add_test(NAME ${domain}/EXAMPLE/CT/${lapack_ct_source} COMMAND example_${domain}_${lapack_ct_source})
-endforeach(lapack_ct_source)
diff --git a/examples/lapack/compile_time_dispatching/getrs_usm_mklcpu_cusolver.cpp b/examples/lapack/compile_time_dispatching/getrs_usm_mklcpu_cusolver.cpp
deleted file mode 100644
index 2d6017d08..000000000
--- a/examples/lapack/compile_time_dispatching/getrs_usm_mklcpu_cusolver.cpp
+++ /dev/null
@@ -1,333 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of oneapi::mkl::lapack::getrf and
-*       oneapi::mkl::lapack::getrs to perform LU factorization and compute
-*       the solution on both an Intel CPU device and NVIDIA GPU device.
-*
-*       This example demonstrates only single precision (float) data type
-*       for matrix data
-*
-*******************************************************************************/
-
-// STL includes
-#include <algorithm>
-#include <cstdlib>
-#include <iostream>
-#include <vector>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-// local includes
-#include "example_helper.hpp"
-
-//
-// Main example for LU consisting of initialization of
-// a general dense A matrix.
-// Then the LU factorization
-// A = P * L * U
-// is performed followed by solving a system of linear
-// equations using the computed LU factorization, with
-// multiple right-hand sides.
-// Finally the results are post processed.
-//
-
-void run_getrs_example(const sycl::device& cpu_device, const sycl::device& gpu_device) {
-    // Matrix sizes and leading dimensions
-    std::int64_t m = 23;
-    std::int64_t n = 23;
-    std::int64_t nrhs = 23;
-    std::int64_t lda = 32;
-    std::int64_t ldb = 32;
-    std::int64_t A_size = n * lda;
-    std::int64_t B_size = nrhs * ldb;
-    std::int64_t ipiv_size = n;
-    oneapi::mkl::transpose trans = oneapi::mkl::transpose::nontrans;
-
-    // Catch asynchronous exceptions for CPU and GPU
-    auto cpu_error_handler = [&](sycl::exception_list exceptions) {
-        for (auto const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (oneapi::mkl::lapack::exception const& e) {
-                // Handle LAPACK related exceptions that happened during asynchronous call
-                std::cerr
-                    << "Caught asynchronous LAPACK exception on CPU device during GETRF or GETRS:"
-                    << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-                std::cerr << "\tinfo: " << e.info() << std::endl;
-            }
-            catch (sycl::exception const& e) {
-                // Handle not LAPACK related exceptions that happened during asynchronous call
-                std::cerr
-                    << "Caught asynchronous SYCL exception on CPU device during GETRF or GETRS:"
-                    << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-    auto gpu_error_handler = [&](sycl::exception_list exceptions) {
-        for (auto const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (oneapi::mkl::lapack::exception const& e) {
-                // Handle LAPACK related exceptions that happened during asynchronous call
-                std::cerr
-                    << "Caught asynchronous LAPACK exception on GPU device during GETRF or GETRS:"
-                    << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-                std::cerr << "\tinfo: " << e.info() << std::endl;
-            }
-            catch (sycl::exception const& e) {
-                // Handle not LAPACK related exceptions that happened during asynchronous call
-                std::cerr
-                    << "Caught asynchronous SYCL exception on GPU device during GETRF or GETRS:"
-                    << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    //
-    // Data preparation on host
-    //
-    std::vector<float> A(A_size);
-    std::vector<float> B(B_size);
-    std::vector<float> result_cpu(B_size);
-    std::vector<float> result_gpu(B_size);
-    std::fill(A.begin(), A.end(), 0);
-    std::fill(B.begin(), B.end(), 0);
-    std::fill(result_cpu.begin(), result_cpu.end(), 0);
-    std::fill(result_gpu.begin(), result_gpu.end(), 0);
-
-    rand_matrix(A, trans, m, n, lda);
-    rand_matrix(B, trans, n, nrhs, ldb);
-
-    //
-    // Preparation on CPU
-    //
-    sycl::queue cpu_queue(cpu_device, cpu_error_handler);
-    sycl::context cpu_context = cpu_queue.get_context();
-    sycl::event cpu_getrf_done;
-    sycl::event cpu_getrs_done;
-
-    float* cpu_A = sycl::malloc_device<float>(A_size * sizeof(float), cpu_queue);
-    float* cpu_B = sycl::malloc_device<float>(B_size * sizeof(float), cpu_queue);
-    std::int64_t* cpu_ipiv =
-        sycl::malloc_device<std::int64_t>(ipiv_size * sizeof(std::int64_t), cpu_queue);
-
-    std::int64_t cpu_getrf_scratchpad_size = oneapi::mkl::lapack::getrf_scratchpad_size<float>(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue }, m, n, lda);
-    std::int64_t cpu_getrs_scratchpad_size = oneapi::mkl::lapack::getrs_scratchpad_size<float>(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue }, trans, n, nrhs,
-        lda, ldb);
-    float* cpu_getrf_scratchpad = sycl::malloc_device<float>(
-        cpu_getrf_scratchpad_size * sizeof(float), cpu_device, cpu_context);
-    float* cpu_getrs_scratchpad = sycl::malloc_device<float>(
-        cpu_getrs_scratchpad_size * sizeof(float), cpu_device, cpu_context);
-    if (!cpu_A || !cpu_B || !cpu_ipiv || !cpu_getrf_scratchpad || !cpu_getrs_scratchpad) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-
-    // copy data from host to CPU device
-    cpu_queue.memcpy(cpu_A, A.data(), A_size * sizeof(float)).wait();
-    cpu_queue.memcpy(cpu_B, B.data(), B_size * sizeof(float)).wait();
-
-    //
-    // Preparation on GPU
-    //
-    sycl::queue gpu_queue(gpu_device, gpu_error_handler);
-    sycl::context gpu_context = gpu_queue.get_context();
-    sycl::event gpu_getrf_done;
-    sycl::event gpu_getrs_done;
-
-    float* gpu_A = sycl::malloc_device<float>(A_size * sizeof(float), gpu_queue);
-    float* gpu_B = sycl::malloc_device<float>(B_size * sizeof(float), gpu_queue);
-    std::int64_t* gpu_ipiv =
-        sycl::malloc_device<std::int64_t>(ipiv_size * sizeof(std::int64_t), gpu_queue);
-
-    std::int64_t gpu_getrf_scratchpad_size = oneapi::mkl::lapack::getrf_scratchpad_size<float>(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::cusolver>{ gpu_queue }, m, n, lda);
-    std::int64_t gpu_getrs_scratchpad_size = oneapi::mkl::lapack::getrs_scratchpad_size<float>(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::cusolver>{ gpu_queue }, trans, n, nrhs,
-        lda, ldb);
-    float* gpu_getrf_scratchpad = sycl::malloc_device<float>(
-        gpu_getrf_scratchpad_size * sizeof(float), gpu_device, gpu_context);
-    float* gpu_getrs_scratchpad = sycl::malloc_device<float>(
-        gpu_getrs_scratchpad_size * sizeof(float), gpu_device, gpu_context);
-    if (!gpu_A || !gpu_B || !gpu_ipiv || !gpu_getrf_scratchpad) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-
-    // copy data from host to CPU device
-    gpu_queue.memcpy(gpu_A, A.data(), A_size * sizeof(float)).wait();
-    gpu_queue.memcpy(gpu_B, B.data(), B_size * sizeof(float)).wait();
-
-    //
-    // Execute on CPU and GPU devices
-    //
-
-    cpu_getrf_done = oneapi::mkl::lapack::getrf(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue }, m, n, cpu_A, lda,
-        cpu_ipiv, cpu_getrf_scratchpad, cpu_getrf_scratchpad_size);
-    cpu_getrs_done = oneapi::mkl::lapack::getrs(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue }, trans, n, nrhs,
-        cpu_A, lda, cpu_ipiv, cpu_B, ldb, cpu_getrs_scratchpad, cpu_getrs_scratchpad_size,
-        { cpu_getrf_done });
-    gpu_getrf_done = oneapi::mkl::lapack::getrf(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::cusolver>{ gpu_queue }, m, n, gpu_A,
-        lda, gpu_ipiv, gpu_getrf_scratchpad, gpu_getrf_scratchpad_size);
-    gpu_getrs_done = oneapi::mkl::lapack::getrs(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::cusolver>{ gpu_queue }, trans, n, nrhs,
-        gpu_A, lda, gpu_ipiv, gpu_B, ldb, gpu_getrs_scratchpad, gpu_getrs_scratchpad_size,
-        { gpu_getrf_done });
-
-    // Wait until calculations are done
-    cpu_queue.wait_and_throw();
-    gpu_queue.wait_and_throw();
-
-    //
-    // Post Processing
-    //
-    // copy data from CPU device back to host
-    cpu_queue.memcpy(result_cpu.data(), cpu_B, B_size * sizeof(float)).wait_and_throw();
-
-    // copy data from GPU device back to host
-    gpu_queue.memcpy(result_gpu.data(), gpu_B, B_size * sizeof(float)).wait_and_throw();
-
-    // Print results
-    std::cout << "\n\t\tGETRF and GETRS parameters:" << std::endl;
-    std::cout << "\t\t\ttrans = "
-              << (trans == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (trans == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << std::endl;
-    std::cout << "\t\t\tm = " << m << ", n = " << n << ", nrhs = " << nrhs << std::endl;
-    std::cout << "\t\t\tlda = " << lda << ", ldb = " << ldb << std::endl;
-
-    std::cout << "\n\t\tOutputting 2x2 block of A and X matrices:" << std::endl;
-    // output the top 2x2 block of A matrix
-    print_2x2_matrix_values(A.data(), lda, "A");
-
-    // output the top 2x2 block of X matrix from CPU
-    print_2x2_matrix_values(result_cpu.data(), ldb, "(CPU) X");
-
-    // output the top 2x2 block of X matrix from GPU
-    print_2x2_matrix_values(result_gpu.data(), ldb, "(GPU) X");
-
-    sycl::free(gpu_getrs_scratchpad, gpu_queue);
-    sycl::free(gpu_getrf_scratchpad, gpu_queue);
-    sycl::free(gpu_ipiv, gpu_queue);
-    sycl::free(gpu_B, gpu_queue);
-    sycl::free(gpu_A, gpu_queue);
-
-    sycl::free(cpu_getrs_scratchpad, cpu_queue);
-    sycl::free(cpu_getrf_scratchpad, cpu_queue);
-    sycl::free(cpu_ipiv, cpu_queue);
-    sycl::free(cpu_B, cpu_queue);
-    sycl::free(cpu_A, cpu_queue);
-}
-
-//
-// Description of example setup, apis used and supported floating point type precisions
-//
-
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << "# LU Factorization and Solve Example: " << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Computes LU Factorization A = P * L * U" << std::endl;
-    std::cout << "# and uses it to solve for X in a system of linear equations:" << std::endl;
-    std::cout << "#   AX = B" << std::endl;
-    std::cout << "# where A is a general dense matrix and B is a matrix whose columns" << std::endl;
-    std::cout << "# are the right-hand sides for the systems of equations." << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using apis:" << std::endl;
-    std::cout << "#   getrf and getrs" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Running on both Intel CPU and NVIDIA GPU devices" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-int main(int argc, char** argv) {
-    print_example_banner();
-
-    try {
-        sycl::device cpu_dev((sycl::cpu_selector()));
-        sycl::device gpu_dev((sycl::gpu_selector()));
-
-        unsigned int vendor_id = gpu_dev.get_info<sycl::info::device::vendor_id>();
-        if (vendor_id != NVIDIA_ID) {
-            std::cerr << "FAILED: NVIDIA GPU device not found." << std::endl;
-            return 1;
-        }
-        std::cout << "Running LAPACK GETRS USM example" << std::endl;
-        std::cout << "Running with single precision real data type on:" << std::endl;
-        std::cout << "\tCPU device :" << cpu_dev.get_info<sycl::info::device::name>() << std::endl;
-        std::cout << "\tGPU device :" << gpu_dev.get_info<sycl::info::device::name>() << std::endl;
-
-        run_getrs_example(cpu_dev, gpu_dev);
-        std::cout << "LAPACK GETRS USM example ran OK on MKLCPU and CUSOLVER" << std::endl;
-    }
-    catch (oneapi::mkl::lapack::exception const& e) {
-        // Handle LAPACK related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous LAPACK exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tinfo: " << e.info() << std::endl;
-        return 1;
-    }
-    catch (sycl::exception const& e) {
-        // Handle not LAPACK related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        // Handle not SYCL related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous std::exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/examples/lapack/run_time_dispatching/CMakeLists.txt b/examples/lapack/run_time_dispatching/CMakeLists.txt
deleted file mode 100644
index 5fcf6a311..000000000
--- a/examples/lapack/run_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# NOTE: user needs to set env var ONEAPI_DEVICE_SELECTOR to use runtime example without specifying backend in CMake
-
-# Build object from all example sources
-set(LAPACK_RT_SOURCES "getrs_usm")
-
-# Set up for the right backend for run-time dispatching examples
-# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
-# overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
-set(DEVICE_FILTERS "")
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "opencl:cpu")
-endif()
-if(ENABLE_MKLGPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "level_zero:gpu")
-endif()
-if(ENABLE_CUSOLVER_BACKEND)
-  list(APPEND DEVICE_FILTERS "cuda:gpu")
-endif()
-if(ENABLE_ROCSOLVER_BACKEND)
-  list(APPEND DEVICE_FILTERS "hip:gpu")
-endif()
-
-message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
-
-foreach(lapack_rt_source ${LAPACK_RT_SOURCES})
-  add_executable(example_${domain}_${lapack_rt_source} ${lapack_rt_source}.cpp)
-  target_include_directories(example_${domain}_${lapack_rt_source}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  add_dependencies(example_${domain}_${lapack_rt_source} onemkl)
-
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET example_${domain}_${lapack_rt_source} SOURCES ${LAPACK_RT_SOURCES})
-  endif()
-
-  target_link_libraries(example_${domain}_${lapack_rt_source} PUBLIC
-      onemkl
-      ONEMKL::SYCL::SYCL
-      ${CMAKE_DL_LIBS}
-  )
-
-  foreach(device_filter ${DEVICE_FILTERS})
-    # Register example as ctest
-    add_test(NAME ${domain}/EXAMPLE/RT/${lapack_rt_source}/${device_filter} COMMAND example_${domain}_${lapack_rt_source})
-    set_property(TEST ${domain}/EXAMPLE/RT/${lapack_rt_source}/${device_filter} PROPERTY
-      ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
-      ENVIRONMENT ONEAPI_DEVICE_SELECTOR=${device_filter})
-  endforeach(device_filter)
-
-endforeach(lapack_rt_source)
diff --git a/examples/lapack/run_time_dispatching/getrs_usm.cpp b/examples/lapack/run_time_dispatching/getrs_usm.cpp
deleted file mode 100644
index 4cf851a7e..000000000
--- a/examples/lapack/run_time_dispatching/getrs_usm.cpp
+++ /dev/null
@@ -1,260 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of oneapi::mkl::lapack::getrf and
-*       oneapi::mkl::lapack::getrs to perform LU factorization and compute
-*       the solution on a SYCL device (HOST, CPU, GPU) that is selected
-*       during runtime.
-*
-*       This example demonstrates only single precision (float) data type
-*       for matrix data
-*
-*******************************************************************************/
-
-// STL includes
-#include <algorithm>
-#include <cstdlib>
-#include <iostream>
-#include <vector>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-// local includes
-#include "example_helper.hpp"
-
-//
-// Main example for LU consisting of initialization of
-// a general dense A matrix.
-// Then the LU factorization
-// A = P * L * U
-// is performed followed by solving a system of linear
-// equations using the computed LU factorization, with
-// multiple right-hand sides.
-// Finally the results are post processed.
-//
-
-void run_getrs_example(const sycl::device& device) {
-    // Matrix sizes and leading dimensions
-    std::int64_t m = 23;
-    std::int64_t n = 23;
-    std::int64_t nrhs = 23;
-    std::int64_t lda = 32;
-    std::int64_t ldb = 32;
-    std::int64_t A_size = n * lda;
-    std::int64_t B_size = nrhs * ldb;
-    std::int64_t ipiv_size = n;
-    oneapi::mkl::transpose trans = oneapi::mkl::transpose::nontrans;
-
-    // Asynchronous error handler
-    auto error_handler = [&](sycl::exception_list exceptions) {
-        for (auto const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (oneapi::mkl::lapack::exception const& e) {
-                // Handle LAPACK related exceptions that happened during asynchronous call
-                std::cerr << "Caught asynchronous LAPACK exception during GETRF or GETRS:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-                std::cerr << "\tinfo: " << e.info() << std::endl;
-            }
-            catch (sycl::exception const& e) {
-                // Handle not LAPACK related exceptions that happened during asynchronous call
-                std::cerr << "Caught asynchronous SYCL exception during GETRF or GETRS:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    // Data preparation on host
-    std::vector<float> A(A_size);
-    std::vector<float> B(B_size);
-    std::fill(A.begin(), A.end(), 0);
-    std::fill(B.begin(), B.end(), 0);
-
-    rand_matrix(A, trans, m, n, lda);
-    rand_matrix(B, trans, n, nrhs, ldb);
-
-    // Data preparation on selected device
-    sycl::queue queue(device, error_handler);
-    sycl::context context = queue.get_context();
-    sycl::event getrf_done;
-    sycl::event getrs_done;
-
-    float* dev_A = sycl::malloc_device<float>(A_size * sizeof(float), queue);
-    float* dev_B = sycl::malloc_device<float>(B_size * sizeof(float), queue);
-    std::int64_t* dev_ipiv =
-        sycl::malloc_device<std::int64_t>(ipiv_size * sizeof(std::int64_t), queue);
-
-    std::int64_t getrf_scratchpad_size =
-        oneapi::mkl::lapack::getrf_scratchpad_size<float>(queue, m, n, lda);
-    std::int64_t getrs_scratchpad_size =
-        oneapi::mkl::lapack::getrs_scratchpad_size<float>(queue, trans, n, nrhs, lda, ldb);
-    float* getrf_scratchpad =
-        sycl::malloc_shared<float>(getrf_scratchpad_size * sizeof(float), device, context);
-    float* getrs_scratchpad =
-        sycl::malloc_shared<float>(getrs_scratchpad_size * sizeof(float), device, context);
-    if (!dev_A || !dev_B || !dev_ipiv) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-    // Skip checking getrf scratchpad memory allocation on rocsolver because with rocsolver
-    // backend getrf does not use scrachpad memory
-    if (device.is_cpu() || device.get_info<sycl::info::device::vendor_id>() != AMD_ID) {
-        if (!getrf_scratchpad) {
-            throw std::runtime_error("Failed to allocate USM memory.");
-        }
-    }
-    // Skip checking getrs scratchpad memory allocation on cusolver/rocsolver because with
-    // cusolver/rocsolver backend getrs does not use scrachpad memory
-    if (device.is_cpu() || (device.get_info<sycl::info::device::vendor_id>() != NVIDIA_ID &&
-                            device.get_info<sycl::info::device::vendor_id>() != AMD_ID)) {
-        if (!getrs_scratchpad) {
-            throw std::runtime_error("Failed to allocate USM memory.");
-        }
-    }
-
-    // copy data from host to device
-    queue.memcpy(dev_A, A.data(), A_size * sizeof(float)).wait();
-    queue.memcpy(dev_B, B.data(), B_size * sizeof(float)).wait();
-
-    // Execute on device
-    getrf_done = oneapi::mkl::lapack::getrf(queue, m, n, dev_A, lda, dev_ipiv, getrf_scratchpad,
-                                            getrf_scratchpad_size);
-    getrs_done =
-        oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, dev_A, lda, dev_ipiv, dev_B, ldb,
-                                   getrs_scratchpad, getrs_scratchpad_size, { getrf_done });
-
-    // Wait until calculations are done
-    queue.wait_and_throw();
-
-    // Copy data from device back to host
-    queue.memcpy(B.data(), dev_B, B_size * sizeof(float)).wait_and_throw();
-
-    // Print results
-    std::cout << "\n\t\tGETRF and GETRS parameters:" << std::endl;
-    std::cout << "\t\t\ttrans = "
-              << (trans == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (trans == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << std::endl;
-    std::cout << "\t\t\tm = " << m << ", n = " << n << ", nrhs = " << nrhs << std::endl;
-    std::cout << "\t\t\tlda = " << lda << ", ldb = " << ldb << std::endl;
-
-    std::cout << "\n\t\tOutputting 2x2 block of A and X matrices:" << std::endl;
-    // output the top 2x2 block of A matrix
-    print_2x2_matrix_values(A.data(), lda, "A");
-
-    // output the top 2x2 block of X matrix
-    print_2x2_matrix_values(B.data(), ldb, "X");
-
-    sycl::free(getrs_scratchpad, queue);
-    sycl::free(getrf_scratchpad, queue);
-    sycl::free(dev_ipiv, queue);
-    sycl::free(dev_B, queue);
-    sycl::free(dev_A, queue);
-}
-
-//
-// Description of example setup, apis used and supported floating point type precisions
-//
-
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << "# LU Factorization and Solve Example: " << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Computes LU Factorization A = P * L * U" << std::endl;
-    std::cout << "# and uses it to solve for X in a system of linear equations:" << std::endl;
-    std::cout << "#   AX = B" << std::endl;
-    std::cout << "# where A is a general dense matrix and B is a matrix whose columns" << std::endl;
-    std::cout << "# are the right-hand sides for the systems of equations." << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using apis:" << std::endl;
-    std::cout << "#   getrf and getrs" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Device will be selected during runtime." << std::endl;
-    std::cout << "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify"
-              << std::endl;
-    std::cout << "# available devices" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-int main(int argc, char** argv) {
-    print_example_banner();
-
-    try {
-        sycl::device dev = sycl::device();
-        if (dev.is_gpu()) {
-            std::cout << "Running LAPACK getrs example on GPU device." << std::endl;
-            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        else {
-            std::cout << "Running LAPACK getrs example on CPU device." << std::endl;
-            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-
-        std::cout << "Running with single precision real data type:" << std::endl;
-        run_getrs_example(dev);
-        std::cout << "LAPACK GETRS USM example ran OK" << std::endl;
-    }
-    catch (oneapi::mkl::lapack::exception const& e) {
-        // Handle LAPACK related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous LAPACK exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tinfo: " << e.info() << std::endl;
-        return 1;
-    }
-    catch (sycl::exception const& e) {
-        // Handle not LAPACK related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        // Handle not SYCL related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous std::exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/examples/rng/CMakeLists.txt b/examples/rng/CMakeLists.txt
deleted file mode 100644
index b2890bf19..000000000
--- a/examples/rng/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Note: compile-time example uses both MKLCPU and CURAND backends, therefore
-# cmake in the sub-directory will only build it if CURAND backend is enabled
-add_subdirectory(compile_time_dispatching)
-add_subdirectory(device)
-
-# runtime compilation is only possible with dynamic libraries
-if (BUILD_SHARED_LIBS)
-  add_subdirectory(run_time_dispatching)
-endif()
diff --git a/examples/rng/compile_time_dispatching/CMakeLists.txt b/examples/rng/compile_time_dispatching/CMakeLists.txt
deleted file mode 100644
index 4f57db38c..000000000
--- a/examples/rng/compile_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-#Build object from all sources
-set(RNG_CT_SOURCES "")
-if(ENABLE_MKLCPU_BACKEND AND ENABLE_CURAND_BACKEND)
-  list(APPEND RNG_CT_SOURCES "uniform_usm_mklcpu_curand")
-endif()
-
-foreach(rng_ct_source ${RNG_CT_SOURCES})
-  add_executable(example_${domain}_${rng_ct_source} ${rng_ct_source}.cpp)
-  target_include_directories(example_${domain}_${rng_ct_source}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  if(domain STREQUAL "rng" AND ENABLE_MKLCPU_BACKEND AND ENABLE_CURAND_BACKEND)
-    add_dependencies(example_${domain}_${rng_ct_source}
-        onemkl_${domain}_mklcpu
-        onemkl_${domain}_curand)
-    list(APPEND ONEMKL_LIBRARIES_${domain}
-        onemkl_${domain}_mklcpu
-        onemkl_${domain}_curand)
-  endif()
-
-  target_link_libraries(example_${domain}_${rng_ct_source} PUBLIC
-      ${ONEMKL_LIBRARIES_${domain}}
-      ONEMKL::SYCL::SYCL
-  )
-
-  # Register example as ctest
-  add_test(NAME ${domain}/EXAMPLE/CT/${rng_ct_source} COMMAND example_${domain}_${rng_ct_source})
-endforeach(rng_ct_source)
diff --git a/examples/rng/compile_time_dispatching/uniform_usm_mklcpu_curand.cpp b/examples/rng/compile_time_dispatching/uniform_usm_mklcpu_curand.cpp
deleted file mode 100644
index cdfd6c765..000000000
--- a/examples/rng/compile_time_dispatching/uniform_usm_mklcpu_curand.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of DPC++ API oneapi::mkl::rng::uniform distribution
-*       with oneapi::mkl::rng::philox4x32x10 random number generator to produce
-*       random numbers on a INTEL CPU SYCL device and an NVIDIA GPU SYCL device
-*       with Unified Shared Memory(USM) API.
-*
-*       This example demonstrates only single precision (float) data type
-*       for random numbers
-*
-*******************************************************************************/
-
-// stl includes
-#include <algorithm>
-#include <cstdlib>
-#include <iostream>
-#include <vector>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-// local includes
-#include "example_helper.hpp"
-
-//
-// Main example for Uniform random number generation consisting of
-// initialization of random number engine philox4x32x10 object, distribution
-// object. Then random number generation performed and
-// the output is post-processed and validated.
-//
-void run_uniform_example(const sycl::device& cpu_dev, const sycl::device& gpu_dev) {
-    //
-    // Initialization
-    //
-    // example parameters defines
-    constexpr std::uint64_t seed = 777;
-    constexpr std::size_t n = 1000;
-    constexpr std::size_t n_print = 10;
-    constexpr std::size_t alignment = 64;
-
-    // Catch asynchronous exceptions for CPU and GPU
-    auto cpu_exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                std::cerr << "Caught asynchronous SYCL exception on CPU device during generation:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-    auto gpu_exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                std::cerr << "Caught asynchronous SYCL exception on GPU device during generation:"
-                          << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    // set scalar Type values
-    float a(0.0);
-    float b(10.0);
-
-    // preparation on CPU device and GPU device
-    sycl::queue cpu_queue(cpu_dev, cpu_exception_handler);
-    sycl::queue gpu_queue(gpu_dev, gpu_exception_handler);
-    oneapi::mkl::rng::default_engine cpu_engine(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue }, seed);
-    oneapi::mkl::rng::default_engine gpu_engine(
-        oneapi::mkl::backend_selector<oneapi::mkl::backend::curand>{ gpu_queue }, seed);
-
-    oneapi::mkl::rng::uniform<float> distribution(a, b);
-
-    //
-    // Data preparation on host: prepare array for random numbers
-    //
-    std::vector<float> r_cpu(n);
-    std::vector<float> r_gpu(n);
-    std::fill(r_cpu.begin(), r_cpu.end(), 0);
-    std::fill(r_gpu.begin(), r_gpu.end(), 0);
-
-    //
-    // Data preparation on CPU device and GPU device
-    //
-    float* dev_cpu = sycl::malloc_device<float>(n * sizeof(float), cpu_queue);
-    float* dev_gpu = sycl::malloc_device<float>(n * sizeof(float), gpu_queue);
-    if (!dev_cpu || !dev_gpu) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-
-    //
-    // Perform generation on CPU device and GPU device
-    //
-    sycl::event event_out_cpu;
-    sycl::event event_out_gpu;
-    event_out_cpu = oneapi::mkl::rng::generate(distribution, cpu_engine, n, dev_cpu);
-    event_out_gpu = oneapi::mkl::rng::generate(distribution, gpu_engine, n, dev_gpu);
-    event_out_cpu.wait_and_throw();
-    event_out_gpu.wait_and_throw();
-
-    //
-    // Post Processing
-    //
-
-    // copy data from CPU device and GPU device back to host
-    cpu_queue.memcpy(r_cpu.data(), dev_cpu, n * sizeof(float)).wait_and_throw();
-    gpu_queue.memcpy(r_gpu.data(), dev_gpu, n * sizeof(float)).wait_and_throw();
-
-    std::cout << "\t\tgeneration parameters:" << std::endl;
-    std::cout << "\t\t\tseed = " << seed << ", a = " << a << ", b = " << b << std::endl;
-
-    std::cout << "\t\tOutput of generator on CPU device:" << std::endl;
-    std::cout << "\t\t\tfirst " << n_print << " numbers of " << n << ": " << std::endl;
-    for (int i = 0; i < n_print; i++) {
-        std::cout << r_cpu.at(i) << " ";
-    }
-    std::cout << std::endl;
-
-    std::cout << "\t\tOutput of generator on GPU device:" << std::endl;
-    std::cout << "\t\t\tfirst " << n_print << " numbers of " << n << ": " << std::endl;
-    for (int i = 0; i < n_print; i++) {
-        std::cout << r_gpu.at(i) << " ";
-    }
-    std::cout << std::endl;
-
-    sycl::free(dev_gpu, gpu_queue);
-    sycl::free(dev_cpu, cpu_queue);
-}
-
-//
-// Description of example setup, APIs used and supported floating point type precisions
-//
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout
-        << "# Generate uniformly distributed random numbers with philox4x32x10\n# generator example: "
-        << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using APIs:" << std::endl;
-    std::cout << "#   default_engine uniform" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Running on both Intel CPU and Nvidia GPU devices" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-
-int main(int argc, char** argv) {
-    print_example_banner();
-    try {
-        sycl::device cpu_dev((sycl::cpu_selector()));
-        sycl::device gpu_dev((sycl::gpu_selector()));
-
-        unsigned int vendor_id = gpu_dev.get_info<sycl::info::device::vendor_id>();
-        if (vendor_id != NVIDIA_ID) {
-            std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
-            return 1;
-        }
-        std::cout << "Running RNG uniform usm example" << std::endl;
-        std::cout << "Running with single precision real data type:" << std::endl;
-        std::cout << "\tCPU device: " << cpu_dev.get_info<sycl::info::device::name>() << std::endl;
-        std::cout << "\tGPU device: " << gpu_dev.get_info<sycl::info::device::name>() << std::endl;
-
-        run_uniform_example(cpu_dev, gpu_dev);
-        std::cout
-            << "Random number generator example with uniform distribution ran OK on MKLCPU and CURAND"
-            << std::endl;
-    }
-    catch (sycl::exception const& e) {
-        std::cerr << "Caught synchronous SYCL exception during generation:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        std::cerr << "Caught std::exception during generation:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/examples/rng/device/CMakeLists.txt b/examples/rng/device/CMakeLists.txt
deleted file mode 100644
index 1b6ecf2dd..000000000
--- a/examples/rng/device/CMakeLists.txt
+++ /dev/null
@@ -1,74 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# NOTE: user needs to set env var ONEAPI_DEVICE_SELECTOR to use runtime example (no need to specify backend when building with CMake)
-
-# Build object from all example sources
-set(RNG_DEVICE_SOURCES "uniform")
-
-# Set up for the right backend for run-time dispatching examples
-# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
-# overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
-set(DEVICE_FILTERS "")
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "opencl:cpu")
-endif()
-# RNG only supports mklcpu backend on Windows
-if(ENABLE_MKLGPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "level_zero:gpu")
-endif()
-if(ENABLE_CURAND_BACKEND)
-  list(APPEND DEVICE_FILTERS "cuda:gpu")
-endif()
-if(ENABLE_ROCRAND_BACKEND)
-  list(APPEND DEVICE_FILTERS "hip:gpu")
-endif()
-
-message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
-
-foreach(rng_device_source ${RNG_DEVICE_SOURCES})
-  add_executable(example_${domain}_${rng_device_source} ${rng_device_source}.cpp)
-  target_include_directories(example_${domain}_${rng_device_source}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/rng/device/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET example_${domain}_${rng_device_source} SOURCES ${RNG_DEVICE_SOURCES})
-  endif()
-
-  target_link_libraries(example_${domain}_${rng_device_source} PUBLIC
-      ONEMKL::SYCL::SYCL
-  )
-
-  if(NOT ${ONEMKL_SYCL_IMPLEMENTATION} STREQUAL "hipsycl")
-    target_link_options(example_${domain}_${rng_device_source} PUBLIC -fsycl -fsycl-device-code-split=per_kernel)
-  endif()
-
-  # Register example as ctest
-  foreach(device_filter ${DEVICE_FILTERS})
-    add_test(NAME ${domain}/EXAMPLE/DEVICE/${rng_device_source}/${device_filter} COMMAND example_${domain}_${rng_device_source})
-    set_property(TEST ${domain}/EXAMPLE/DEVICE/${rng_device_source}/${device_filter} PROPERTY
-      ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
-      ENVIRONMENT ONEAPI_DEVICE_SELECTOR=${device_filter})
-  endforeach(device_filter)
-
-endforeach()
diff --git a/examples/rng/device/include/rng_example_helper.hpp b/examples/rng/device/include/rng_example_helper.hpp
deleted file mode 100644
index 0bcf114b4..000000000
--- a/examples/rng/device/include/rng_example_helper.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_EXAMPLE_HELPER_HPP__
-#define _RNG_EXAMPLE_HELPER_HPP__
-
-template <typename T, typename = void>
-struct has_member_code_meta : std::false_type {};
-
-template <typename T>
-struct has_member_code_meta<T, std::void_t<decltype(std::declval<T>().get_multi_ptr())>>
-        : std::true_type {};
-
-template <typename T, typename std::enable_if<has_member_code_meta<T>::value>::type* = nullptr>
-auto get_multi_ptr(T acc) {
-// Workaround for AdaptiveCPP, as they do not yet support the get_multi_ptr function
-#ifndef __HIPSYCL__
-    return acc.get_multi_ptr();
-#else
-    return acc.get_pointer();
-#endif
-};
-
-template <typename T, typename std::enable_if<!has_member_code_meta<T>::value>::type* = nullptr>
-auto get_multi_ptr(T acc) {
-// Workaround for AdaptiveCPP, as they do not yet support the get_multi_ptr function
-#ifndef __HIPSYCL__
-    return acc.template get_multi_ptr<sycl::access::decorated::yes>();
-#else
-    return acc.get_pointer();
-#endif
-};
-
-#endif // _RNG_EXAMPLE_HELPER_HPP__
diff --git a/examples/rng/device/uniform.cpp b/examples/rng/device/uniform.cpp
deleted file mode 100644
index a1c097bba..000000000
--- a/examples/rng/device/uniform.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates usage of oneapi::mkl::rng::device::mcg59
-*       random number generator to produce random
-*       numbers using unifrom distribution on a SYCL device (CPU, GPU).
-*
-*******************************************************************************/
-
-// stl includes
-#include <iostream>
-#include <vector>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/rng/device.hpp"
-
-#include "rng_example_helper.hpp"
-
-bool isDoubleSupported(sycl::device my_dev) {
-    return my_dev.get_info<sycl::info::device::double_fp_config>().size() != 0;
-}
-
-// example parameters
-constexpr std::uint64_t seed = 777;
-constexpr std::size_t n = 1024;
-constexpr int n_print = 10;
-
-//
-// example show usage of rng device functionality, which can be called from both
-// host and device sides with scalar and vector generation
-//
-template <typename Type, std::int32_t VecSize>
-int run_example(sycl::queue& queue) {
-    if (VecSize == 1) {
-        std::cout << "\tRunning scalar example" << std::endl;
-    }
-    else {
-        std::cout << "\tRunning vector example with " << VecSize << " vector size" << std::endl;
-    }
-    // prepare array for random numbers
-    std::vector<Type> r_dev(n);
-
-    // submit a kernel to generate on device
-    {
-        sycl::buffer<Type> r_buf(r_dev.data(), r_dev.size());
-
-        try {
-            queue.submit([&](sycl::handler& cgh) {
-                sycl::accessor r_acc(r_buf, cgh, sycl::write_only);
-                cgh.parallel_for(sycl::range<1>(n / VecSize), [=](sycl::item<1> item) {
-                    size_t item_id = item.get_id(0);
-                    oneapi::mkl::rng::device::mcg59<VecSize> engine(seed, item_id * VecSize);
-                    oneapi::mkl::rng::device::uniform<Type> distr;
-
-                    auto res = oneapi::mkl::rng::device::generate(distr, engine);
-                    if constexpr (VecSize == 1) {
-                        r_acc[item_id] = res;
-                    }
-                    else {
-                        res.store(item_id, get_multi_ptr(r_acc));
-                    }
-                });
-            });
-            queue.wait_and_throw();
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "\t\tSYCL exception\n" << e.what() << std::endl;
-            return 1;
-        }
-
-        std::cout << "\t\tOutput of generator:" << std::endl;
-
-        auto r_acc = sycl::host_accessor(r_buf, sycl::read_only);
-        std::cout << "first " << n_print << " numbers of " << n << ": " << std::endl;
-        for (int i = 0; i < n_print; i++) {
-            std::cout << r_acc[i] << " ";
-        }
-        std::cout << std::endl;
-    } // buffer life-time ends
-
-    // compare results with host-side generation
-    oneapi::mkl::rng::device::mcg59<1> engine(seed);
-    oneapi::mkl::rng::device::uniform<Type> distr;
-
-    int err = 0;
-    Type res_host;
-    for (int i = 0; i < n; i++) {
-        res_host = oneapi::mkl::rng::device::generate(distr, engine);
-        if (res_host != r_dev[i]) {
-            std::cout << "error in " << i << " element " << res_host << " " << r_dev[i]
-                      << std::endl;
-            err++;
-        }
-    }
-    return err;
-}
-
-//
-// description of example setup, APIs used
-//
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << "# Generate uniformly distributed random numbers example: " << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using APIs:" << std::endl;
-    std::cout << "# mcg59 uniform" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-int main() {
-    // Catch asynchronous exceptions
-    auto exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                std::cerr << "Caught asynchronous SYCL exception during generation:" << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    print_example_banner();
-
-    try {
-        sycl::device my_dev = sycl::device();
-
-        if (my_dev.is_gpu()) {
-            std::cout << "Running RNG uniform usm example on GPU device" << std::endl;
-            std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        else {
-            std::cout << "Running RNG uniform usm example on CPU device" << std::endl;
-            std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-
-        sycl::queue queue(my_dev, exception_handler);
-
-        std::cout << "\n\tRunning with single precision real data type:" << std::endl;
-        if (run_example<float, 1>(queue) || run_example<float, 4>(queue)) {
-            std::cout << "FAILED" << std::endl;
-            return 1;
-        }
-        if (isDoubleSupported(my_dev)) {
-            std::cout << "\n\tRunning with double precision real data type:" << std::endl;
-            if (run_example<double, 1>(queue) || run_example<double, 4>(queue)) {
-                std::cout << "FAILED" << std::endl;
-                return 1;
-            }
-        }
-        else {
-            std::cout << "Double precision is not supported for this device" << std::endl;
-        }
-        std::cout << "\n\tRunning with integer data type:" << std::endl;
-        if (run_example<std::int32_t, 1>(queue) || run_example<std::int32_t, 4>(queue)) {
-            std::cout << "FAILED" << std::endl;
-            return 1;
-        }
-        std::cout << "\n\tRunning with unsigned integer data type:" << std::endl;
-        if (run_example<std::uint32_t, 1>(queue) || run_example<std::uint32_t, 4>(queue)) {
-            std::cout << "FAILED" << std::endl;
-            return 1;
-        }
-
-        std::cout << "Random number generator with uniform distribution ran OK" << std::endl;
-    }
-    catch (sycl::exception const& e) {
-        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        std::cerr << "Caught std::exception during generation:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-    return 0;
-}
diff --git a/examples/rng/run_time_dispatching/CMakeLists.txt b/examples/rng/run_time_dispatching/CMakeLists.txt
deleted file mode 100644
index d3bcc0f19..000000000
--- a/examples/rng/run_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-#===============================================================================
-# Copyright 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# NOTE: user needs to set env var ONEAPI_DEVICE_SELECTOR to use runtime example (no need to specify backend when building with CMake)
-
-# Build object from all example sources
-set(RNG_RT_SOURCES "uniform_usm")
-
-# Set up for the right backend for run-time dispatching examples
-# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
-# overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
-set(DEVICE_FILTERS "")
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "opencl:cpu")
-endif()
-# RNG only supports mklcpu backend on Windows
-if(UNIX AND ENABLE_MKLGPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "level_zero:gpu")
-endif()
-if(UNIX AND ENABLE_CURAND_BACKEND)
-  list(APPEND DEVICE_FILTERS "cuda:gpu")
-endif()
-if(UNIX AND ENABLE_ROCRAND_BACKEND)
-  list(APPEND DEVICE_FILTERS "hip:gpu")
-endif()
-
-message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
-
-foreach(rng_rt_source ${RNG_RT_SOURCES})
-  add_executable(example_${domain}_${rng_rt_source} ${rng_rt_source}.cpp)
-  target_include_directories(example_${domain}_${rng_rt_source}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  add_dependencies(example_${domain}_${rng_rt_source} onemkl)
-
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET example_${domain}_${rng_rt_source} SOURCES ${RNG_RT_SOURCES})
-  endif()
-
-  target_link_libraries(example_${domain}_${rng_rt_source} PUBLIC
-      onemkl
-      ONEMKL::SYCL::SYCL
-      ${CMAKE_DL_LIBS}
-  )
-
-  # Register example as ctest
-  foreach(device_filter ${DEVICE_FILTERS})
-    add_test(NAME ${domain}/EXAMPLE/RT/${rng_rt_source}/${device_filter} COMMAND example_${domain}_${rng_rt_source})
-    set_property(TEST ${domain}/EXAMPLE/RT/${rng_rt_source}/${device_filter} PROPERTY
-      ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
-      ENVIRONMENT ONEAPI_DEVICE_SELECTOR=${device_filter})
-  endforeach(device_filter)
-
-endforeach()
diff --git a/examples/rng/run_time_dispatching/uniform_usm.cpp b/examples/rng/run_time_dispatching/uniform_usm.cpp
deleted file mode 100644
index 8ac7363c8..000000000
--- a/examples/rng/run_time_dispatching/uniform_usm.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of DPC++ API oneapi::mkl::rng::uniform distribution
-*       with oneapi::mkl::rng::philox4x32x10 random number generator to produce
-*       random numbers on a SYCL device (HOST, CPU, GPU) that is selected
-*       during runtime with Unified Shared Memory(USM) API.
-*
-*       This example demonstrates only single precision (float) data type
-*       for random numbers
-*
-*******************************************************************************/
-
-// stl includes
-#include <algorithm>
-#include <cstdlib>
-#include <iostream>
-#include <vector>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-// local includes
-#include "example_helper.hpp"
-
-//
-// Main example for Uniform random number generation consisting of
-// initialization of random number engine philox4x32x10 object, distribution
-// object. Then random number generation performed and
-// the output is post-processed and validated.
-//
-void run_uniform_example(const sycl::device& dev) {
-    //
-    // Initialization
-    //
-    // example parameters defines
-    constexpr std::uint64_t seed = 777;
-    constexpr std::size_t n = 1000;
-    constexpr std::size_t n_print = 10;
-    constexpr std::size_t alignment = 64;
-
-    // Catch asynchronous exceptions
-    auto exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                std::cerr << "Caught asynchronous SYCL exception during generation:" << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    sycl::queue queue(dev, exception_handler);
-
-    // set scalar Type values
-    float a(0.0);
-    float b(10.0);
-
-    oneapi::mkl::rng::default_engine engine(queue, seed);
-    oneapi::mkl::rng::uniform<float> distribution(a, b);
-
-    //
-    // Data preparation on host: prepare array for random numbers
-    //
-    std::vector<float> r(n);
-
-    // Data preparation on selected device
-    float* dev_r = sycl::malloc_device<float>(n * sizeof(float), queue);
-    if (!dev_r) {
-        throw std::runtime_error("Failed to allocate USM memory.");
-    }
-
-    //
-    // Perform generation on device
-    //
-    sycl::event event_out;
-    event_out = oneapi::mkl::rng::generate(distribution, engine, n, dev_r);
-    event_out.wait_and_throw();
-
-    //
-    // Post Processing
-    //
-
-    // copy data from device back to host
-    queue.memcpy(r.data(), dev_r, n * sizeof(float)).wait_and_throw();
-
-    std::cout << "\t\tgeneration parameters:" << std::endl;
-    std::cout << "\t\t\tseed = " << seed << ", a = " << a << ", b = " << b << std::endl;
-
-    std::cout << "\t\tOutput of generator:" << std::endl;
-    std::cout << "\t\t\tfirst " << n_print << " numbers of " << n << ": " << std::endl;
-    for (int i = 0; i < n_print; i++) {
-        std::cout << r.at(i) << " ";
-    }
-    std::cout << std::endl;
-
-    sycl::free(dev_r, queue);
-}
-
-//
-// Description of example setup, APIs used and supported floating point type precisions
-//
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout
-        << "# Generate uniformly distributed random numbers with philox4x32x10\n# generator example: "
-        << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using APIs:" << std::endl;
-    std::cout << "#   default_engine uniform" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Device will be selected during runtime." << std::endl;
-    std::cout << "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify"
-              << std::endl;
-    std::cout << "# available devices" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-
-int main(int argc, char** argv) {
-    print_example_banner();
-
-    try {
-        sycl::device my_dev = sycl::device();
-
-        if (my_dev.is_gpu()) {
-            std::cout << "Running RNG uniform usm example on GPU device" << std::endl;
-            std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        else {
-            std::cout << "Running RNG uniform usm example on CPU device" << std::endl;
-            std::cout << "Device name is: " << my_dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        std::cout << "Running with single precision real data type:" << std::endl;
-
-        run_uniform_example(my_dev);
-        std::cout << "Random number generator with uniform distribution ran OK" << std::endl;
-    }
-    catch (sycl::exception const& e) {
-        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        std::cerr << "Caught std::exception during generation:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-    return 0;
-}
diff --git a/examples/sparse_blas/CMakeLists.txt b/examples/sparse_blas/CMakeLists.txt
deleted file mode 100644
index 721512429..000000000
--- a/examples/sparse_blas/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(compile_time_dispatching)
-
-# runtime compilation is only possible with dynamic libraries
-if (BUILD_SHARED_LIBS)
-  add_subdirectory(run_time_dispatching)
-endif()
diff --git a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
deleted file mode 100644
index cb95333b4..000000000
--- a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-#Build object from all sources
-set(SPARSE_BLAS_BACKENDS "")
-
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND SPARSE_BLAS_BACKENDS "mklcpu")
-endif()
-
-include(WarningsUtils)
-
-foreach(backend ${SPARSE_BLAS_BACKENDS})
-  set(EXAMPLE_NAME example_sparse_blas_gemv_usm_${backend})
-  add_executable(${EXAMPLE_NAME} sparse_blas_gemv_usm_${backend}.cpp)
-  target_include_directories(${EXAMPLE_NAME}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  add_dependencies(${EXAMPLE_NAME} onemkl_sparse_blas_${backend})
-  target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_${backend})
-
-  # Register example as ctest
-  add_test(NAME sparse_blas/EXAMPLE/CT/sparse_blas_gemv_usm_${backend} COMMAND ${EXAMPLE_NAME})
-endforeach(backend)
-
diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp
deleted file mode 100644
index edb6d7e1f..000000000
--- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of DPCPP API oneapi::mkl::sparse::gemv
-*       using unified shared memory to perform general sparse matrix-vector
-*       multiplication on a INTEL CPU SYCL device.
-*
-*       y = alpha * op(A) * x + beta * y
-*
-*       where op() is defined by one of
-*
-*           oneapi::mkl::transpose::{nontrans,trans,conjtrans}
-*
-*
-*       This example demonstrates only single precision (float) data type for
-*       gemv matrix data
-*
-*
-*******************************************************************************/
-
-// stl includes
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-#include "example_helper.hpp"
-
-//
-// Main example for Sparse Matrix-Vector Multiply consisting of
-// initialization of A matrix, x and y vectors as well as
-// scalars alpha and beta.  Then the product
-//
-// y = alpha * op(A) * x + beta * y
-//
-// is performed and finally the results are post processed.
-//
-template <typename fp, typename intType>
-int run_sparse_matrix_vector_multiply_example(const sycl::device &cpu_dev) {
-    // Matrix data size
-    intType size = 4;
-    intType nrows = size * size * size;
-
-    // Set scalar fp values
-    fp alpha = set_fp_value(fp(1.0));
-    fp beta = set_fp_value(fp(0.0));
-
-    // Catch asynchronous exceptions
-    auto exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e) {
-                std::cout << "Caught asynchronous SYCL "
-                             "exception during sparse::gemv:\n"
-                          << e.what() << std::endl;
-            }
-        }
-    };
-
-    // create execution queue and buffers of matrix data
-    sycl::queue cpu_queue(cpu_dev, exception_handler);
-    oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue };
-
-    intType *ia, *ja;
-    fp *a, *x, *y, *z;
-    std::size_t sizea = static_cast<std::size_t>(27 * nrows);
-    std::size_t sizeja = static_cast<std::size_t>(27 * nrows);
-    std::size_t sizeia = static_cast<std::size_t>(nrows + 1);
-    std::size_t sizevec = static_cast<std::size_t>(nrows);
-
-    ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), cpu_queue);
-    ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), cpu_queue);
-    a = (fp *)sycl::malloc_shared(sizea * sizeof(fp), cpu_queue);
-    x = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
-    y = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
-    z = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
-
-    if (!ia || !ja || !a || !x || !y || !z) {
-        throw std::runtime_error("Failed to allocate USM memory");
-    }
-
-    intType nnz = generate_sparse_matrix<fp, intType>(size, ia, ja, a);
-
-    // Init vectors x and y
-    for (int i = 0; i < nrows; i++) {
-        x[i] = set_fp_value(fp(1.0));
-        y[i] = set_fp_value(fp(0.0));
-        z[i] = set_fp_value(fp(0.0));
-    }
-
-    std::vector<intType *> int_ptr_vec;
-    int_ptr_vec.push_back(ia);
-    int_ptr_vec.push_back(ja);
-    std::vector<fp *> fp_ptr_vec;
-    fp_ptr_vec.push_back(a);
-    fp_ptr_vec.push_back(x);
-    fp_ptr_vec.push_back(y);
-    fp_ptr_vec.push_back(z);
-
-    //
-    // Execute Matrix Multiply
-    //
-
-    oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
-    std::cout << "\n\t\tsparse::gemv parameters:\n";
-    std::cout << "\t\t\ttransA = "
-              << (transA == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << std::endl;
-    std::cout << "\t\t\tnrows = " << nrows << std::endl;
-    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
-
-    // create and initialize handle for a Sparse Matrix in CSR format
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-
-    oneapi::mkl::sparse::init_matrix_handle(cpu_selector, &handle);
-
-    auto ev_set = oneapi::mkl::sparse::set_csr_data(cpu_selector, handle, nrows, nrows, nnz,
-                                                    oneapi::mkl::index_base::zero, ia, ja, a);
-
-    auto ev_opt = oneapi::mkl::sparse::optimize_gemv(cpu_selector, transA, handle, { ev_set });
-
-    auto ev_gemv =
-        oneapi::mkl::sparse::gemv(cpu_selector, transA, alpha, handle, x, beta, y, { ev_opt });
-
-    auto ev_release =
-        oneapi::mkl::sparse::release_matrix_handle(cpu_selector, &handle, { ev_gemv });
-
-    ev_release.wait_and_throw();
-
-    //
-    // Post Processing
-    //
-
-    fp *res = y;
-    const bool isConj = (transA == oneapi::mkl::transpose::conjtrans);
-    for (intType row = 0; row < nrows; row++) {
-        z[row] *= beta;
-    }
-    for (intType row = 0; row < nrows; row++) {
-        fp tmp = alpha * x[row];
-        for (intType i = ia[row]; i < ia[row + 1]; i++) {
-            if constexpr (is_complex<fp>()) {
-                z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]);
-            }
-            else {
-                z[ja[i]] += tmp * a[i];
-            }
-        }
-    }
-
-    bool good = true;
-    for (intType row = 0; row < nrows; row++) {
-        good &= check_result(res[row], z[row], nrows, row);
-    }
-
-    std::cout << "\n\t\t sparse::gemv example " << (good ? "passed" : "failed") << "\n\tFinished"
-              << std::endl;
-
-    free_vec(fp_ptr_vec, cpu_queue);
-    free_vec(int_ptr_vec, cpu_queue);
-
-    if (!good)
-        return 1;
-
-    return 0;
-}
-
-//
-// Description of example setup, apis used and supported floating point type
-// precisions
-//
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << "# Sparse Matrix-Vector Multiply Example: " << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# where A is a sparse matrix in CSR format, x and y are "
-                 "dense vectors"
-              << std::endl;
-    std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using apis:" << std::endl;
-    std::cout << "#   sparse::gemv" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Running on Intel CPU device" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example
-//
-int main(int /*argc*/, char ** /*argv*/) {
-    print_example_banner();
-
-    try {
-        // TODO: Add cuSPARSE compile-time dispatcher in this example once it is supported.
-        sycl::device cpu_dev(sycl::cpu_selector_v);
-
-        std::cout << "Running Sparse BLAS GEMV USM example on CPU device." << std::endl;
-        std::cout << "Device name is: " << cpu_dev.get_info<sycl::info::device::name>()
-                  << std::endl;
-        std::cout << "Running with single precision real data type:" << std::endl;
-
-        run_sparse_matrix_vector_multiply_example<float, std::int32_t>(cpu_dev);
-        std::cout << "Sparse BLAS GEMV USM example ran OK." << std::endl;
-    }
-    catch (sycl::exception const &e) {
-        std::cerr << "Caught synchronous SYCL exception during Sparse GEMV:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const &e) {
-        std::cerr << "Caught std::exception during Sparse GEMV:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
deleted file mode 100644
index 6f144c898..000000000
--- a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# NOTE: user needs to set env var ONEAPI_DEVICE_SELECTOR to use runtime example (no need to specify backend when building with CMake)
-
-include(WarningsUtils)
-
-# Build object from all example sources
-set(SPARSE_BLAS_RT_SOURCES "sparse_blas_gemv_usm")
-# Set up for the right backend for run-time dispatching examples
-# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
-# overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
-set(DEVICE_FILTERS "")
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "opencl:cpu")
-endif()
-if(ENABLE_MKLGPU_BACKEND)
-  list(APPEND DEVICE_FILTERS "level_zero:gpu")
-endif()
-
-message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
-
-foreach(sparse_blas_rt_sources ${SPARSE_BLAS_RT_SOURCES})
-  add_executable(example_${sparse_blas_rt_sources} ${sparse_blas_rt_sources}.cpp)
-  target_include_directories(example_${sparse_blas_rt_sources}
-      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-
-  add_dependencies(example_${sparse_blas_rt_sources} onemkl)
-
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET example_${sparse_blas_rt_sources} SOURCES ${SPARSE_BLAS_RT_SOURCES})
-  endif()
-
-  target_link_libraries(example_${sparse_blas_rt_sources}
-      PUBLIC onemkl
-      PUBLIC ONEMKL::SYCL::SYCL
-      PUBLIC ${CMAKE_DL_LIBS}
-      PRIVATE onemkl_warnings
-  )
-
-  # Register example as ctest
-  foreach(device_filter ${DEVICE_FILTERS})
-    add_test(NAME ${domain}/EXAMPLE/RT/${sparse_blas_rt_sources}/${device_filter} COMMAND example_${sparse_blas_rt_sources})
-    set_property(TEST ${domain}/EXAMPLE/RT/${sparse_blas_rt_sources}/${device_filter} PROPERTY
-      ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
-      ENVIRONMENT ONEAPI_DEVICE_SELECTOR=${device_filter})
-  endforeach(device_filter)
-
-endforeach()
diff --git a/examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp b/examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp
deleted file mode 100644
index b5812fabf..000000000
--- a/examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       This example demonstrates use of DPCPP API oneapi::mkl::sparse::gemv
-*       using unified shared memory to perform general sparse matrix-vector
-*       multiplication on a SYCL device (HOST, CPU, GPU) that is selected
-*       during runtime.
-*
-*       y = alpha * op(A) * x + beta * y
-*
-*       where op() is defined by one of
-*
-*           oneapi::mkl::transpose::{nontrans,trans,conjtrans}
-*
-*
-*       This example demonstrates only single precision (float) data type for
-*       gemv matrix data
-*
-*
-*******************************************************************************/
-
-// stl includes
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-#include "example_helper.hpp"
-
-//
-// Main example for Sparse Matrix-Vector Multiply consisting of
-// initialization of A matrix, x and y vectors as well as
-// scalars alpha and beta.  Then the product
-//
-// y = alpha * op(A) * x + beta * y
-//
-// is performed and finally the results are post processed.
-//
-template <typename fp, typename intType>
-int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) {
-    // Matrix data size
-    intType size = 4;
-    intType nrows = size * size * size;
-
-    // Set scalar fp values
-    fp alpha = set_fp_value(fp(1.0));
-    fp beta = set_fp_value(fp(0.0));
-
-    // Catch asynchronous exceptions
-    auto exception_handler = [](sycl::exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e) {
-                std::cout << "Caught asynchronous SYCL "
-                             "exception during sparse::gemv:\n"
-                          << e.what() << std::endl;
-            }
-        }
-    };
-
-    // create execution queue and buffers of matrix data
-    sycl::queue main_queue(dev, exception_handler);
-
-    intType *ia, *ja;
-    fp *a, *x, *y, *z;
-    std::size_t sizea = static_cast<std::size_t>(27 * nrows);
-    std::size_t sizeja = static_cast<std::size_t>(27 * nrows);
-    std::size_t sizeia = static_cast<std::size_t>(nrows + 1);
-    std::size_t sizevec = static_cast<std::size_t>(nrows);
-
-    ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), main_queue);
-    ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), main_queue);
-    a = (fp *)sycl::malloc_shared(sizea * sizeof(fp), main_queue);
-    x = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue);
-    y = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue);
-    z = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue);
-
-    if (!ia || !ja || !a || !x || !y || !z) {
-        throw std::runtime_error("Failed to allocate USM memory");
-    }
-
-    intType nnz = generate_sparse_matrix<fp, intType>(size, ia, ja, a);
-
-    // Init vectors x and y
-    for (int i = 0; i < nrows; i++) {
-        x[i] = set_fp_value(fp(1.0));
-        y[i] = set_fp_value(fp(0.0));
-        z[i] = set_fp_value(fp(0.0));
-    }
-
-    std::vector<intType *> int_ptr_vec;
-    int_ptr_vec.push_back(ia);
-    int_ptr_vec.push_back(ja);
-    std::vector<fp *> fp_ptr_vec;
-    fp_ptr_vec.push_back(a);
-    fp_ptr_vec.push_back(x);
-    fp_ptr_vec.push_back(y);
-    fp_ptr_vec.push_back(z);
-
-    //
-    // Execute Matrix Multiply
-    //
-
-    oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
-    std::cout << "\n\t\tsparse::gemv parameters:\n";
-    std::cout << "\t\t\ttransA = "
-              << (transA == oneapi::mkl::transpose::nontrans
-                      ? "nontrans"
-                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
-              << std::endl;
-    std::cout << "\t\t\tnrows = " << nrows << std::endl;
-    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
-
-    // create and initialize handle for a Sparse Matrix in CSR format
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-
-    oneapi::mkl::sparse::init_matrix_handle(main_queue, &handle);
-
-    auto ev_set = oneapi::mkl::sparse::set_csr_data(main_queue, handle, nrows, nrows, nnz,
-                                                    oneapi::mkl::index_base::zero, ia, ja, a);
-
-    auto ev_opt = oneapi::mkl::sparse::optimize_gemv(main_queue, transA, handle, { ev_set });
-
-    auto ev_gemv =
-        oneapi::mkl::sparse::gemv(main_queue, transA, alpha, handle, x, beta, y, { ev_opt });
-
-    auto ev_release = oneapi::mkl::sparse::release_matrix_handle(main_queue, &handle, { ev_gemv });
-
-    ev_release.wait_and_throw();
-
-    //
-    // Post Processing
-    //
-
-    fp *res = y;
-    const bool isConj = (transA == oneapi::mkl::transpose::conjtrans);
-    for (intType row = 0; row < nrows; row++) {
-        z[row] *= beta;
-    }
-    for (intType row = 0; row < nrows; row++) {
-        fp tmp = alpha * x[row];
-        for (intType i = ia[row]; i < ia[row + 1]; i++) {
-            if constexpr (is_complex<fp>()) {
-                z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]);
-            }
-            else {
-                z[ja[i]] += tmp * a[i];
-            }
-        }
-    }
-
-    bool good = true;
-    for (intType row = 0; row < nrows; row++) {
-        good &= check_result(res[row], z[row], nrows, row);
-    }
-
-    std::cout << "\n\t\t sparse::gemv example " << (good ? "passed" : "failed") << "\n\tFinished"
-              << std::endl;
-
-    free_vec(fp_ptr_vec, main_queue);
-    free_vec(int_ptr_vec, main_queue);
-
-    if (!good)
-        return 1;
-
-    return 0;
-}
-
-//
-// Description of example setup, apis used and supported floating point type
-// precisions
-//
-void print_example_banner() {
-    std::cout << "" << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << "# Sparse Matrix-Vector Multiply Example: " << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# where A is a sparse matrix in CSR format, x and y are "
-                 "dense vectors"
-              << std::endl;
-    std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using apis:" << std::endl;
-    std::cout << "#   sparse::gemv" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Using single precision (float) data type" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "# Device will be selected during runtime." << std::endl;
-    std::cout << "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify"
-              << std::endl;
-    std::cout << "# available devices" << std::endl;
-    std::cout << "# " << std::endl;
-    std::cout << "########################################################################"
-              << std::endl;
-    std::cout << std::endl;
-}
-
-//
-// Main entry point for example
-//
-int main(int /*argc*/, char ** /*argv*/) {
-    print_example_banner();
-
-    try {
-        sycl::device dev = sycl::device();
-
-        if (dev.is_gpu()) {
-            std::cout << "Running Sparse BLAS GEMV USM example on GPU device." << std::endl;
-            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        else {
-            std::cout << "Running Sparse BLAS GEMV USM example on CPU device." << std::endl;
-            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
-                      << std::endl;
-        }
-        std::cout << "Running with single precision real data type:" << std::endl;
-
-        run_sparse_matrix_vector_multiply_example<float, std::int32_t>(dev);
-        std::cout << "Sparse BLAS GEMV USM example ran OK." << std::endl;
-    }
-    catch (sycl::exception const &e) {
-        std::cerr << "Caught synchronous SYCL exception during Sparse GEMV:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const &e) {
-        std::cerr << "Caught std::exception during Sparse GEMV:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/include/oneapi/mkl.hpp b/include/oneapi/mkl.hpp
deleted file mode 100644
index f3e9b8618..000000000
--- a/include/oneapi/mkl.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_HPP_
-#define _ONEMKL_HPP_
-
-#include "oneapi/mkl/types.hpp"
-
-#include "oneapi/mkl/blas.hpp"
-#include "oneapi/mkl/dft.hpp"
-#include "oneapi/mkl/lapack.hpp"
-#include "oneapi/mkl/rng.hpp"
-#include "oneapi/mkl/sparse_blas.hpp"
-
-#endif //_ONEMKL_HPP_
diff --git a/include/oneapi/mkl/bfloat16.hpp b/include/oneapi/mkl/bfloat16.hpp
deleted file mode 100644
index afa155b1a..000000000
--- a/include/oneapi/mkl/bfloat16.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _BFLOAT16_HPP__
-#define _BFLOAT16_HPP__
-
-#include <cmath>
-#include <cstdint>
-#include <type_traits>
-
-namespace oneapi {
-namespace mkl {
-
-namespace bfloat16_impl {
-
-template <typename T>
-struct is_float_double {
-    static constexpr bool value = false;
-};
-template <>
-struct is_float_double<float> {
-    static constexpr bool value = true;
-};
-template <>
-struct is_float_double<double> {
-    static constexpr bool value = true;
-};
-
-union float_raw {
-    float f;
-    std::uint32_t i;
-};
-
-static inline std::uint32_t float_to_raw(float f) {
-    float_raw r;
-    r.f = f;
-    return r.i;
-}
-
-static inline float raw_to_float(std::uint32_t i) {
-    float_raw r;
-    r.i = i;
-    return r.f;
-}
-
-} /* namespace bfloat16_impl */
-
-struct bfloat16 {
-    std::uint16_t raw;
-
-    bfloat16(int raw_, bool) : raw(raw_) {}
-
-    bfloat16() = default;
-    inline bfloat16(float f);
-    bfloat16(double d) : bfloat16(float(d)) {}
-    template <typename T>
-    bfloat16(T i, typename std::enable_if<std::is_integral<T>::value>::type *_ = nullptr)
-            : bfloat16(float(i)) {}
-
-    inline operator float() const;
-
-    bfloat16 operator+() const {
-        return *this;
-    }
-    bfloat16 operator-() const {
-        bfloat16 h = *this;
-        h.raw ^= 0x8000;
-        return h;
-    }
-
-    bfloat16 operator++() {
-        return (*this = *this + 1);
-    }
-    bfloat16 operator++(int) {
-        bfloat16 h = *this;
-        ++*this;
-        return h;
-    }
-    bfloat16 operator--() {
-        return (*this = *this - 1);
-    }
-    bfloat16 operator--(int) {
-        bfloat16 h = *this;
-        --*this;
-        return h;
-    }
-
-    friend float operator+(const bfloat16 &h1, const bfloat16 &h2) {
-        return float(h1) + float(h2);
-    }
-    friend float operator-(const bfloat16 &h1, const bfloat16 &h2) {
-        return float(h1) - float(h2);
-    }
-    friend float operator*(const bfloat16 &h1, const bfloat16 &h2) {
-        return float(h1) * float(h2);
-    }
-    friend float operator/(const bfloat16 &h1, const bfloat16 &h2) {
-        return float(h1) / float(h2);
-    }
-
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator+(
-        const bfloat16 &h, const T &o) {
-        return float(h) + float(o);
-    }
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator-(
-        const bfloat16 &h, const T &o) {
-        return float(h) - float(o);
-    }
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator*(
-        const bfloat16 &h, const T &o) {
-        return float(h) * float(o);
-    }
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator/(
-        const bfloat16 &h, const T &o) {
-        return float(h) / float(o);
-    }
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator+(
-        const T &o, const bfloat16 &h) {
-        return float(o) + float(h);
-    }
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator-(
-        const T &o, const bfloat16 &h) {
-        return float(o) - float(h);
-    }
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator*(
-        const T &o, const bfloat16 &h) {
-        return float(o) * float(h);
-    }
-    template <typename T>
-    friend typename std::enable_if<std::is_integral<T>::value, float>::type operator/(
-        const T &o, const bfloat16 &h) {
-        return float(o) / float(h);
-    }
-
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator+(
-        const bfloat16 &h, const T &o) {
-        return float(h) + o;
-    }
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator-(
-        const bfloat16 &h, const T &o) {
-        return float(h) - o;
-    }
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator*(
-        const bfloat16 &h, const T &o) {
-        return float(h) * o;
-    }
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator/(
-        const bfloat16 &h, const T &o) {
-        return float(h) / o;
-    }
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator+(
-        const T &o, const bfloat16 &h) {
-        return o + float(h);
-    }
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator-(
-        const T &o, const bfloat16 &h) {
-        return o - float(h);
-    }
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator*(
-        const T &o, const bfloat16 &h) {
-        return o * float(h);
-    }
-    template <typename T>
-    friend typename std::enable_if<bfloat16_impl::is_float_double<T>::value, T>::type operator/(
-        const T &o, const bfloat16 &h) {
-        return o / float(h);
-    }
-
-    template <typename T>
-    bfloat16 operator+=(const T &o) {
-        return *this = bfloat16(*this + o);
-    }
-    template <typename T>
-    bfloat16 operator-=(const T &o) {
-        return *this = bfloat16(*this - o);
-    }
-    template <typename T>
-    bfloat16 operator*=(const T &o) {
-        return *this = bfloat16(*this * o);
-    }
-    template <typename T>
-    bfloat16 operator/=(const T &o) {
-        return *this = bfloat16(*this / o);
-    }
-};
-
-bfloat16::bfloat16(float f) {
-    raw = bfloat16_impl::float_to_raw(f) >> 16; // RTZ
-}
-
-inline bfloat16::operator float() const {
-    return bfloat16_impl::raw_to_float(raw << 16);
-}
-
-} /* namespace mkl */
-} // namespace oneapi
-
-#endif /* _BFLOAT16_HPP__ */
diff --git a/include/oneapi/mkl/blas.hpp b/include/oneapi/mkl/blas.hpp
deleted file mode 100644
index 05458d9aa..000000000
--- a/include/oneapi/mkl/blas.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BLAS_HPP_
-#define _ONEMKL_BLAS_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl/types.hpp"
-
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-#include "oneapi/mkl/blas/detail/blas_loader.hpp"
-#ifdef ENABLE_CUBLAS_BACKEND
-#include "oneapi/mkl/blas/detail/cublas/blas_ct.hpp"
-#endif
-#ifdef ENABLE_ROCBLAS_BACKEND
-#include "oneapi/mkl/blas/detail/rocblas/blas_ct.hpp"
-#endif
-#ifdef ENABLE_MKLCPU_BACKEND
-#include "oneapi/mkl/blas/detail/mklcpu/blas_ct.hpp"
-#endif
-#ifdef ENABLE_MKLGPU_BACKEND
-#include "oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp"
-#endif
-#ifdef ENABLE_NETLIB_BACKEND
-#include "oneapi/mkl/blas/detail/netlib/blas_ct.hpp"
-#endif
-#ifdef ENABLE_PORTBLAS_BACKEND
-#include "oneapi/mkl/blas/detail/portblas/blas_ct.hpp"
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#include "blas.hxx"
-
-} //namespace column_major
-namespace row_major {
-
-#include "blas.hxx"
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BLAS_LOADER_HPP_
diff --git a/include/oneapi/mkl/blas.hxx b/include/oneapi/mkl/blas.hxx
deleted file mode 100644
index 374585912..000000000
--- a/include/oneapi/mkl/blas.hxx
+++ /dev/null
@@ -1,4406 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-static inline void asum(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result) {
-    detail::asum(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void asum(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result) {
-    detail::asum(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &result) {
-    detail::asum(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &result) {
-    detail::asum(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void axpy(sycl::queue &queue, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy);
-}
-
-static inline void axpy(sycl::queue &queue, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy);
-}
-
-static inline void axpy(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy);
-}
-
-static inline void axpy(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy);
-}
-
-static inline void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha,
-                              sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void axpby(sycl::queue &queue, std::int64_t n, float alpha,
-                         sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                         sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy);
-}
-
-static inline void axpby(sycl::queue &queue, std::int64_t n, double alpha,
-                         sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                         sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy);
-}
-
-static inline void axpby(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                         std::int64_t incy) {
-    detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy);
-}
-
-static inline void axpby(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                         std::int64_t incy) {
-    detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy);
-}
-
-static inline void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::copy(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::copy(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void copy(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    detail::copy(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void copy(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    detail::copy(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<float, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size) {
-    detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void copy_batch(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void copy_batch(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void copy_batch(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey,
-                       batch_size);
-}
-
-static inline void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &result) {
-    detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result);
-}
-
-static inline void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result) {
-    detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result);
-}
-
-static inline void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result) {
-    detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result);
-}
-
-static inline void dotc(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result) {
-    detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result);
-}
-
-static inline void dotc(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result) {
-    detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result);
-}
-
-static inline void dotu(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result) {
-    detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result);
-}
-
-static inline void dotu(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result) {
-    detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result);
-}
-
-static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy) {
-    detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy) {
-    detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                 c, ldc);
-}
-
-static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                 c, ldc);
-}
-
-static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc) {
-    detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                 c, ldc);
-}
-
-static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc) {
-    detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                 c, ldc);
-}
-
-static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, sycl::half alpha,
-                        sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                        sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, sycl::half beta,
-                        sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                 c, ldc);
-}
-
-static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                        sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                 c, ldc);
-}
-
-static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                        std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<bfloat16, 1> &a, std::int64_t lda,
-                        sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                 c, ldc);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                              sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                              sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, double beta,
-                              sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                              sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, sycl::half beta,
-                              sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                              sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                              std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                              sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                              std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                              std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                              sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda,
-                       stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                             offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                             float alpha, sycl::buffer<int8_t, 1> &a, std::int64_t lda,
-                             int8_t ao, sycl::buffer<uint8_t, 1> &b, std::int64_t ldb,
-                             uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-                             std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda,
-                      ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                             offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                             float alpha, sycl::buffer<int8_t, 1> &a, std::int64_t lda,
-                             int8_t ao, sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo,
-                             float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co) {
-    detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda,
-                      ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                             offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                             float alpha, sycl::buffer<uint8_t, 1> &a, std::int64_t lda,
-                             uint8_t ao, sycl::buffer<int8_t, 1> &b, std::int64_t ldb,
-                             int8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-                             std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda,
-                      ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                             offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                             float alpha, sycl::buffer<uint8_t, 1> &a, std::int64_t lda,
-                             uint8_t ao, sycl::buffer<uint8_t, 1> &b, std::int64_t ldb,
-                             uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-                             std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda,
-                      ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-static inline void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, float alpha,
-                         sycl::buffer<float, 1> &a, std::int64_t lda,
-                         sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
-                  ldb, beta, c, ldc);
-}
-
-static inline void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k, double alpha,
-                         sycl::buffer<double, 1> &a, std::int64_t lda,
-                         sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
-                  ldb, beta, c, ldc);
-}
-
-static inline void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, std::complex<float> beta,
-                         sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
-                  ldb, beta, c, ldc);
-}
-
-static inline void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                         transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b,
-                  ldb, beta, c, ldc);
-}
-
-static inline void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-static inline void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-static inline void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-static inline void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                              std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                              std::int64_t lda, std::int64_t stridea, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, float beta,
-                              sycl::buffer<float, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size) {
-    detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx,
-                       stridex, beta, y, incy, stridey, batch_size);
-}
-
-static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                              std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, double beta, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx,
-                       stridex, beta, y, incy, stridey, batch_size);
-}
-
-static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                              std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size) {
-    detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx,
-                       stridex, beta, y, incy, stridey, batch_size);
-}
-
-static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                              std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size) {
-    detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx,
-                       stridex, beta, y, incy, stridey, batch_size);
-}
-
-static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                              std::int64_t batch_size) {
-    detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx,
-                       stridex, c, ldc, stridec, batch_size);
-}
-
-static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<double, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stridec, std::int64_t batch_size) {
-    detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx,
-                       stridex, c, ldc, stridec, batch_size);
-}
-
-static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &c,
-                              std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) {
-    detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx,
-                       stridex, c, ldc, stridec, batch_size);
-}
-
-static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &c,
-                              std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) {
-    detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx,
-                       stridex, c, ldc, stridec, batch_size);
-}
-
-static inline void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &a, std::int64_t lda) {
-    detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &a, std::int64_t lda) {
-    detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda) {
-    detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda) {
-    detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda) {
-    detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda) {
-    detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc) {
-    detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                 beta, c, ldc);
-}
-
-static inline void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc) {
-    detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                 beta, c, ldc);
-}
-
-static inline void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-static inline void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-static inline void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda) {
-    detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda) {
-    detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,
-                  beta, c, ldc);
-}
-
-static inline void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         double beta, sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc) {
-    detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,
-                  beta, c, ldc);
-}
-
-static inline void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc) {
-    detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
-                 ldc);
-}
-
-static inline void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, double beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc) {
-    detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
-                 ldc);
-}
-
-static inline void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy) {
-    detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-}
-
-static inline void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy) {
-    detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-}
-
-static inline void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &a) {
-    detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a);
-}
-
-static inline void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<double>, 1> &a) {
-    detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a);
-}
-
-static inline void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a) {
-    detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-static inline void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a) {
-    detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-static inline void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamax(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamax(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void iamax(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamax(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void iamax(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamax(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamin(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamin(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void iamin(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamin(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void iamin(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result) {
-    detail::iamin(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void nrm2(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result) {
-    detail::nrm2(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void nrm2(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result) {
-    detail::nrm2(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &result) {
-    detail::nrm2(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &result) {
-    detail::nrm2(get_device_id(queue), queue, n, x, incx, result);
-}
-
-static inline void rot(sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                       float s) {
-    detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s);
-}
-
-static inline void rot(sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-                       double s) {
-    detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s);
-}
-
-static inline void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c,
-                       float s) {
-    detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s);
-}
-
-static inline void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                       double c, double s) {
-    detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s);
-}
-
-static inline void rotg(sycl::queue &queue, sycl::buffer<float, 1> &a,
-                        sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-                        sycl::buffer<float, 1> &s) {
-    detail::rotg(get_device_id(queue), queue, a, b, c, s);
-}
-
-static inline void rotg(sycl::queue &queue, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-                        sycl::buffer<double, 1> &s) {
-    detail::rotg(get_device_id(queue), queue, a, b, c, s);
-}
-
-static inline void rotg(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-                        sycl::buffer<std::complex<float>, 1> &s) {
-    detail::rotg(get_device_id(queue), queue, a, b, c, s);
-}
-
-static inline void rotg(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &b,
-                        sycl::buffer<double, 1> &c,
-                        sycl::buffer<std::complex<double>, 1> &s) {
-    detail::rotg(get_device_id(queue), queue, a, b, c, s);
-}
-
-static inline void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &param) {
-    detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param);
-}
-
-static inline void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &param) {
-    detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param);
-}
-
-static inline void rotmg(sycl::queue &queue, sycl::buffer<float, 1> &d1,
-                         sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-                         sycl::buffer<float, 1> &param) {
-    detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param);
-}
-
-static inline void rotmg(sycl::queue &queue, sycl::buffer<double, 1> &d1,
-                         sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1,
-                         double y1, sycl::buffer<double, 1> &param) {
-    detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param);
-}
-
-static inline void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void scal(sycl::queue &queue, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx) {
-    detail::scal(get_device_id(queue), queue, n, alpha, x, incx);
-}
-
-static inline void scal(sycl::queue &queue, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx) {
-    detail::scal(get_device_id(queue), queue, n, alpha, x, incx);
-}
-
-static inline void scal(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    detail::scal(get_device_id(queue), queue, n, alpha, x, incx);
-}
-
-static inline void scal(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    detail::scal(get_device_id(queue), queue, n, alpha, x, incx);
-}
-
-static inline void scal(sycl::queue &queue, std::int64_t n, float alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    detail::scal(get_device_id(queue), queue, n, alpha, x, incx);
-}
-
-static inline void scal(sycl::queue &queue, std::int64_t n, double alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    detail::scal(get_device_id(queue), queue, n, alpha, x, incx);
-}
-
-static inline void sdsdot(sycl::queue &queue, std::int64_t n, float sb,
-                          sycl::buffer<float, 1> &x, std::int64_t incx,
-                          sycl::buffer<float, 1> &y, std::int64_t incy,
-                          sycl::buffer<float, 1> &result) {
-    detail::sdsdot(get_device_id(queue), queue, n, sb, x, incx, y, incy, result);
-}
-
-static inline void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-                        std::int64_t incy) {
-    detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-}
-
-static inline void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-                        std::int64_t incy) {
-    detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-}
-
-static inline void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &a) {
-    detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a);
-}
-
-static inline void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &a) {
-    detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a);
-}
-
-static inline void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a) {
-    detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-static inline void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a) {
-    detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-static inline void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::swap(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::swap(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void swap(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    detail::swap(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void swap(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    detail::swap(get_device_id(queue), queue, n, x, incx, y, incy);
-}
-
-static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                 beta, c, ldc);
-}
-
-static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                        double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                 beta, c, ldc);
-}
-
-static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc) {
-    detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                 beta, c, ldc);
-}
-
-static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc) {
-    detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                 beta, c, ldc);
-}
-
-static inline void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy) {
-    detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy) {
-    detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y,
-                 incy);
-}
-
-static inline void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &a, std::int64_t lda) {
-    detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-static inline void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &a, std::int64_t lda) {
-    detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-static inline void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a, std::int64_t lda) {
-    detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a, std::int64_t lda) {
-    detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-}
-
-static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                         std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                         float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,
-                  beta, c, ldc);
-}
-
-static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                         std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                         double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,
-                  beta, c, ldc);
-}
-
-static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                         std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc) {
-    detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,
-                  beta, c, ldc);
-}
-
-static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                         std::int64_t k, std::complex<double> alpha,
-                         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc) {
-    detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,
-                  beta, c, ldc);
-}
-
-static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, float beta, sycl::buffer<float, 1> &c,
-                        std::int64_t ldc) {
-    detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
-                 ldc);
-}
-
-static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, double beta, sycl::buffer<double, 1> &c,
-                        std::int64_t ldc) {
-    detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
-                 ldc);
-}
-
-static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc) {
-    detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
-                 ldc);
-}
-
-static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                        std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc) {
-    detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
-                 ldc);
-}
-
-static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                              std::int64_t n, std::int64_t k, float alpha,
-                              sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                       stride_a, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                              std::int64_t n, std::int64_t k, double alpha,
-                              sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                       stride_a, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                              std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                       stride_a, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                              std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size) {
-    detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                       stride_a, beta, c, ldc, stride_c, batch_size);
-}
-
-static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx) {
-    detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx) {
-    detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-static inline void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<float, 1> &a,
-                        sycl::buffer<float, 1> &x, std::int64_t incx) {
-    detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &x, std::int64_t incx) {
-    detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<float, 1> &a,
-                        sycl::buffer<float, 1> &x, std::int64_t incx) {
-    detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &x, std::int64_t incx) {
-    detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx) {
-    detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx) {
-    detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx) {
-    detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx) {
-    detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                        diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                 alpha, a, lda, b, ldb);
-}
-
-static inline void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                       alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-static inline void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                       alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-static inline void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                       alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-static inline void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n,
-                       alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx) {
-    detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx) {
-    detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx) {
-    detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                        std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx) {
-    detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                                  std::int64_t lda, std::int64_t stride_a,
-                                  sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                  std::int64_t stride_b, std::int64_t batch_size) {
-    detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b,
-                           ldb, stride_b, batch_size);
-}
-
-static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                                  std::int64_t lda, std::int64_t stride_a,
-                                  sycl::buffer<double, 1> &b, std::int64_t ldb,
-                                  std::int64_t stride_b, std::int64_t batch_size) {
-    detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b,
-                           ldb, stride_b, batch_size);
-}
-
-static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b,
-                                  std::int64_t batch_size) {
-    detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b,
-                           ldb, stride_b, batch_size);
-}
-
-static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b,
-                                  std::int64_t batch_size) {
-    detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b,
-                           ldb, stride_b, batch_size);
-}
-
-static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, float alpha, sycl::buffer<float, 1> &ab,
-                                  std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                  std::int64_t batch_size) {
-    detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride,
-                           batch_size);
-}
-
-static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, double alpha, sycl::buffer<double, 1> &ab,
-                                  std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                  std::int64_t batch_size) {
-    detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride,
-                           batch_size);
-}
-
-static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride,
-                           batch_size);
-}
-
-static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride,
-                           batch_size);
-}
-
-static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                 std::int64_t m, std::int64_t n, float alpha,
-                                 sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                                 float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                 std::int64_t stride_c, std::int64_t batch_size) {
-    detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                          stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size);
-}
-
-static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                 std::int64_t m, std::int64_t n, double alpha,
-                                 sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                                 std::int64_t ldb, std::int64_t stride_b,
-                                 sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                 std::int64_t stride_c, std::int64_t batch_size) {
-    detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                          stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size);
-}
-
-static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                 std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                 sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                          stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size);
-}
-
-static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                 std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                          stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size);
-}
-
-static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                            sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                            sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                            std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                            std::int64_t ldb) {
-    detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                            std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                            std::int64_t ldb) {
-    detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                             float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                             std::int64_t strideb) {
-    detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb,
-                      strideb);
-}
-
-static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                             double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                             std::int64_t strideb) {
-    detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb,
-                      strideb);
-}
-
-static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                             std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                             std::int64_t lda, std::int64_t stridea,
-                             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                             std::int64_t strideb) {
-    detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb,
-                      strideb);
-}
-
-static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                             std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                             std::int64_t lda, std::int64_t stridea,
-                             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                             std::int64_t strideb) {
-    detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb,
-                      strideb);
-}
-
-static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb) {
-    detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb) {
-    detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                            std::int64_t lda, std::int64_t ldb) {
-    detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                            std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                            std::int64_t lda, std::int64_t ldb) {
-    detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                           std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                           float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb,
-                    c, ldc);
-}
-
-static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                           std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                           std::int64_t lda, double beta, sycl::buffer<double, 1> &b,
-                           std::int64_t ldb, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb,
-                    c, ldc);
-}
-
-static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                           std::int64_t n, std::complex<float> alpha,
-                           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                           std::int64_t ldb, sycl::buffer<std::complex<float>, 1> &c,
-                           std::int64_t ldc) {
-    detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb,
-                    c, ldc);
-}
-
-static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                           std::int64_t n, std::complex<double> alpha,
-                           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b,
-                           std::int64_t ldb, sycl::buffer<std::complex<double>, 1> &c,
-                           std::int64_t ldc) {
-    detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb,
-                    c, ldc);
-}
-
-// USM APIs
-
-static inline sycl::event asum(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event asum(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x,
-                                   std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x,
-                                   std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha,
-                                   const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha,
-                                   const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy(sycl::queue &queue, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy(sycl::queue &queue, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha,
-                                         const double **x, std::int64_t *incx, double **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy,
-                                   group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha,
-                                         const float **x, std::int64_t *incx, float **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy,
-                                   group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n,
-                                         std::complex<double> *alpha,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy,
-                                   group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n,
-                                         std::complex<float> *alpha, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy,
-                                   group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha,
-                                         const float *x, std::int64_t incx, std::int64_t stridex,
-                                         float *y, std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha,
-                                         const double *x, std::int64_t incx, std::int64_t stridex,
-                                         double *y, std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t n,
-                                         std::complex<float> alpha, const std::complex<float> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<float> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t n,
-                                         std::complex<double> alpha, const std::complex<double> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<double> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha,
-                                    const float *x, std::int64_t incx, const float beta, float *y,
-                                    std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event axpby(sycl::queue &queue, std::int64_t n, double alpha,
-                                    const double *x, std::int64_t incx, const double beta,
-                                    double *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event axpby(sycl::queue &queue, std::int64_t n,
-                                    std::complex<float> alpha, const std::complex<float> *x,
-                                    std::int64_t incx, const std::complex<float> beta,
-                                    std::complex<float> *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event axpby(sycl::queue &queue, std::int64_t n,
-                                    std::complex<double> alpha, const std::complex<double> *x,
-                                    std::int64_t incx, const std::complex<double> beta,
-                                    std::complex<double> *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x,
-                                   std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x,
-                                   std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event copy(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event copy(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const float **x,
-                                         std::int64_t *incx, float **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count,
-                                   group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x,
-                                         std::int64_t *incx, double **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count,
-                                   group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count,
-                                   group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count,
-                                   group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x,
-                                         std::int64_t incx, std::int64_t stridex, float *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x,
-                                         std::int64_t incx, std::int64_t stridex, double *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n,
-                                         const std::complex<float> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<float> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n,
-                                         const std::complex<double> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<double> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy,
-                                   stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x,
-                                  std::int64_t incx, const float *y, std::int64_t incy,
-                                  float *result,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x,
-                                  std::int64_t incx, const double *y, std::int64_t incy,
-                                  double *result,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x,
-                                  std::int64_t incx, const float *y, std::int64_t incy,
-                                  double *result,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event dotc(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event dotc(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event dotu(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event dotu(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
-                                   const float *a, std::int64_t lda, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                             incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-                                   const double *a, std::int64_t lda, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                             incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                             incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                             incx, beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                   const float *a, std::int64_t lda, const float *b,
-                                   std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                                   const double *a, std::int64_t lda, const double *b,
-                                   std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb,
-                                   std::int64_t m, std::int64_t n, std::int64_t k,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb,
-                                   std::int64_t m, std::int64_t n, std::int64_t k,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *b,
-                                   std::int64_t ldb, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                                   const sycl::half *a, std::int64_t lda, const sycl::half *b,
-                                   std::int64_t ldb, sycl::half beta, sycl::half *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                   const sycl::half *a, std::int64_t lda, const sycl::half *b,
-                                   std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                   const bfloat16 *a, std::int64_t lda, const bfloat16 *b,
-                                   std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa,
-                                         transpose *transb, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, float *alpha, const float **a,
-                                         std::int64_t *lda, const float **b, std::int64_t *ldb,
-                                         float *beta, float **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa,
-                                         transpose *transb, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, double *alpha, const double **a,
-                                         std::int64_t *lda, const double **b, std::int64_t *ldb,
-                                         double *beta, double **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(
-    sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-    const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
-    std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-    const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(
-    sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-    const std::complex<double> **b, std::int64_t *ldb, std::complex<double> *beta,
-    std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-    const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa,
-                                         transpose *transb, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, sycl::half *alpha, const sycl::half **a,
-                                         std::int64_t *lda, const sycl::half **b, std::int64_t *ldb,
-                                         sycl::half *beta, sycl::half **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                                     std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                     float *alpha, const sycl::half **a, std::int64_t *lda,
-                                     const sycl::half **b, std::int64_t *ldb, float *beta,
-                                     float **c, std::int64_t *ldc, std::int64_t group_count,
-                                     std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                                     std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                     float *alpha, const std::int8_t **a, std::int64_t *lda,
-                                     const std::int8_t **b, std::int64_t *ldb, float *beta,
-                                     float **c, std::int64_t *ldc, std::int64_t group_count,
-                                     std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                                     std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                     float *alpha, const std::int8_t **a, std::int64_t *lda,
-                                     const std::int8_t **b, std::int64_t *ldb, float *beta,
-                                     std::int32_t **c, std::int64_t *ldc, std::int64_t group_count,
-                                     std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b,
-                           ldb, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         float alpha, const float *a, std::int64_t lda,
-                                         std::int64_t stride_a, const float *b, std::int64_t ldb,
-                                         std::int64_t stride_b, float beta, float *c,
-                                         std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         double alpha, const double *a, std::int64_t lda,
-                                         std::int64_t stride_a, const double *b, std::int64_t ldb,
-                                         std::int64_t stride_b, double beta, double *c,
-                                         std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(
-    sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-    std::int64_t k, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-    std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-    std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(
-    sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-    std::int64_t k, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-    std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                                         std::int64_t stride_a, const sycl::half *b,
-                                         std::int64_t ldb, std::int64_t stride_b, sycl::half beta,
-                                         sycl::half *c, std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                     const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                                     const sycl::half *b, std::int64_t ldb, std::int64_t stride_b,
-                                     float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                     const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                                     const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b,
-                                     float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                     const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                                     const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b,
-                                     float beta, std::int32_t *c, std::int64_t ldc,
-                                     std::int64_t stride_c, std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a,
-                                   lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                                    transpose transb, std::int64_t n, std::int64_t k, float alpha,
-                                    const float *a, std::int64_t lda, const float *b,
-                                    std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha,
-                              a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                                    transpose transb, std::int64_t n, std::int64_t k, double alpha,
-                                    const double *a, std::int64_t lda, const double *b,
-                                    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha,
-                              a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                                    transpose transb, std::int64_t n, std::int64_t k,
-                                    std::complex<float> alpha, const std::complex<float> *a,
-                                    std::int64_t lda, const std::complex<float> *b,
-                                    std::int64_t ldb, std::complex<float> beta,
-                                    std::complex<float> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha,
-                              a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa,
-                                    transpose transb, std::int64_t n, std::int64_t k,
-                                    std::complex<double> alpha, const std::complex<double> *a,
-                                    std::int64_t lda, const std::complex<double> *b,
-                                    std::int64_t ldb, std::complex<double> beta,
-                                    std::complex<double> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha,
-                              a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                                        offset offsetc, std::int64_t m, std::int64_t n,
-                                        std::int64_t k, float alpha, const std::int8_t *a,
-                                        std::int64_t lda, std::int8_t ao, const std::uint8_t *b,
-                                        std::int64_t ldb, std::uint8_t bo, float beta,
-                                        std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k,
-                                  alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                                        offset offsetc, std::int64_t m, std::int64_t n,
-                                        std::int64_t k, float alpha, const std::int8_t *a,
-                                        std::int64_t lda, std::int8_t ao, const std::int8_t *b,
-                                        std::int64_t ldb, std::int8_t bo, float beta,
-                                        std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k,
-                                  alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                                        offset offsetc, std::int64_t m, std::int64_t n,
-                                        std::int64_t k, float alpha, const std::uint8_t *a,
-                                        std::int64_t lda, std::uint8_t ao, const std::int8_t *b,
-                                        std::int64_t ldb, std::int8_t bo, float beta,
-                                        std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k,
-                                  alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies);
-    return done;
-}
-
-static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                                        offset offsetc, std::int64_t m, std::int64_t n,
-                                        std::int64_t k, float alpha, const std::uint8_t *a,
-                                        std::int64_t lda, std::uint8_t ao, const std::uint8_t *b,
-                                        std::int64_t ldb, std::uint8_t bo, float beta,
-                                        std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k,
-                                  alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                                   const float *x, std::int64_t incx, float beta, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                                   const double *x, std::int64_t incx, double beta, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, float alpha, const float *a,
-                                         std::int64_t lda, std::int64_t stridea, const float *x,
-                                         std::int64_t incx, std::int64_t stridex, float beta,
-                                         float *y, std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x,
-                           incx, stridex, beta, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, double alpha, const double *a,
-                                         std::int64_t lda, std::int64_t stridea, const double *x,
-                                         std::int64_t incx, std::int64_t stridex, double beta,
-                                         double *y, std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x,
-                           incx, stridex, beta, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(
-    sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-    const std::complex<float> *x, std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-    std::complex<float> *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x,
-                           incx, stridex, beta, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(
-    sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-    std::int64_t stridea, const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-    std::complex<double> beta, std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x,
-                           incx, stridex, beta, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                                         std::int64_t *n, float *alpha, const float **a,
-                                         std::int64_t *lda, const float **x, std::int64_t *incx,
-                                         float *beta, float **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx,
-                                   beta, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                                         std::int64_t *n, double *alpha, const double **a,
-                                         std::int64_t *lda, const double **x, std::int64_t *incx,
-                                         double *beta, double **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx,
-                                   beta, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                                         std::int64_t *n, std::complex<float> *alpha,
-                                         const std::complex<float> **a, std::int64_t *lda,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> *beta, std::complex<float> **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx,
-                                   beta, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                                         std::int64_t *n, std::complex<double> *alpha,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> *beta, std::complex<double> **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx,
-                                   beta, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                                         std::int64_t n, const float *a, std::int64_t lda,
-                                         std::int64_t stridea, const float *x, std::int64_t incx,
-                                         std::int64_t stridex, float *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea,
-                                   x, incx, stridex, c, ldc, stridec, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                                         std::int64_t n, const double *a, std::int64_t lda,
-                                         std::int64_t stridea, const double *x, std::int64_t incx,
-                                         std::int64_t stridex, double *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea,
-                                   x, incx, stridex, c, ldc, stridec, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                                         std::int64_t n, const std::complex<float> *a,
-                                         std::int64_t lda, std::int64_t stridea,
-                                         const std::complex<float> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<float> *c,
-                                         std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea,
-                                   x, incx, stridex, c, ldc, stridec, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m,
-                                         std::int64_t n, const std::complex<double> *a,
-                                         std::int64_t lda, std::int64_t stridea,
-                                         const std::complex<double> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<double> *c,
-                                         std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea,
-                                   x, incx, stridex, c, ldc, stridec, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                                         std::int64_t *n, const float **a, std::int64_t *lda,
-                                         const float **x, std::int64_t *incx, float **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, x, incx,
-                                   c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                                         std::int64_t *n, const double **a, std::int64_t *lda,
-                                         const double **x, std::int64_t *incx, double **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, x, incx,
-                                   c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                                         std::int64_t *n, const std::complex<float> **a,
-                                         std::int64_t *lda, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, x, incx,
-                                   c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                                         std::int64_t *n, const std::complex<double> **a,
-                                         std::int64_t *lda, const std::complex<double> **x,
-                                         std::int64_t *incx, std::complex<double> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, x, incx,
-                                   c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                  float alpha, const float *x, std::int64_t incx, const float *y,
-                                  std::int64_t incy, float *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
-                            dependencies);
-    return done;
-}
-
-static inline sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                  double alpha, const double *x, std::int64_t incx, const double *y,
-                                  std::int64_t incy, double *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
-                            dependencies);
-    return done;
-}
-
-static inline sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, const std::complex<float> *y,
-                                   std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
-                             dependencies);
-    return done;
-}
-
-static inline sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, const std::complex<double> *y,
-                                   std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
-                             dependencies);
-    return done;
-}
-
-static inline sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, const std::complex<float> *y,
-                                   std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
-                             dependencies);
-    return done;
-}
-
-static inline sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, const std::complex<double> *y,
-                                   std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda,
-                             dependencies);
-    return done;
-}
-
-static inline sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::int64_t k, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::int64_t k, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a,
-                             lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a,
-                             lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  float alpha, const std::complex<float> *x, std::int64_t incx,
-                                  std::complex<float> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
-                            dependencies);
-    return done;
-}
-
-static inline sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  double alpha, const std::complex<double> *x, std::int64_t incx,
-                                  std::complex<double> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
-                            dependencies);
-    return done;
-}
-
-static inline sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, const std::complex<float> *y,
-                                   std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, lda, dependencies);
-    return done;
-}
-
-static inline sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, const std::complex<double> *y,
-                                   std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, lda, dependencies);
-    return done;
-}
-
-static inline sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                    std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    const std::complex<float> *b, std::int64_t ldb, float beta,
-                                    std::complex<float> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                              b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                    std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    const std::complex<double> *b, std::int64_t ldb, double beta,
-                                    std::complex<double> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                              b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   std::int64_t n, std::int64_t k, float alpha,
-                                   const std::complex<float> *a, std::int64_t lda, float beta,
-                                   std::complex<float> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                             beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   std::int64_t n, std::int64_t k, double alpha,
-                                   const std::complex<double> *a, std::int64_t lda, double beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                             beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  float alpha, const std::complex<float> *x, std::int64_t incx,
-                                  std::complex<float> *a,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
-    return done;
-}
-
-static inline sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  double alpha, const std::complex<double> *x, std::int64_t incx,
-                                  std::complex<double> *a,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
-    return done;
-}
-
-static inline sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, const std::complex<float> *y,
-                                   std::int64_t incy, std::complex<float> *a,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, dependencies);
-    return done;
-}
-
-static inline sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, const std::complex<double> *y,
-                                   std::int64_t incy, std::complex<double> *a,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, dependencies);
-    return done;
-}
-
-static inline sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event iamax(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event iamax(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<double> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event iamin(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event iamin(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<double> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x,
-                                   std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x,
-                                   std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies);
-    return done;
-}
-
-static inline sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<float> *x,
-                                  std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                                  float c, float s,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<double> *x,
-                                  std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                                  double c, double s,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rot(sycl::queue &queue, std::int64_t n, float *x,
-                                  std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rot(sycl::queue &queue, std::int64_t n, double *x,
-                                  std::int64_t incx, double *y, std::int64_t incy, double c,
-                                  double s, const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c,
-                                   double *s,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rotg(sycl::queue &queue, std::complex<float> *a,
-                                   std::complex<float> *b, float *c, std::complex<float> *s,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rotg(sycl::queue &queue, std::complex<double> *a,
-                                   std::complex<double> *b, double *c, std::complex<double> *s,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies);
-    return done;
-}
-
-static inline sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x,
-                                   std::int64_t incx, float *y, std::int64_t incy, float *param,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param, dependencies);
-    return done;
-}
-
-static inline sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x,
-                                   std::int64_t incx, double *y, std::int64_t incy, double *param,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param, dependencies);
-    return done;
-}
-
-static inline sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1,
-                                    float y1, float *param,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param, dependencies);
-    return done;
-}
-
-static inline sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1,
-                                    double y1, double *param,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param, dependencies);
-    return done;
-}
-
-static inline sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                                   const float *x, std::int64_t incx, float beta, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                                   const double *x, std::int64_t incx, double beta, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event scal(sycl::queue &queue, std::int64_t n,
-                                   std::complex<float> alpha, std::complex<float> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event scal(sycl::queue &queue, std::int64_t n,
-                                   std::complex<double> alpha, std::complex<double> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb,
-                                     const float *x, std::int64_t incx, const float *y,
-                                     std::int64_t incy, float *result,
-                                     const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::sdsdot(get_device_id(queue), queue, n, sb, x, incx, y, incy, result, dependencies);
-    return done;
-}
-
-static inline sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   float alpha, const float *a, const float *x, std::int64_t incx,
-                                   float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   double alpha, const double *a, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta,
-                             y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  float alpha, const float *x, std::int64_t incx, float *a,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
-    return done;
-}
-
-static inline sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  double alpha, const double *x, std::int64_t incx, double *a,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies);
-    return done;
-}
-
-static inline sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   float alpha, const float *x, std::int64_t incx, const float *y,
-                                   std::int64_t incy, float *a,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, dependencies);
-    return done;
-}
-
-static inline sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   double alpha, const double *x, std::int64_t incx,
-                                   const double *y, std::int64_t incy, double *a,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, dependencies);
-    return done;
-}
-
-static inline sycl::event swap(sycl::queue &queue, std::int64_t n, float *x,
-                                   std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event swap(sycl::queue &queue, std::int64_t n, double *x,
-                                   std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, const float *b, std::int64_t ldb, float beta,
-                                   float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a,
-                             lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, const double *b, std::int64_t ldb, double beta,
-                                   double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a,
-                             lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a,
-                             lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a,
-                             lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   float alpha, const float *a, std::int64_t lda, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   double alpha, const double *a, std::int64_t lda, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx,
-                             beta, y, incy, dependencies);
-    return done;
-}
-
-static inline sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  float alpha, const float *x, std::int64_t incx, float *a,
-                                  std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
-                            dependencies);
-    return done;
-}
-
-static inline sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                  double alpha, const double *x, std::int64_t incx, double *a,
-                                  std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda,
-                            dependencies);
-    return done;
-}
-
-static inline sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   float alpha, const float *x, std::int64_t incx, const float *y,
-                                   std::int64_t incy, float *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, lda, dependencies);
-    return done;
-}
-
-static inline sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                                   double alpha, const double *x, std::int64_t incx,
-                                   const double *y, std::int64_t incy, double *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy,
-                             a, lda, dependencies);
-    return done;
-}
-
-static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                    std::int64_t n, std::int64_t k, float alpha, const float *a,
-                                    std::int64_t lda, const float *b, std::int64_t ldb, float beta,
-                                    float *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                              b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                    std::int64_t n, std::int64_t k, double alpha, const double *a,
-                                    std::int64_t lda, const double *b, std::int64_t ldb,
-                                    double beta, double *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                              b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                    std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    const std::complex<float> *b, std::int64_t ldb,
-                                    std::complex<float> beta, std::complex<float> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                              b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                    std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    const std::complex<double> *b, std::int64_t ldb,
-                                    std::complex<double> beta, std::complex<double> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                              b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   std::int64_t n, std::int64_t k, float alpha, const float *a,
-                                   std::int64_t lda, float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                             beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   std::int64_t n, std::int64_t k, double alpha, const double *a,
-                                   std::int64_t lda, double beta, double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                             beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                             beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda,
-                             beta, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower,
-                                         transpose *trans, std::int64_t *n, std::int64_t *k,
-                                         float *alpha, const float **a, std::int64_t *lda,
-                                         float *beta, float **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower,
-                                         transpose *trans, std::int64_t *n, std::int64_t *k,
-                                         double *alpha, const double **a, std::int64_t *lda,
-                                         double *beta, double **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower,
-                                         transpose *trans, std::int64_t *n, std::int64_t *k,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         std::int64_t *lda, std::complex<float> *beta,
-                                         std::complex<float> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower,
-                                         transpose *trans, std::int64_t *n, std::int64_t *k,
-                                         std::complex<double> *alpha,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         std::complex<double> *beta, std::complex<double> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, beta, c, ldc, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                         std::int64_t n, std::int64_t k, float alpha,
-                                         const float *a, std::int64_t lda, std::int64_t stride_a,
-                                         float beta, float *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                         std::int64_t n, std::int64_t k, double alpha,
-                                         const double *a, std::int64_t lda, std::int64_t stride_a,
-                                         double beta, double *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                         std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> beta,
-                                         std::complex<float> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                         std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> beta,
-                                         std::complex<double> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a,
-                                   lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                                   std::int64_t lda, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                                   std::int64_t lda, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a,
-                             lda, x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const float *a, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const double *a, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const float *a, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const double *a, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x,
-                             incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   float alpha, const float *a, std::int64_t lda, float *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   double alpha, const double *a, std::int64_t lda, double *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   float alpha, const float *a, std::int64_t lda, float *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   double alpha, const double *a, std::int64_t lda, double *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                             m, n, alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                                         transpose trans, diag unit_diag, std::int64_t m,
-                                         std::int64_t n, float alpha, const float *a,
-                                         std::int64_t lda, std::int64_t stride_a, float *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans,
-                                   unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                                         transpose trans, diag unit_diag, std::int64_t m,
-                                         std::int64_t n, double alpha, const double *a,
-                                         std::int64_t lda, std::int64_t stride_a, double *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans,
-                                   unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                                         transpose trans, diag unit_diag, std::int64_t m,
-                                         std::int64_t n, std::complex<float> alpha,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans,
-                                   unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                                         transpose trans, diag unit_diag, std::int64_t m,
-                                         std::int64_t n, std::complex<double> alpha,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans,
-                                   unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                   batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(sycl::queue &queue, side *left_right,
-                                         uplo *upper_lower, transpose *trans, diag *unit_diag,
-                                         std::int64_t *m, std::int64_t *n, float *alpha,
-                                         const float **a, std::int64_t *lda, float **b,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                           m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(sycl::queue &queue, side *left_right,
-                                         uplo *upper_lower, transpose *trans, diag *unit_diag,
-                                         std::int64_t *m, std::int64_t *n, double *alpha,
-                                         const double **a, std::int64_t *lda, double **b,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                           m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(
-    sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, diag *unit_diag,
-    std::int64_t *m, std::int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-    std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                           m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsm_batch(
-    sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, diag *unit_diag,
-    std::int64_t *m, std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-    std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {}) {
-    auto done =
-        detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag,
-                           m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const float *a, std::int64_t lda,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda,
-                             x, incx, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, float alpha, const float *a,
-                                         std::int64_t lda, std::int64_t stride_a, float *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda,
-                                       stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, double alpha, const double *a,
-                                         std::int64_t lda, std::int64_t stride_a, double *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda,
-                                       stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, std::complex<float> alpha,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda,
-                                       stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, std::complex<double> alpha,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda,
-                                       stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, double alpha, double *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, std::complex<float> alpha,
-                                         std::complex<float> *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m,
-                                         std::int64_t n, std::complex<double> alpha,
-                                         std::complex<double> *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                        std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                        std::int64_t lda, std::int64_t stride_a, float beta,
-                                        const float *b, std::int64_t ldb, std::int64_t stride_b,
-                                        float *c, std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a,
-                                      lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c,
-                                      batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                        std::int64_t m, std::int64_t n, double alpha,
-                                        const double *a, std::int64_t lda, std::int64_t stride_a,
-                                        double beta, const double *b, std::int64_t ldb,
-                                        std::int64_t stride_b, double *c, std::int64_t ldc,
-                                        std::int64_t stride_c, std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a,
-                                      lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c,
-                                      batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd_batch(
-    sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-    std::int64_t stride_a, std::complex<float> beta, const std::complex<float> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a,
-                                      lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c,
-                                      batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb,
-                                        std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                        const std::complex<double> *a, std::int64_t lda,
-                                        std::int64_t stride_a, std::complex<double> beta,
-                                        const std::complex<double> *b, std::int64_t ldb,
-                                        std::int64_t stride_b, std::complex<double> *c,
-                                        std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a,
-                                      lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c,
-                                      batch_size, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                                   float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                                   double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m,
-                                    std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                                    std::int64_t stridea, float *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea,
-                                  b, ldb, strideb, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m,
-                                    std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                                    std::int64_t stridea, double *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea,
-                                  b, ldb, strideb, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m,
-                                    std::int64_t n, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<float> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea,
-                                  b, ldb, strideb, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m,
-                                    std::int64_t n, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<double> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea,
-                                  b, ldb, strideb, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, double alpha, double *ab, std::int64_t lda,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb,
-                                 dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb,
-                                  std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                  std::int64_t lda, float beta, const float *b, std::int64_t ldb,
-                                  float *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                                beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb,
-                                  std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                  std::int64_t lda, double beta, const double *b, std::int64_t ldb,
-                                  double *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                                beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb,
-                                  std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                  const std::complex<float> *a, std::int64_t lda,
-                                  std::complex<float> beta, const std::complex<float> *b,
-                                  std::int64_t ldb, std::complex<float> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                                beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb,
-                                  std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                  const std::complex<double> *a, std::int64_t lda,
-                                  std::complex<double> beta, const std::complex<double> *b,
-                                  std::int64_t ldb, std::complex<double> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {}) {
-    auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda,
-                                beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, float* alpha, const float** a,
-                                         std::int64_t* lda, float** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, double* alpha, const double** a,
-                                         std::int64_t* lda, double** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<float>* alpha,
-                                         const std::complex<float>** a, std::int64_t* lda,
-                                         std::complex<float>** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
-
-static inline sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<double>* alpha,
-                                         const std::complex<double>** a, std::int64_t* lda,
-                                         std::complex<double>** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, float* alpha, float** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, double* alpha, double** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<float>* alpha,
-                                         std::complex<float>** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
-
-static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<double>* alpha,
-                                         std::complex<double>** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {}) {
-    auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda,
-                                       ldb, group_count, groupsize, dependencies);
-    return done;
-}
diff --git a/include/oneapi/mkl/blas/detail/blas_ct_backends.hpp b/include/oneapi/mkl/blas/detail/blas_ct_backends.hpp
deleted file mode 100644
index eb894b5b9..000000000
--- a/include/oneapi/mkl/blas/detail/blas_ct_backends.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _BLAS_CT_BACKENDS_HPP__
-#define _BLAS_CT_BACKENDS_HPP__
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#define BACKEND mklcpu
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND mklgpu
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND cublas
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND rocblas
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND netlib
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND portblas
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-
-} //namespace column_major
-namespace row_major {
-
-#define BACKEND mklcpu
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND mklgpu
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND cublas
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND rocblas
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND netlib
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-#define BACKEND portblas
-#include "blas_ct_backends.hxx"
-#undef BACKEND
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_BLAS_CT_BACKENDS_HPP__
diff --git a/include/oneapi/mkl/blas/detail/blas_ct_backends.hxx b/include/oneapi/mkl/blas/detail/blas_ct_backends.hxx
deleted file mode 100644
index afebb93c3..000000000
--- a/include/oneapi/mkl/blas/detail/blas_ct_backends.hxx
+++ /dev/null
@@ -1,2966 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-static inline void syr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a, std::int64_t lda);
-
-static inline void syr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a, std::int64_t lda);
-
-static inline void scal(backend_selector<backend::BACKEND> selector, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-static inline void scal(backend_selector<backend::BACKEND> selector, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-static inline void scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
-
-static inline void scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
-
-static inline void scal(backend_selector<backend::BACKEND> selector, std::int64_t n, float alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-static inline void scal(backend_selector<backend::BACKEND> selector, std::int64_t n, double alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-static inline void trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-static inline void trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-static inline void trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-static inline void trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-static inline void tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx);
-
-static inline void tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-                        std::int64_t incx);
-
-static inline void tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-static inline void tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-static inline void spr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &a);
-
-static inline void spr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &a);
-
-static inline void hpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-static inline void hpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-static inline void syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-static inline void syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-static inline void syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
-
-static inline void syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
-
-static inline void syrk_batch(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                              sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void syrk_batch(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                              sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void syrk_batch(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void syrk_batch(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void her2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-static inline void her2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-static inline void hbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-static inline void hbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-static inline void rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                       sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                       float s);
-
-static inline void rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                       sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-                       double s);
-
-static inline void rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s);
-
-static inline void rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s);
-
-static inline void axpy(backend_selector<backend::BACKEND> selector, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void axpy(backend_selector<backend::BACKEND> selector, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void axpy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-static inline void axpy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-static inline void axpy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void axpy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void axpy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void axpy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void axpby(backend_selector<backend::BACKEND> selector, std::int64_t n, float alpha,
-                         sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                         sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void axpby(backend_selector<backend::BACKEND> selector, std::int64_t n, double alpha,
-                         sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                         sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void axpby(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                         std::int64_t incx, std::complex<float> beta,
-                         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-static inline void axpby(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                         std::int64_t incx, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-static inline void gerc(backend_selector<backend::BACKEND> selector, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
-
-static inline void gerc(backend_selector<backend::BACKEND> selector, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
-
-static inline void syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                         sycl::buffer<float, 1> &a, std::int64_t lda,
-                         sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-static inline void syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                         sycl::buffer<double, 1> &a, std::int64_t lda,
-                         sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-static inline void syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                         std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc);
-
-static inline void syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-static inline void gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-static inline void gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-static inline void gemv_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                              std::int64_t m, std::int64_t n, float alpha,
-                              sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, float beta, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void gemv_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                              std::int64_t m, std::int64_t n, double alpha,
-                              sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<double, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, double beta,
-                              sycl::buffer<double, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-static inline void gemv_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                              std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-static inline void gemv_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                              std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-static inline void dgmm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              std::int64_t m, std::int64_t n, sycl::buffer<float, 1> &a,
-                              std::int64_t lda, std::int64_t stridea, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                              std::int64_t batch_size);
-
-static inline void dgmm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              std::int64_t m, std::int64_t n, sycl::buffer<double, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &c,
-                              std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size);
-
-static inline void dgmm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stridec, std::int64_t batch_size);
-
-static inline void dgmm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stridec, std::int64_t batch_size);
-
-static inline void her(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<float>, 1> &a,
-                       std::int64_t lda);
-
-static inline void her(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a,
-                       std::int64_t lda);
-
-static inline void hpr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<float>, 1> &a);
-
-static inline void hpr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a);
-
-static inline void iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<float, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<double, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, double beta,
-                              sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, sycl::half beta,
-                              sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                              std::int64_t batch_size);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                              std::int64_t batch_size);
-
-static inline void gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void spmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void spmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void gemm_bias(backend_selector<backend::BACKEND> selector, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<int8_t, 1> &a,
-                             std::int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-                             std::int64_t ldb, uint8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-static inline void gemm_bias(backend_selector<backend::BACKEND> selector, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<int8_t, 1> &a,
-                             std::int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-                             std::int64_t ldb, int8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-static inline void gemm_bias(backend_selector<backend::BACKEND> selector, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a,
-                             std::int64_t lda, uint8_t ao, sycl::buffer<int8_t, 1> &b,
-                             std::int64_t ldb, int8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-static inline void gemm_bias(backend_selector<backend::BACKEND> selector, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a,
-                             std::int64_t lda, uint8_t ao, sycl::buffer<uint8_t, 1> &b,
-                             std::int64_t ldb, uint8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-static inline void swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-static inline void swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-static inline void geru(backend_selector<backend::BACKEND> selector, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
-
-static inline void geru(backend_selector<backend::BACKEND> selector, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
-
-static inline void nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-
-static inline void nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-static inline void nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-
-static inline void nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-static inline void gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-static inline void gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-static inline void gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                        std::int64_t ldb, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-static inline void gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-static inline void gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::half alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                        sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, sycl::half beta,
-                        sycl::buffer<sycl::half, 1> &c, std::int64_t ldc);
-
-static inline void gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                        sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-static inline void gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float alpha, sycl::buffer<bfloat16, 1> &a, std::int64_t lda,
-                        sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-static inline void herk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, float beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-static inline void herk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-static inline void ger(backend_selector<backend::BACKEND> selector, std::int64_t m, std::int64_t n,
-                       float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &a, std::int64_t lda);
-
-static inline void ger(backend_selector<backend::BACKEND> selector, std::int64_t m, std::int64_t n,
-                       double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &a, std::int64_t lda);
-
-static inline void trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-static inline void trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-static inline void trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-static inline void trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-static inline void dotu(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result);
-
-static inline void dotu(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result);
-
-static inline void hemm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
-
-static inline void hemm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-static inline void hpr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &a);
-
-static inline void hpr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &a);
-
-static inline void gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-static inline void gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-static inline void tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-static inline void tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-static inline void tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-static inline void tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-static inline void symm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-static inline void symm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-static inline void symm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
-
-static inline void symm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-static inline void dotc(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result);
-
-static inline void dotc(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result);
-
-static inline void syr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &a, std::int64_t lda);
-
-static inline void syr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &a, std::int64_t lda);
-
-static inline void trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-static inline void trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-static inline void trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-static inline void trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-static inline void rotmg(backend_selector<backend::BACKEND> selector,
-                         sycl::buffer<float, 1> &d1, sycl::buffer<float, 1> &d2,
-                         sycl::buffer<float, 1> &x1, float y1,
-                         sycl::buffer<float, 1> &param);
-
-static inline void rotmg(backend_selector<backend::BACKEND> selector,
-                         sycl::buffer<double, 1> &d1, sycl::buffer<double, 1> &d2,
-                         sycl::buffer<double, 1> &x1, double y1,
-                         sycl::buffer<double, 1> &param);
-
-static inline void tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx);
-
-static inline void tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-                        std::int64_t incx);
-
-static inline void tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-static inline void tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-static inline void trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-static inline void trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-static inline void trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-static inline void trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-static inline void copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-static inline void copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-static inline void copy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void copy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void copy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void copy_batch(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-static inline void hemv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-static inline void hemv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-static inline void gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                         sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-static inline void gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                         sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-static inline void gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, std::complex<float> beta,
-                         sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-static inline void gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-static inline void sbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void sbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-
-static inline void asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-static inline void asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-
-static inline void asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-static inline void tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-static inline void tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-static inline void tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-static inline void tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-static inline void spr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a);
-
-static inline void spr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a);
-
-static inline void iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<float, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<double, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-static inline void trsm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<float, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void trsm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<double, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void trsm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void trsm_batch(backend_selector<backend::BACKEND> selector, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void rotm(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &param);
-
-static inline void rotm(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &param);
-
-static inline void rotg(backend_selector<backend::BACKEND> selector, sycl::buffer<float, 1> &a,
-                        sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-                        sycl::buffer<float, 1> &s);
-
-static inline void rotg(backend_selector<backend::BACKEND> selector, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-                        sycl::buffer<double, 1> &s);
-
-static inline void rotg(backend_selector<backend::BACKEND> selector,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-                        sycl::buffer<std::complex<float>, 1> &s);
-
-static inline void rotg(backend_selector<backend::BACKEND> selector,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &b,
-                        sycl::buffer<double, 1> &c,
-                        sycl::buffer<std::complex<double>, 1> &s);
-
-static inline void sdsdot(backend_selector<backend::BACKEND> selector, std::int64_t n, float sb,
-                          sycl::buffer<float, 1> &x, std::int64_t incx,
-                          sycl::buffer<float, 1> &y, std::int64_t incy,
-                          sycl::buffer<float, 1> &result);
-
-static inline void her2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-static inline void her2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, double beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-static inline void dot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &result);
-
-static inline void dot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result);
-
-static inline void dot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result);
-
-static inline void symv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-static inline void symv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                        double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-static inline void omatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, float alpha,
-                                  sycl::buffer<float, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void omatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, double alpha,
-                                  sycl::buffer<double, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void omatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void omatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-static inline void imatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, float alpha,
-                                  sycl::buffer<float, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                                  std::int64_t stride, std::int64_t batch_size);
-
-static inline void imatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, double alpha,
-                                  sycl::buffer<double, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                                  std::int64_t stride, std::int64_t batch_size);
-
-static inline void imatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size);
-
-static inline void imatcopy_batch(backend_selector<backend::BACKEND> selector, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size);
-
-static inline void omatadd_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                                 sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                                 float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                 std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void omatadd_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                                 sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                                 std::int64_t ldb, std::int64_t stride_b,
-                                 sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                 std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void omatadd_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void omatadd_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-static inline void omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                            std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-static inline void omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, double alpha,
-                            sycl::buffer<double, 1> &a, std::int64_t lda,
-                            sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-static inline void omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                            sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                            sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-static inline void omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                            sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                            sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-static inline void omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                             std::int64_t lda, std::int64_t stridea, sycl::buffer<float, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-static inline void omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                             std::int64_t m, std::int64_t n, double alpha,
-                             sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                             sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t strideb);
-
-static inline void omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-static inline void omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-static inline void imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &ab,
-                            std::int64_t lda, std::int64_t ldb);
-
-static inline void imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, double alpha,
-                            sycl::buffer<double, 1> &ab, std::int64_t lda, std::int64_t ldb);
-
-static inline void imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                            sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb);
-
-static inline void imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                            sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb);
-
-static inline void omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                           sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-                           sycl::buffer<float, 1> &b, std::int64_t ldb, sycl::buffer<float, 1> &c,
-                           std::int64_t ldc);
-
-static inline void omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                           sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-                           sycl::buffer<double, 1> &b, std::int64_t ldb, sycl::buffer<double, 1> &c,
-                           std::int64_t ldc);
-
-static inline void omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n,
-                           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                           std::int64_t lda, std::complex<float> beta,
-                           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-static inline void omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n,
-                           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                           std::int64_t lda, std::complex<double> beta,
-                           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-// USM APIs
-
-static inline sycl::event syr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                   const float *y, std::int64_t incy, float *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                   const double *y, std::int64_t incy, double *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   float alpha, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   double alpha, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   std::complex<float> alpha, std::complex<float> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   std::complex<double> alpha, std::complex<double> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   float alpha, std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event scal(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   double alpha, std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const float *a,
-                                   std::int64_t lda, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const float *a,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const double *a,
-                                   double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<float> *a, std::complex<float> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<double> *a, std::complex<double> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event spr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                  float *a, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event spr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                  double *a, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hpmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                                   const float *a, std::int64_t lda, float beta, float *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                                   const double *a, std::int64_t lda, double beta, double *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, std::int64_t n, std::int64_t k,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> beta,
-                                   std::complex<float> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, std::int64_t n, std::int64_t k,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, float *alpha, const float **a,
-                                         std::int64_t *lda, float *beta, float **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, double *alpha, const double **a,
-                                         std::int64_t *lda, double *beta, double **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, std::complex<float> *alpha,
-                                         const std::complex<float> **a, std::int64_t *lda,
-                                         std::complex<float> *beta, std::complex<float> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, std::complex<double> *alpha,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         std::complex<double> *beta, std::complex<double> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, float alpha, const float *a,
-                                         std::int64_t lda, std::int64_t stride_a, float beta,
-                                         float *c, std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, double alpha, const double *a,
-                                         std::int64_t lda, std::int64_t stride_a, double beta,
-                                         double *c, std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, std::complex<float> alpha,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> beta,
-                                         std::complex<float> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syrk_batch(backend_selector<backend::BACKEND> selector,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, std::complex<double> alpha,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> beta,
-                                         std::complex<double> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event her2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event her2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                  std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                                  std::int64_t incy, float c, float s,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                  std::complex<double> *x, std::int64_t incx,
-                                  std::complex<double> *y, std::int64_t incy, double c, double s,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                  float *x, std::int64_t incx, float *y, std::int64_t incy, float c,
-                                  float s, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                  double *x, std::int64_t incx, double *y, std::int64_t incy,
-                                  double c, double s,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   float alpha, const float *x, std::int64_t incx, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   double alpha, const double *x, std::int64_t incx, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, float *alpha, const float **x,
-                                         std::int64_t *incx, float **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, double *alpha, const double **x,
-                                         std::int64_t *incx, double **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, std::complex<float> *alpha,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, std::complex<double> *alpha,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, float alpha, const float *x,
-                                         std::int64_t incx, std::int64_t stridex, float *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, double alpha, const double *x,
-                                         std::int64_t incx, std::int64_t stridex, double *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, std::complex<float> alpha,
-                                         const std::complex<float> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<float> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, std::complex<double> alpha,
-                                         const std::complex<double> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<double> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpby(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    float alpha, const float *x, std::int64_t incx,
-                                    const float beta, float *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpby(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    double alpha, const double *x, std::int64_t incx,
-                                    const double beta, double *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpby(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    std::complex<float> alpha, const std::complex<float> *x,
-                                    std::int64_t incx, const std::complex<float> beta,
-                                    std::complex<float> *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event axpby(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    std::complex<double> alpha, const std::complex<double> *x,
-                                    std::int64_t incx, const std::complex<double> beta,
-                                    std::complex<double> *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gerc(backend_selector<backend::BACKEND> selector, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gerc(backend_selector<backend::BACKEND> selector, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                                    const float *a, std::int64_t lda, const float *b,
-                                    std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                                    const double *a, std::int64_t lda, const double *b,
-                                    std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<float> alpha, const std::complex<float> *a,
-                                    std::int64_t lda, const std::complex<float> *b,
-                                    std::int64_t ldb, std::complex<float> beta,
-                                    std::complex<float> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syr2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<double> alpha, const std::complex<double> *a,
-                                    std::int64_t lda, const std::complex<double> *b,
-                                    std::int64_t ldb, std::complex<double> beta,
-                                    std::complex<double> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, const float *x, std::int64_t incx, float beta,
-                                   float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, const double *x, std::int64_t incx,
-                                   double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         float alpha, const float *a, std::int64_t lda,
-                                         std::int64_t stridea, const float *x, std::int64_t incx,
-                                         std::int64_t stridex, float beta, float *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         double alpha, const double *a, std::int64_t lda,
-                                         std::int64_t stridea, const double *x, std::int64_t incx,
-                                         std::int64_t stridex, double beta, double *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(
-    backend_selector<backend::BACKEND> selector, transpose trans, std::int64_t m, std::int64_t n,
-    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-    const std::complex<float> *x, std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-    std::complex<float> *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(
-    backend_selector<backend::BACKEND> selector, transpose trans, std::int64_t m, std::int64_t n,
-    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-    std::int64_t stridea, const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-    std::complex<double> beta, std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose *trans, std::int64_t *m, std::int64_t *n,
-                                         float *alpha, const float **a, std::int64_t *lda,
-                                         const float **x, std::int64_t *incx, float *beta,
-                                         float **y, std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose *trans, std::int64_t *m, std::int64_t *n,
-                                         double *alpha, const double **a, std::int64_t *lda,
-                                         const double **x, std::int64_t *incx, double *beta,
-                                         double **y, std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose *trans, std::int64_t *m, std::int64_t *n,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         std::int64_t *lda, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> *beta,
-                                         std::complex<float> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemv_batch(
-    backend_selector<backend::BACKEND> selector, transpose *trans, std::int64_t *m, std::int64_t *n,
-    std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-    const std::complex<double> **x, std::int64_t *incx, std::complex<double> *beta,
-    std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-    std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const float *a, std::int64_t lda, std::int64_t stridea,
-                                         const float *x, std::int64_t incx, std::int64_t stridex,
-                                         float *c, std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const double *a, std::int64_t lda, std::int64_t stridea,
-                                         const double *x, std::int64_t incx, std::int64_t stridex,
-                                         double *c, std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stridea, const std::complex<float> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<float> *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stridea, const std::complex<double> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<double> *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const float **a, std::int64_t *lda, const float **x,
-                                         std::int64_t *incx, float **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const double **a, std::int64_t *lda, const double **x,
-                                         std::int64_t *incx, double **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const std::complex<float> **a, std::int64_t *lda,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dgmm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event her(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, float alpha, const std::complex<float> *x,
-                                  std::int64_t incx, std::complex<float> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event her(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, double alpha, const std::complex<double> *x,
-                                  std::int64_t incx, std::complex<double> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hpr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, float alpha, const std::complex<float> *x,
-                                  std::int64_t incx, std::complex<float> *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hpr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, double alpha, const std::complex<double> *x,
-                                  std::int64_t incx, std::complex<double> *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const float *x, std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const double *x, std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamin(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const std::complex<double> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, float *alpha,
-                                         const float **a, std::int64_t *lda, const float **b,
-                                         std::int64_t *ldb, float *beta, float **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, double *alpha,
-                                         const double **a, std::int64_t *lda, const double **b,
-                                         std::int64_t *ldb, double *beta, double **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         std::int64_t *lda, const std::complex<float> **b,
-                                         std::int64_t *ldb, std::complex<float> *beta,
-                                         std::complex<float> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(
-    backend_selector<backend::BACKEND> selector, transpose *transa, transpose *transb,
-    std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-    const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **b,
-    std::int64_t *ldb, std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-    std::int64_t group_count, std::int64_t *group_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, sycl::half *alpha,
-                                         const sycl::half **a, std::int64_t *lda,
-                                         const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-                                         sycl::half **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector, transpose *transa,
-                                     transpose *transb, std::int64_t *m, std::int64_t *n,
-                                     std::int64_t *k, float *alpha, const sycl::half **a,
-                                     std::int64_t *lda, const sycl::half **b, std::int64_t *ldb,
-                                     float *beta, float **c, std::int64_t *ldc,
-                                     std::int64_t group_count, std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector, transpose *transa,
-                                     transpose *transb, std::int64_t *m, std::int64_t *n,
-                                     std::int64_t *k, float *alpha, const std::int8_t **a,
-                                     std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb,
-                                     float *beta, float **c, std::int64_t *ldc,
-                                     std::int64_t group_count, std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector, transpose *transa,
-                                     transpose *transb, std::int64_t *m, std::int64_t *n,
-                                     std::int64_t *k, float *alpha, const std::int8_t **a,
-                                     std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb,
-                                     float *beta, std::int32_t **c, std::int64_t *ldc,
-                                     std::int64_t group_count, std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose transa, transpose transb, std::int64_t m,
-                                         std::int64_t n, std::int64_t k, float alpha,
-                                         const float *a, std::int64_t lda, std::int64_t stride_a,
-                                         const float *b, std::int64_t ldb, std::int64_t stride_b,
-                                         float beta, float *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose transa, transpose transb, std::int64_t m,
-                                         std::int64_t n, std::int64_t k, double alpha,
-                                         const double *a, std::int64_t lda, std::int64_t stride_a,
-                                         const double *b, std::int64_t ldb, std::int64_t stride_b,
-                                         double beta, double *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-    std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-    std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-    std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-    std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, std::int64_t m,
-    std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, std::int64_t lda,
-    std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, std::int64_t stride_b,
-    sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                                     transpose transb, std::int64_t m, std::int64_t n,
-                                     std::int64_t k, float alpha, const sycl::half *a,
-                                     std::int64_t lda, std::int64_t stride_a, const sycl::half *b,
-                                     std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
-                                     std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                                     transpose transb, std::int64_t m, std::int64_t n,
-                                     std::int64_t k, float alpha, const std::int8_t *a,
-                                     std::int64_t lda, std::int64_t stride_a, const std::int8_t *b,
-                                     std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
-                                     std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_batch(backend_selector<backend::BACKEND> selector, transpose transa,
-                                     transpose transb, std::int64_t m, std::int64_t n,
-                                     std::int64_t k, float alpha, const std::int8_t *a,
-                                     std::int64_t lda, std::int64_t stride_a, const std::int8_t *b,
-                                     std::int64_t ldb, std::int64_t stride_b, float beta,
-                                     std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event spmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *a, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event spmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *a, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   float *x, std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   double *x, std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event swap(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event geru(backend_selector<backend::BACKEND> selector, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event geru(backend_selector<backend::BACKEND> selector, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const float *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event nrm2(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const double *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                                   transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                                   float alpha, const float *a, std::int64_t lda, const float *b,
-                                   std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                                   transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                                   double alpha, const double *a, std::int64_t lda, const double *b,
-                                   std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                                   transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                                   transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *b,
-                                   std::int64_t ldb, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                                   transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                                   sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                                   const sycl::half *b, std::int64_t ldb, sycl::half beta,
-                                   sycl::half *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                                   transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                                   float alpha, const sycl::half *a, std::int64_t lda,
-                                   const sycl::half *b, std::int64_t ldb, float beta, float *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm(backend_selector<backend::BACKEND> selector, transpose transa,
-                                   transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                                   float alpha, const bfloat16 *a, std::int64_t lda,
-                                   const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event herk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                                   const std::complex<float> *a, std::int64_t lda, float beta,
-                                   std::complex<float> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event herk(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                                   const std::complex<double> *a, std::int64_t lda, double beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event ger(backend_selector<backend::BACKEND> selector, std::int64_t m,
-                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                  const float *y, std::int64_t incy, float *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event ger(backend_selector<backend::BACKEND> selector, std::int64_t m,
-                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                  const double *y, std::int64_t incy, double *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n, float alpha,
-                                         const float *a, int64_t lda, int64_t stride_a, float *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n, double alpha,
-                                         const double *a, int64_t lda, int64_t stride_a, double *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n,
-                                         std::complex<float> alpha, const std::complex<float> *a,
-                                         int64_t lda, int64_t stride_a, std::complex<float> *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n,
-                                         std::complex<double> alpha, const std::complex<double> *a,
-                                         int64_t lda, int64_t stride_a, std::complex<double> *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n, float *alpha,
-                                         const float **a, int64_t *lda, float **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n, double *alpha,
-                                         const double **a, int64_t *lda, double **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         int64_t *lda, std::complex<float> **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsm_batch(backend_selector<backend::BACKEND> selector,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n,
-                                         std::complex<double> *alpha,
-                                         const std::complex<double> **a, int64_t *lda,
-                                         std::complex<double> **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dotu(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dotu(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hemm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hemm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *b,
-                                   std::int64_t ldb, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hpr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hpr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   float alpha, const float *a, std::int64_t lda, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   double alpha, const double *a, std::int64_t lda, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gbmv(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event symm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, std::int64_t m, std::int64_t n, float alpha,
-                                   const float *a, std::int64_t lda, const float *b,
-                                   std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event symm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, std::int64_t m, std::int64_t n, double alpha,
-                                   const double *a, std::int64_t lda, const double *b,
-                                   std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event symm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event symm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *b,
-                                   std::int64_t ldb, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dotc(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dotc(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                  float *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event syr(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                  double *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trmm(backend_selector<backend::BACKEND> selector, side left_right,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotmg(backend_selector<backend::BACKEND> selector, float *d1,
-                                    float *d2, float *x1, float y1, float *param,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotmg(backend_selector<backend::BACKEND> selector, double *d1,
-                                    double *d2, double *x1, double y1, double *param,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const float *a,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const double *a,
-                                   double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<float> *a, std::complex<float> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tpsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<double> *a, std::complex<double> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const float *a,
-                                   std::int64_t lda, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event trsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, const float **x, std::int64_t *incx,
-                                         float **y, std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, const double **x, std::int64_t *incx,
-                                         double **y, std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t *n, const std::complex<double> **x,
-                                         std::int64_t *incx, std::complex<double> **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, const float *x, std::int64_t incx,
-                                         std::int64_t stridex, float *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, const double *x, std::int64_t incx,
-                                         std::int64_t stridex, double *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, const std::complex<float> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<float> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event copy_batch(backend_selector<backend::BACKEND> selector,
-                                         std::int64_t n, const std::complex<double> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<double> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hemv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event hemv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose transa, transpose transb, std::int64_t n,
-                                    std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                                    const float *b, std::int64_t ldb, float beta, float *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose transa, transpose transb, std::int64_t n,
-                                    std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                                    const double *b, std::int64_t ldb, double beta, double *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose transa, transpose transb, std::int64_t n,
-                                    std::int64_t k, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    const std::complex<float> *b, std::int64_t ldb,
-                                    std::complex<float> beta, std::complex<float> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemmt(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose transa, transpose transb, std::int64_t n,
-                                    std::int64_t k, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    const std::complex<double> *b, std::int64_t ldb,
-                                    std::complex<double> beta, std::complex<double> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_bias(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, offset offsetc,
-    int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao,
-    const std::uint8_t *b, int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, int64_t ldc,
-    const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_bias(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, offset offsetc,
-    int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao,
-    const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, int64_t ldc,
-    const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_bias(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, offset offsetc,
-    int64_t m, int64_t n, int64_t k, float alpha, const std::uint8_t *a, int64_t lda,
-    std::uint8_t ao, const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, std::int32_t *c,
-    int64_t ldc, const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event gemm_bias(backend_selector<backend::BACKEND> selector,
-                                        transpose transa, transpose transb, offset offsetc,
-                                        int64_t m, int64_t n, int64_t k, float alpha,
-                                        const std::uint8_t *a, int64_t lda, std::uint8_t ao,
-                                        const std::uint8_t *b, int64_t ldb, std::uint8_t bo,
-                                        float beta, std::int32_t *c, int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event sbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, float alpha, const float *a,
-                                   std::int64_t lda, const float *x, std::int64_t incx, float beta,
-                                   float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event sbmv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, double alpha, const double *a,
-                                   std::int64_t lda, const double *x, std::int64_t incx,
-                                   double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const float *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event asum(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   const double *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event tbsv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event spr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                   const float *y, std::int64_t incy, float *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event spr2(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                   const double *y, std::int64_t incy, double *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const float *x, std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const double *x, std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event iamax(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                    const std::complex<double> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotm(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   float *x, std::int64_t incx, float *y, std::int64_t incy,
-                                   float *param,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotm(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                   double *x, std::int64_t incx, double *y, std::int64_t incy,
-                                   double *param,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotg(backend_selector<backend::BACKEND> selector, float *a, float *b,
-                                   float *c, float *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotg(backend_selector<backend::BACKEND> selector, double *a,
-                                   double *b, double *c, double *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotg(backend_selector<backend::BACKEND> selector,
-                                   std::complex<float> *a, std::complex<float> *b, float *c,
-                                   std::complex<float> *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event rotg(backend_selector<backend::BACKEND> selector,
-                                   std::complex<double> *a, std::complex<double> *b, double *c,
-                                   std::complex<double> *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event sdsdot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                     float sb, const float *x, std::int64_t incx, const float *y,
-                                     std::int64_t incy, float *result,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event her2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<float> alpha, const std::complex<float> *a,
-                                    std::int64_t lda, const std::complex<float> *b,
-                                    std::int64_t ldb, float beta, std::complex<float> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event her2k(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                    transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<double> alpha, const std::complex<double> *a,
-                                    std::int64_t lda, const std::complex<double> *b,
-                                    std::int64_t ldb, double beta, std::complex<double> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                  const float *x, std::int64_t incx, const float *y,
-                                  std::int64_t incy, float *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                  const double *x, std::int64_t incx, const double *y,
-                                  std::int64_t incy, double *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event dot(backend_selector<backend::BACKEND> selector, std::int64_t n,
-                                  const float *x, std::int64_t incx, const float *y,
-                                  std::int64_t incy, double *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event symv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                                   const float *x, std::int64_t incx, float beta, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event symv(backend_selector<backend::BACKEND> selector, uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                                   const double *x, std::int64_t incx, double beta, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         float alpha, const float *a, std::int64_t lda,
-                                         std::int64_t stride_a, float *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         double alpha, const double *a, std::int64_t lda,
-                                         std::int64_t stride_a, double *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<float> alpha, const std::complex<float> *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::complex<float> *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<double> alpha, const std::complex<double> *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::complex<double> *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                                         std::int64_t stride, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         double alpha, double *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<float> alpha, std::complex<float> *ab,
-                                         std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<double> alpha, std::complex<double> *ab,
-                                         std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd_batch(backend_selector<backend::BACKEND> selector,
-                                        transpose transa, transpose transb, std::int64_t m,
-                                        std::int64_t n, float alpha, const float *a,
-                                        std::int64_t lda, std::int64_t stride_a, float beta,
-                                        const float *b, std::int64_t ldb, std::int64_t stride_b,
-                                        float *c, std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd_batch(backend_selector<backend::BACKEND> selector,
-                                        transpose transa, transpose transb, std::int64_t m,
-                                        std::int64_t n, double alpha, const double *a,
-                                        std::int64_t lda, std::int64_t stride_a, double beta,
-                                        const double *b, std::int64_t ldb, std::int64_t stride_b,
-                                        double *c, std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd_batch(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, std::int64_t m,
-    std::int64_t n, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-    std::int64_t stride_a, std::complex<float> beta, const std::complex<float> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd_batch(
-    backend_selector<backend::BACKEND> selector, transpose transa, transpose transb, std::int64_t m,
-    std::int64_t n, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-    std::int64_t stride_a, std::complex<double> beta, const std::complex<double> *b,
-    std::int64_t ldb, std::int64_t stride_b, std::complex<double> *c, std::int64_t ldc,
-    std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                                    std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                    std::int64_t lda, std::int64_t stridea, float *b,
-                                    std::int64_t ldb, std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                                    std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                    std::int64_t lda, std::int64_t stridea, double *b,
-                                    std::int64_t ldb, std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<float> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy2(backend_selector<backend::BACKEND> selector, transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<double> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, float alpha, float *ab,
-                                   std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, double alpha, double *ab,
-                                   std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event imatcopy(backend_selector<backend::BACKEND> selector, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                                  const float *a, std::int64_t lda, float beta, const float *b,
-                                  std::int64_t ldb, float *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                                  const double *a, std::int64_t lda, double beta, const double *b,
-                                  std::int64_t ldb, double *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n,
-                                  std::complex<float> alpha, const std::complex<float> *a,
-                                  std::int64_t lda, std::complex<float> beta,
-                                  const std::complex<float> *b, std::int64_t ldb,
-                                  std::complex<float> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatadd(backend_selector<backend::BACKEND> selector, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n,
-                                  std::complex<double> alpha, const std::complex<double> *a,
-                                  std::int64_t lda, std::complex<double> beta,
-                                  const std::complex<double> *b, std::int64_t ldb,
-                                  std::complex<double> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         float* alpha, const float** a, std::int64_t* lda,
-                                         float** b, std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         double* alpha, const double** a, std::int64_t* lda,
-                                         double** b, std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<float>* alpha, const std::complex<float>** a,
-                                         std::int64_t* lda, std::complex<float>** b,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-static inline sycl::event omatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<double>* alpha,
-                                         const std::complex<double>** a, std::int64_t* lda,
-                                         std::complex<double>** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         float* alpha, float** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         double* alpha, double** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<float>* alpha, std::complex<float>** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-static inline sycl::event imatcopy_batch(backend_selector<backend::BACKEND> selector,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<double>* alpha, std::complex<double>** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
diff --git a/include/oneapi/mkl/blas/detail/blas_loader.hpp b/include/oneapi/mkl/blas/detail/blas_loader.hpp
deleted file mode 100644
index 665f5dc80..000000000
--- a/include/oneapi/mkl/blas/detail/blas_loader.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BLAS_LOADER_HPP_
-#define _ONEMKL_BLAS_LOADER_HPP_
-
-#include <complex>
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-namespace detail {
-
-#include "blas_loader.hxx"
-
-} //namespace detail
-} //namespace column_major
-namespace row_major {
-namespace detail {
-
-#include "blas_loader.hxx"
-
-} //namespace detail
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BLAS_LOADER_HPP_
diff --git a/include/oneapi/mkl/blas/detail/blas_loader.hxx b/include/oneapi/mkl/blas/detail/blas_loader.hxx
deleted file mode 100644
index 98d93b2ad..000000000
--- a/include/oneapi/mkl/blas/detail/blas_loader.hxx
+++ /dev/null
@@ -1,2699 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-ONEMKL_EXPORT void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, float beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        float alpha, sycl::buffer<float, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        double alpha, sycl::buffer<double, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
-ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
-ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        float alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
-ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx);
-ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-                        std::int64_t incx);
-ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &a);
-ONEMKL_EXPORT void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &a);
-
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, double beta,
-                              sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, sycl::half beta,
-                              sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                              std::int64_t batch_size);
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                              std::int64_t batch_size);
-ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                              transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, float beta,
-                              sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
-ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
-
-ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                              sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                              sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                              transpose trans, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-ONEMKL_EXPORT void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-ONEMKL_EXPORT void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                       float s);
-ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-                       double s);
-ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s);
-ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s);
-
-ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                         sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                         double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                         std::int64_t incx, std::complex<float> beta,
-                         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                         std::int64_t incx, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-ONEMKL_EXPORT void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                         sycl::buffer<float, 1> &a, std::int64_t lda,
-                         sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<float, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                         sycl::buffer<double, 1> &a, std::int64_t lda,
-                         sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         sycl::buffer<double, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                         std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc);
-ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                              std::int64_t m, std::int64_t n, float alpha,
-                              sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, float beta, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                              std::int64_t m, std::int64_t n, double alpha,
-                              sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<double, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, double beta,
-                              sycl::buffer<double, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                              std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                              std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              std::int64_t m, std::int64_t n, sycl::buffer<float, 1> &a,
-                              std::int64_t lda, std::int64_t stridea, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                              std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              std::int64_t m, std::int64_t n, sycl::buffer<double, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &c,
-                              std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stridec, std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stridec, std::int64_t batch_size);
-
-ONEMKL_EXPORT void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<float>, 1> &a,
-                       std::int64_t lda);
-ONEMKL_EXPORT void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a,
-                       std::int64_t lda);
-
-ONEMKL_EXPORT void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<float>, 1> &a);
-ONEMKL_EXPORT void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a);
-
-ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<int8_t, 1> &a,
-                             std::int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-                             std::int64_t ldb, uint8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<int8_t, 1> &a,
-                             std::int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-                             std::int64_t ldb, int8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a,
-                             std::int64_t lda, uint8_t ao, sycl::buffer<int8_t, 1> &b,
-                             std::int64_t ldb, int8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                             transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                             std::int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a,
-                             std::int64_t lda, uint8_t ao, sycl::buffer<uint8_t, 1> &b,
-                             std::int64_t ldb, uint8_t bo, float beta,
-                             sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<float, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<double, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-ONEMKL_EXPORT void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void rotmg(oneapi::mkl::device libkey, sycl::queue &queue,
-                         sycl::buffer<float, 1> &d1, sycl::buffer<float, 1> &d2,
-                         sycl::buffer<float, 1> &x1, float y1,
-                         sycl::buffer<float, 1> &param);
-ONEMKL_EXPORT void rotmg(oneapi::mkl::device libkey, sycl::queue &queue,
-                         sycl::buffer<double, 1> &d1, sycl::buffer<double, 1> &d2,
-                         sycl::buffer<double, 1> &x1, double y1,
-                         sycl::buffer<double, 1> &param);
-
-ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-ONEMKL_EXPORT void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                         sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<float, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                         sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         sycl::buffer<double, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, std::complex<float> beta,
-                         sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                        std::int64_t ldb, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::half alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                        sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, sycl::half beta,
-                        sycl::buffer<sycl::half, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                        sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                        transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float alpha, sycl::buffer<bfloat16, 1> &a, std::int64_t lda,
-                        sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a, std::int64_t lda);
-ONEMKL_EXPORT void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                       std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &a, std::int64_t lda);
-ONEMKL_EXPORT void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                       std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result);
-ONEMKL_EXPORT void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result);
-
-ONEMKL_EXPORT void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
-ONEMKL_EXPORT void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &a);
-ONEMKL_EXPORT void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &a);
-
-ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
-ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result);
-ONEMKL_EXPORT void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result);
-
-ONEMKL_EXPORT void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &a, std::int64_t lda);
-ONEMKL_EXPORT void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                        double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx);
-ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-                        std::int64_t incx);
-ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-ONEMKL_EXPORT void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<float, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<double, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-ONEMKL_EXPORT void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a);
-ONEMKL_EXPORT void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a);
-
-ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<float, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size);
-ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<double, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size);
-ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                              uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                              std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &param);
-ONEMKL_EXPORT void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &param);
-
-ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &result);
-ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result);
-ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                          float sb, sycl::buffer<float, 1> &x, std::int64_t incx,
-                          sycl::buffer<float, 1> &y, std::int64_t incy,
-                          sycl::buffer<float, 1> &result);
-
-ONEMKL_EXPORT void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, double beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &b,
-                        sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s);
-ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &b,
-                        sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s);
-ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-                        sycl::buffer<std::complex<float>, 1> &s);
-ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &b,
-                        sycl::buffer<double, 1> &c,
-                        sycl::buffer<std::complex<double>, 1> &s);
-
-ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, float alpha,
-                                  sycl::buffer<float, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, double alpha,
-                                  sycl::buffer<double, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, float alpha,
-                                  sycl::buffer<float, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                                  std::int64_t stride, std::int64_t batch_size);
-ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, double alpha,
-                                  sycl::buffer<double, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                                  std::int64_t stride, std::int64_t batch_size);
-ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size);
-ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                  std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                                 sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                                 float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                 std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                                 sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                                 std::int64_t ldb, std::int64_t stride_b,
-                                 sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                 std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                 transpose transb, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                            std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, double alpha,
-                            sycl::buffer<double, 1> &a, std::int64_t lda,
-                            sycl::buffer<double, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                            sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                            sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                            sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                            sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                             std::int64_t lda, std::int64_t stridea, sycl::buffer<float, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                             std::int64_t m, std::int64_t n, double alpha,
-                             sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                             sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t strideb);
-ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &ab,
-                            std::int64_t lda, std::int64_t ldb);
-ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, double alpha,
-                            sycl::buffer<double, 1> &ab, std::int64_t lda, std::int64_t ldb);
-ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                            sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb);
-ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                            std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                            sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb);
-
-ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                           sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-                           sycl::buffer<float, 1> &b, std::int64_t ldb, sycl::buffer<float, 1> &c,
-                           std::int64_t ldc);
-ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                           sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-                           sycl::buffer<double, 1> &b, std::int64_t ldb, sycl::buffer<double, 1> &c,
-                           std::int64_t ldc);
-ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n,
-                           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                           std::int64_t lda, std::complex<float> beta,
-                           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n,
-                           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                           std::int64_t lda, std::complex<double> beta,
-                           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-// USM APIs
-
-ONEMKL_EXPORT sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, std::int64_t n,
-                                   std::int64_t k, float alpha, const std::complex<float> *a,
-                                   std::int64_t lda, float beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, std::int64_t n,
-                                   std::int64_t k, double alpha, const std::complex<double> *a,
-                                   std::int64_t lda, double beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, float alpha, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, double alpha, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, float alpha, std::complex<float> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, double alpha, std::complex<double> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const float *a, std::int64_t lda, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const double *a, std::int64_t lda, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const float *a, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const double *a, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, float alpha, const float *x,
-                                  std::int64_t incx, float *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, double alpha, const double *x,
-                                  std::int64_t incx, double *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, float *alpha,
-                                         const float **a, std::int64_t *lda, const float **b,
-                                         std::int64_t *ldb, float *beta, float **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, double *alpha,
-                                         const double **a, std::int64_t *lda, const double **b,
-                                         std::int64_t *ldb, double *beta, double **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         std::int64_t *lda, const std::complex<float> **b,
-                                         std::int64_t *ldb, std::complex<float> *beta,
-                                         std::complex<float> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, transpose *transb,
-    std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-    const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **b,
-    std::int64_t *ldb, std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-    std::int64_t group_count, std::int64_t *group_size,
-    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose *transa, transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, sycl::half *alpha,
-                                         const sycl::half **a, std::int64_t *lda,
-                                         const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-                                         sycl::half **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                     transpose *transa, transpose *transb, std::int64_t *m,
-                                     std::int64_t *n, std::int64_t *k, float *alpha,
-                                     const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                                     std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                                     std::int64_t group_count, std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                     transpose *transa, transpose *transb, std::int64_t *m,
-                                     std::int64_t *n, std::int64_t *k, float *alpha,
-                                     const std::int8_t **a, std::int64_t *lda,
-                                     const std::int8_t **b, std::int64_t *ldb, float *beta,
-                                     float **c, std::int64_t *ldc, std::int64_t group_count,
-                                     std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                     transpose *transa, transpose *transb, std::int64_t *m,
-                                     std::int64_t *n, std::int64_t *k, float *alpha,
-                                     const std::int8_t **a, std::int64_t *lda,
-                                     const std::int8_t **b, std::int64_t *ldb, float *beta,
-                                     std::int32_t **c, std::int64_t *ldc, std::int64_t group_count,
-                                     std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose transa, transpose transb, std::int64_t m,
-                                         std::int64_t n, std::int64_t k, float alpha,
-                                         const float *a, std::int64_t lda, std::int64_t stride_a,
-                                         const float *b, std::int64_t ldb, std::int64_t stride_b,
-                                         float beta, float *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose transa, transpose transb, std::int64_t m,
-                                         std::int64_t n, std::int64_t k, double alpha,
-                                         const double *a, std::int64_t lda, std::int64_t stride_a,
-                                         const double *b, std::int64_t ldb, std::int64_t stride_b,
-                                         double beta, double *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-    std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-    const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-    const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-    std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-    std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-    const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-    const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-    std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a,
-    std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-    std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                     transpose transa, transpose transb, std::int64_t m,
-                                     std::int64_t n, std::int64_t k, float alpha,
-                                     const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                                     const sycl::half *b, std::int64_t ldb, std::int64_t stride_b,
-                                     float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                     transpose transa, transpose transb, std::int64_t m,
-                                     std::int64_t n, std::int64_t k, float alpha,
-                                     const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                                     const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b,
-                                     float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                     transpose transa, transpose transb, std::int64_t m,
-                                     std::int64_t n, std::int64_t k, float alpha,
-                                     const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                                     const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b,
-                                     float beta, std::int32_t *c, std::int64_t ldc,
-                                     std::int64_t stride_c, std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, std::int64_t n,
-                                   std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                                   float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, std::int64_t n,
-                                   std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                                   double beta, double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, std::int64_t n,
-                                   std::int64_t k, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, std::int64_t n,
-                                   std::int64_t k, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, float *alpha, const float **a,
-                                         std::int64_t *lda, float *beta, float **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, double *alpha, const double **a,
-                                         std::int64_t *lda, double *beta, double **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, std::complex<float> *alpha,
-                                         const std::complex<float> **a, std::int64_t *lda,
-                                         std::complex<float> *beta, std::complex<float> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo *upper_lower, transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, std::complex<double> *alpha,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         std::complex<double> *beta, std::complex<double> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, float alpha, const float *a,
-                                         std::int64_t lda, std::int64_t stride_a, float beta,
-                                         float *c, std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, double alpha, const double *a,
-                                         std::int64_t lda, std::int64_t stride_a, double beta,
-                                         double *c, std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, std::complex<float> alpha,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> beta,
-                                         std::complex<float> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         uplo upper_lower, transpose trans, std::int64_t n,
-                                         std::int64_t k, std::complex<double> alpha,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> beta,
-                                         std::complex<double> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::int64_t k,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::int64_t k,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t n, std::complex<float> *x, std::int64_t incx,
-                                  std::complex<float> *y, std::int64_t incy, float c, float s,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t n, std::complex<double> *x, std::int64_t incx,
-                                  std::complex<double> *y, std::int64_t incy, double c, double s,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t n, float *x, std::int64_t incx, float *y,
-                                  std::int64_t incy, float c, float s,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t n, double *x, std::int64_t incx, double *y,
-                                  std::int64_t incy, double c, double s,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                   float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                   double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, float *alpha, const float **x,
-                                         std::int64_t *incx, float **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, double *alpha, const double **x,
-                                         std::int64_t *incx, double **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, std::complex<float> *alpha,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, std::complex<double> *alpha,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, float alpha, const float *x,
-                                         std::int64_t incx, std::int64_t stridex, float *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, double alpha, const double *x,
-                                         std::int64_t incx, std::int64_t stridex, double *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, std::complex<float> alpha,
-                                         const std::complex<float> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<float> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, std::complex<double> alpha,
-                                         const std::complex<double> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<double> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                    const float beta, float *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, double alpha, const double *x,
-                                    std::int64_t incx, const double beta, double *y,
-                                    std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, std::complex<float> alpha,
-                                    const std::complex<float> *x, std::int64_t incx,
-                                    const std::complex<float> beta, std::complex<float> *y,
-                                    std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, std::complex<double> alpha,
-                                    const std::complex<double> *x, std::int64_t incx,
-                                    const std::complex<double> beta, std::complex<double> *y,
-                                    std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose trans, std::int64_t n,
-                                    std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                                    const float *b, std::int64_t ldb, float beta, float *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose trans, std::int64_t n,
-                                    std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                                    const double *b, std::int64_t ldb, double beta, double *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose trans, std::int64_t n,
-                                    std::int64_t k, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    const std::complex<float> *b, std::int64_t ldb,
-                                    std::complex<float> beta, std::complex<float> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose trans, std::int64_t n,
-                                    std::int64_t k, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    const std::complex<double> *b, std::int64_t ldb,
-                                    std::complex<double> beta, std::complex<double> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n, float alpha,
-                                   const float *a, std::int64_t lda, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n, double alpha,
-                                   const double *a, std::int64_t lda, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         float alpha, const float *a, std::int64_t lda,
-                                         std::int64_t stridea, const float *x, std::int64_t incx,
-                                         std::int64_t stridex, float beta, float *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         double alpha, const double *a, std::int64_t lda,
-                                         std::int64_t stridea, const double *x, std::int64_t incx,
-                                         std::int64_t stridex, double beta, double *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-    std::int64_t n, std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-    std::int64_t stridea, const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-    std::complex<float> beta, std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-    std::int64_t n, std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-    std::int64_t stridea, const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-    std::complex<double> beta, std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose *trans, std::int64_t *m, std::int64_t *n,
-                                         float *alpha, const float **a, std::int64_t *lda,
-                                         const float **x, std::int64_t *incx, float *beta,
-                                         float **y, std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose *trans, std::int64_t *m, std::int64_t *n,
-                                         double *alpha, const double **a, std::int64_t *lda,
-                                         const double **x, std::int64_t *incx, double *beta,
-                                         double **y, std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose *trans, std::int64_t *m, std::int64_t *n,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         std::int64_t *lda, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> *beta,
-                                         std::complex<float> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemv_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, std::int64_t *m,
-    std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-    const std::complex<double> **x, std::int64_t *incx, std::complex<double> *beta,
-    std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-    std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const float *a, std::int64_t lda, std::int64_t stridea,
-                                         const float *x, std::int64_t incx, std::int64_t stridex,
-                                         float *c, std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const double *a, std::int64_t lda, std::int64_t stridea,
-                                         const double *x, std::int64_t incx, std::int64_t stridex,
-                                         double *c, std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stridea, const std::complex<float> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<float> *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, std::int64_t m, std::int64_t n,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stridea, const std::complex<double> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<double> *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const float **a, std::int64_t *lda, const float **x,
-                                         std::int64_t *incx, float **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const double **a, std::int64_t *lda, const double **x,
-                                         std::int64_t *incx, double **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const std::complex<float> **a, std::int64_t *lda,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, std::int64_t *m, std::int64_t *n,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, float alpha,
-                                  const std::complex<float> *x, std::int64_t incx,
-                                  std::complex<float> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, double alpha,
-                                  const std::complex<double> *x, std::int64_t incx,
-                                  std::complex<double> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, float alpha,
-                                  const std::complex<float> *x, std::int64_t incx,
-                                  std::complex<float> *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, double alpha,
-                                  const std::complex<double> *x, std::int64_t incx,
-                                  std::complex<double> *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const float *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const double *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const std::complex<double> *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, float alpha, const float *a,
-                                   const float *x, std::int64_t incx, float beta, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, double alpha, const double *a,
-                                   const double *x, std::int64_t incx, double beta, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, float *d1,
-                                    float *d2, float *x1, float y1, float *param,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, double *d1,
-                                    double *d2, double *x1, double y1, double *param,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, float *x, std::int64_t incx, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, double *x, std::int64_t incx, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-                                   float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<double> *x, std::int64_t incx,
-                                   double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const float *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const double *x, std::int64_t incx,
-                                   double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose transa, transpose transb,
-                                    std::int64_t n, std::int64_t k, float alpha, const float *a,
-                                    std::int64_t lda, const float *b, std::int64_t ldb, float beta,
-                                    float *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose transa, transpose transb,
-                                    std::int64_t n, std::int64_t k, double alpha, const double *a,
-                                    std::int64_t lda, const double *b, std::int64_t ldb,
-                                    double beta, double *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose transa, transpose transb,
-                                    std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    const std::complex<float> *b, std::int64_t ldb,
-                                    std::complex<float> beta, std::complex<float> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose transa, transpose transb,
-                                    std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    const std::complex<double> *b, std::int64_t ldb,
-                                    std::complex<double> beta, std::complex<double> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose transa, transpose transb, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, float alpha, const float *a,
-                                   std::int64_t lda, const float *b, std::int64_t ldb, float beta,
-                                   float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose transa, transpose transb, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, double alpha, const double *a,
-                                   std::int64_t lda, const double *b, std::int64_t ldb, double beta,
-                                   double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose transa, transpose transb, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose transa, transpose transb, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose transa, transpose transb, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, sycl::half alpha,
-                                   const sycl::half *a, std::int64_t lda, const sycl::half *b,
-                                   std::int64_t ldb, sycl::half beta, sycl::half *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose transa, transpose transb, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                                   std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                                   float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose transa, transpose transb, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                                   std::int64_t lda, const bfloat16 *b, std::int64_t ldb,
-                                   float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue,
-                                        transpose transa, transpose transb, offset offsetc,
-                                        int64_t m, int64_t n, int64_t k, float alpha,
-                                        const std::int8_t *a, int64_t lda, std::int8_t ao,
-                                        const std::uint8_t *b, int64_t ldb, std::uint8_t bo,
-                                        float beta, std::int32_t *c, int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_bias(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-    offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, int64_t lda,
-    std::int8_t ao, const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, std::int32_t *c,
-    int64_t ldc, const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue,
-                                        transpose transa, transpose transb, offset offsetc,
-                                        int64_t m, int64_t n, int64_t k, float alpha,
-                                        const std::uint8_t *a, int64_t lda, std::uint8_t ao,
-                                        const std::int8_t *b, int64_t ldb, std::int8_t bo,
-                                        float beta, std::int32_t *c, int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue,
-                                        transpose transa, transpose transb, offset offsetc,
-                                        int64_t m, int64_t n, int64_t k, float alpha,
-                                        const std::uint8_t *a, int64_t lda, std::uint8_t ao,
-                                        const std::uint8_t *b, int64_t ldb, std::uint8_t bo,
-                                        float beta, std::int32_t *c, int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, float alpha, const float *x,
-                                   std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                                   std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, double alpha, const double *x,
-                                   std::int64_t incx, const double *y, std::int64_t incy, double *a,
-                                   std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t m, std::int64_t n, float alpha, const float *x,
-                                  std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                                  std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t m, std::int64_t n, double alpha, const double *x,
-                                  std::int64_t incx, const double *y, std::int64_t incy, double *a,
-                                  std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                                   const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                                   const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n, float alpha,
-                                         const float *a, int64_t lda, int64_t stride_a, float *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n, double alpha,
-                                         const double *a, int64_t lda, int64_t stride_a, double *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n,
-                                         std::complex<float> alpha, const std::complex<float> *a,
-                                         int64_t lda, int64_t stride_a, std::complex<float> *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side left_right, uplo upper_lower, transpose trans,
-                                         diag unit_diag, int64_t m, int64_t n,
-                                         std::complex<double> alpha, const std::complex<double> *a,
-                                         int64_t lda, int64_t stride_a, std::complex<double> *b,
-                                         int64_t ldb, int64_t stride_b, int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n, float *alpha,
-                                         const float **a, int64_t *lda, float **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n, double *alpha,
-                                         const double **a, int64_t *lda, double **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         int64_t *lda, std::complex<float> **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         side *left_right, uplo *upper_lower, transpose *trans,
-                                         diag *unit_diag, int64_t *m, int64_t *n,
-                                         std::complex<double> *alpha,
-                                         const std::complex<double> **a, int64_t *lda,
-                                         std::complex<double> **b, int64_t *ldb,
-                                         int64_t group_count, int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-                                   std::int64_t ku, float alpha, const float *a, std::int64_t lda,
-                                   const float *x, std::int64_t incx, float beta, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-                                   std::int64_t ku, double alpha, const double *a, std::int64_t lda,
-                                   const double *x, std::int64_t incx, double beta, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-                                   std::int64_t ku, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-                                   std::int64_t ku, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, std::int64_t m,
-                                   std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                                   const float *b, std::int64_t ldb, float beta, float *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, std::int64_t m,
-                                   std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                                   const double *b, std::int64_t ldb, double beta, double *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, float alpha, const float *x,
-                                  std::int64_t incx, float *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  uplo upper_lower, std::int64_t n, double alpha, const double *x,
-                                  std::int64_t incx, double *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                                   const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                                   const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   side left_right, uplo upper_lower, transpose trans,
-                                   diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, const float *x, std::int64_t incx, float beta,
-                                   float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, const double *x, std::int64_t incx,
-                                   double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const float *a, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const double *a, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const float *a, std::int64_t lda, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const double *a, std::int64_t lda, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const float *x, std::int64_t incx, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const double *x, std::int64_t incx, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, const float **x, std::int64_t *incx,
-                                         float **y, std::int64_t *incy, int64_t group_count,
-                                         int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, const double **x, std::int64_t *incx,
-                                         double **y, std::int64_t *incy, int64_t group_count,
-                                         int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> **y,
-                                         std::int64_t *incy, int64_t group_count,
-                                         int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, const std::complex<double> **x,
-                                         std::int64_t *incx, std::complex<double> **y,
-                                         std::int64_t *incy, int64_t group_count,
-                                         int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, const float *x, std::int64_t incx,
-                                         std::int64_t stridex, float *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, const double *x, std::int64_t incx,
-                                         std::int64_t stridex, double *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, const std::complex<float> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<float> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, const std::complex<double> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<double> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const float *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const double *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    std::int64_t n, const std::complex<double> *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
-                                   const float *a, std::int64_t lda, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
-                                   const double *a, std::int64_t lda, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-                                   float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const std::complex<double> *x, std::int64_t incx,
-                                   double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const float *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, const double *x, std::int64_t incx,
-                                   double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, transpose trans, diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, float alpha, const float *x,
-                                   std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   uplo upper_lower, std::int64_t n, double alpha, const double *x,
-                                   std::int64_t incx, const double *y, std::int64_t incy, double *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, float *x, std::int64_t incx, float *y,
-                                   std::int64_t incy, float *param,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t n, double *x, std::int64_t incx, double *y,
-                                   std::int64_t incy, double *param,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t n, const float *x, std::int64_t incx, const float *y,
-                                  std::int64_t incy, float *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t n, const double *x, std::int64_t incx,
-                                  const double *y, std::int64_t incy, double *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                  std::int64_t n, const float *x, std::int64_t incx, const float *y,
-                                  std::int64_t incy, double *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue &queue,
-                                     std::int64_t n, float sb, const float *x, std::int64_t incx,
-                                     const float *y, std::int64_t incy, float *result,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose trans, std::int64_t n,
-                                    std::int64_t k, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    const std::complex<float> *b, std::int64_t ldb, float beta,
-                                    std::complex<float> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue,
-                                    uplo upper_lower, transpose trans, std::int64_t n,
-                                    std::int64_t k, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    const std::complex<double> *b, std::int64_t ldb, double beta,
-                                    std::complex<double> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, float *a,
-                                   float *b, float *c, float *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, double *a,
-                                   double *b, double *c, double *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::complex<float> *a, std::complex<float> *b, float *c,
-                                   std::complex<float> *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::complex<double> *a, std::complex<double> *b, double *c,
-                                   std::complex<double> *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         float alpha, const float *a, std::int64_t lda,
-                                         std::int64_t stride_a, float *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         double alpha, const double *a, std::int64_t lda,
-                                         std::int64_t stride_a, double *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<float> alpha, const std::complex<float> *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::complex<float> *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<double> alpha, const std::complex<double> *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::complex<double> *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                                         std::int64_t stride, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         double alpha, double *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<float> alpha, std::complex<float> *ab,
-                                         std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         transpose trans, std::int64_t m, std::int64_t n,
-                                         std::complex<double> alpha, std::complex<double> *ab,
-                                         std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                        transpose transa, transpose transb, std::int64_t m,
-                                        std::int64_t n, float alpha, const float *a,
-                                        std::int64_t lda, std::int64_t stride_a, float beta,
-                                        const float *b, std::int64_t ldb, std::int64_t stride_b,
-                                        float *c, std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                        transpose transa, transpose transb, std::int64_t m,
-                                        std::int64_t n, double alpha, const double *a,
-                                        std::int64_t lda, std::int64_t stride_a, double beta,
-                                        const double *b, std::int64_t ldb, std::int64_t stride_b,
-                                        double *c, std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatadd_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-    std::int64_t lda, std::int64_t stride_a, std::complex<float> beta, const std::complex<float> *b,
-    std::int64_t ldb, std::int64_t stride_b, std::complex<float> *c, std::int64_t ldc,
-    std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatadd_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-    std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-    const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b, std::complex<double> *c,
-    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                    std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                    std::int64_t lda, std::int64_t stridea, float *b,
-                                    std::int64_t ldb, std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                    std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                    std::int64_t lda, std::int64_t stridea, double *b,
-                                    std::int64_t ldb, std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<float> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<double> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, float alpha, float *ab,
-                                   std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, double alpha, double *ab,
-                                   std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                                  const float *a, std::int64_t lda, float beta, const float *b,
-                                  std::int64_t ldb, float *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                                  const double *a, std::int64_t lda, double beta, const double *b,
-                                  std::int64_t ldb, double *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n,
-                                  std::complex<float> alpha, const std::complex<float> *a,
-                                  std::int64_t lda, std::complex<float> beta,
-                                  const std::complex<float> *b, std::int64_t ldb,
-                                  std::complex<float> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                                  transpose transb, std::int64_t m, std::int64_t n,
-                                  std::complex<double> alpha, const std::complex<double> *a,
-                                  std::int64_t lda, std::complex<double> beta,
-                                  const std::complex<double> *b, std::int64_t ldb,
-                                  std::complex<double> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         float* alpha, const float** a, std::int64_t* lda,
-                                         float** b, std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         double* alpha, const double** a, std::int64_t* lda,
-                                         double** b, std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<float>* alpha, const std::complex<float>** a,
-                                         std::int64_t* lda, std::complex<float>** b,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<double>* alpha,
-                                         const std::complex<double>** a, std::int64_t* lda,
-                                         std::complex<double>** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         float* alpha, float** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         double* alpha, double** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<float>* alpha, std::complex<float>** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue,
-                                         transpose* trans, std::int64_t* m, std::int64_t* n,
-                                         std::complex<double>* alpha, std::complex<double>** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
diff --git a/include/oneapi/mkl/blas/detail/cublas/blas_ct.hpp b/include/oneapi/mkl/blas/detail/cublas/blas_ct.hpp
deleted file mode 100644
index 2443c64d2..000000000
--- a/include/oneapi/mkl/blas/detail/cublas/blas_ct.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _DETAIL_CUBLAS_BLAS_CT_HPP_
-#define _DETAIL_CUBLAS_BLAS_CT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
-#include "oneapi/mkl/blas/detail/blas_ct_backends.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#define MAJOR column_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace column_major
-namespace row_major {
-
-#define MAJOR row_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_DETAIL_CUBLAS_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx b/include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx
deleted file mode 100644
index 9483a66c1..000000000
--- a/include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx
+++ /dev/null
@@ -1,4381 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-void herk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c,
-          std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void herk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void scal(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::cublas> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::cublas> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void spr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void spr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                double beta, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void her2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void her2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void hbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void hbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void rot(backend_selector<backend::cublas> selector, std::int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::cublas> selector, std::int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void axpy(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::cublas> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::cublas> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpby(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-           sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-           sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::cublas> selector, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::cublas> selector, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-           std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void sdsdot(backend_selector<backend::cublas> selector, std::int64_t n, float sb,
-            sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-            std::int64_t incy, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result);
-}
-
-void gerc(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void gerc(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, float beta, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::cublas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::cublas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::cublas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::cublas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void her(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void her(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void hpr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void iamin(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void hpmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void hpmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void gemm_bias(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void swap(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void geru(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void geru(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void nrm2(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<bfloat16, 1> &a, std::int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void syr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void syr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a,
-          std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void ger(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-         std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void ger(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void dotu(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotu(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void hemm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hpr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void hpr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void dotc(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotc(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void syr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void syr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void rotmg(backend_selector<backend::cublas> selector, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void rotmg(backend_selector<backend::cublas> selector, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void copy(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void hemv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void hemv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemmt(backend_selector<backend::cublas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, float alpha,
-           sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-           std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::cublas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, double alpha,
-           sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::cublas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::cublas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void asum(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void sbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void sbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void spr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void spr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void iamax(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::cublas> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void rotm(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void rotm(backend_selector<backend::cublas> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void dot(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::cublas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void trsm_batch(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void her2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void rotg(backend_selector<backend::cublas> selector, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<float, 1> &s) {
-    oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::cublas> selector, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<double, 1> &s) {
-    oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::cublas> selector, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::cublas> selector, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void symv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void symv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::cublas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<double, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event syr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-                     float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-                     double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::cublas> selector, std::int64_t n,
-                     std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::cublas> selector, std::int64_t n,
-                     std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                     std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                     std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha,
-                           const float **a, std::int64_t *lda, float *beta, float **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha,
-                           const double **a, std::int64_t *lda, double *beta, double **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                           const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                           const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::cublas> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::cublas> selector, std::int64_t n,
-                    std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                    std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::cublas> selector, std::int64_t n,
-                    std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                    std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::cublas> selector, std::int64_t n, float *x,
-                    std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::cublas> selector, std::int64_t n, double *x,
-                    std::int64_t incx, double *y, std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-                     const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-                     const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::cublas> selector, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::cublas> selector, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           float *alpha, const float **x, std::int64_t *incx, float **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           double *alpha, const double **x, std::int64_t *incx, double **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **x,
-                           std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **x,
-                           std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::cublas> selector, std::int64_t n, float alpha,
-                      const float *x, std::int64_t incx, const float beta, float *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::cublas> selector, std::int64_t n, double alpha,
-                      const double *x, std::int64_t incx, const double beta, double *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::cublas> selector, std::int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                      const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::cublas> selector, std::int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                      const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, double alpha, const double *a,
-                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stridea, const float *x,
-                           std::int64_t incx, std::int64_t stridex, float beta, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stridea, const double *x,
-                           std::int64_t incx, std::int64_t stridex, double beta, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                           std::int64_t *lda, const float **x, std::int64_t *incx, float *beta,
-                           float **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                           std::int64_t *lda, const double **x, std::int64_t *incx, double *beta,
-                           double **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> *beta, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::cublas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const float *a, std::int64_t lda,
-                           std::int64_t stridea, const float *x, std::int64_t incx,
-                           std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const double *a, std::int64_t lda,
-                           std::int64_t stridea, const double *x, std::int64_t incx,
-                           std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda,
-                           const float **x, std::int64_t *incx, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda,
-                           const double **x, std::int64_t *incx, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::cublas> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::cublas> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::cublas> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::cublas> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           float *alpha, const float **a, std::int64_t *lda, const float **b,
-                           std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           double *alpha, const double **a, std::int64_t *lda, const double **b,
-                           std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
-                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                           const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-                           sycl::half **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                           const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                           std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                           std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                           std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::cublas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, const float *x, std::int64_t incx, float beta,
-                     float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, const double *x, std::int64_t incx, double beta,
-                     double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::cublas> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::cublas> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::cublas> selector, std::int64_t n,
-                     std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::cublas> selector, std::int64_t n,
-                     std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::cublas> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::cublas> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     sycl::half beta, sycl::half *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                     std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
-                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
-                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, const float *y,
-                    std::int64_t incy, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::cublas> selector, std::int64_t m, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, const double *y,
-                    std::int64_t incy, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                           std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                           std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, float *alpha, const float **a, std::int64_t *lda,
-                           float **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                           double **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::cublas> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
-                     double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::cublas> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::cublas> selector, float *d1, float *d2, float *x1,
-                      float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::cublas> selector, double *d1, double *d2,
-                      double *x1, double y1, double *param,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::cublas> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::cublas> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t *n,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::cublas> selector, std::int64_t n,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::cublas> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      float alpha, const float *a, std::int64_t lda, const float *b,
-                      std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::cublas> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      double alpha, const double *a, std::int64_t lda, const double *b,
-                      std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::cublas> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                      const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-                      std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::cublas> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::cublas> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::cublas> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::cublas> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::cublas> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::cublas> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::cublas> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::cublas> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::cublas> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy, float *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::cublas> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy, double *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::cublas> selector, float *a, float *b, float *c,
-                     float *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::cublas> selector, double *a, double *b, double *c,
-                     double *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::cublas> selector, std::complex<float> *a,
-                     std::complex<float> *b, float *c, std::complex<float> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::cublas> selector, std::complex<double> *a,
-                     std::complex<double> *b, double *c, std::complex<double> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event sdsdot(backend_selector<backend::cublas> selector, std::int64_t n, float sb,
-                       const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                       float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y,
-                                                         incy, result, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::cublas> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, double beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::cublas> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::cublas> selector, std::int64_t n, const double *x,
-                    std::int64_t incx, const double *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::cublas> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::cublas> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, std::int64_t lda, const double *x,
-                     std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::cublas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                      std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<float> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<double> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, std::complex<float> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::cublas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, std::complex<double> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                    float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                    double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::cublas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, const float** a,
-                           std::int64_t* lda, float** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, const double** a,
-                           std::int64_t* lda, double** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           const std::complex<float>** a, std::int64_t* lda,
-                           std::complex<float>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           const std::complex<double>** a, std::int64_t* lda,
-                           std::complex<double>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, float** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, double** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           std::complex<float>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::cublas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           std::complex<double>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
diff --git a/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp b/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp
deleted file mode 100644
index 8a6d5448f..000000000
--- a/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ONEMKL_BLAS_CUBLAS_HPP_
-#define _ONEMKL_BLAS_CUBLAS_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-#include <string>
-#include "oneapi/mkl/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-using oneapi::mkl::diag;
-using oneapi::mkl::offset;
-using oneapi::mkl::side;
-using oneapi::mkl::transpose;
-using oneapi::mkl::uplo;
-namespace blas {
-namespace cublas {
-namespace column_major {
-
-#include "onemkl_blas_cublas.hxx"
-
-} //namespace column_major
-namespace row_major {
-
-#include "onemkl_blas_cublas.hxx"
-
-} //namespace row_major
-} //namespace cublas
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BLAS_CUBLAS_HPP_
diff --git a/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx b/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx
deleted file mode 100644
index 1141eb238..000000000
--- a/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx
+++ /dev/null
@@ -1,2370 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// Buffer APIs
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result);
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result);
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void axpy(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void axpy(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void axpy(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void axpy(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<float, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void axpby(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-           std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void axpby(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-           std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void axpby(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void axpby(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-           std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<float, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &result);
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-         sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &result);
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &result);
-
-void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result);
-
-void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result);
-
-void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result);
-
-void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-         std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-         float s);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-         std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-         double c, double s);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-         sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s);
-
-void rotg(sycl::queue &queue, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &b,
-          sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &b,
-          sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s);
-
-void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &param);
-
-void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &param);
-
-void rotmg(sycl::queue &queue, sycl::buffer<float, 1> &d1, sycl::buffer<float, 1> &d2,
-           sycl::buffer<float, 1> &x1, float y1, sycl::buffer<float, 1> &param);
-
-void rotmg(sycl::queue &queue, sycl::buffer<double, 1> &d1, sycl::buffer<double, 1> &d2,
-           sycl::buffer<double, 1> &x1, double y1, sycl::buffer<double, 1> &param);
-
-void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer<float, 1> &x,
-            std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-            sycl::buffer<float, 1> &result);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex, float beta,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size);
-
-void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-         std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda);
-
-void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda);
-
-void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy);
-
-void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy);
-
-void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy);
-
-void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy);
-
-void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a);
-
-void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a);
-
-void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a);
-
-void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a);
-
-void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a);
-
-void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a);
-
-void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a);
-
-void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a);
-
-void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda);
-
-void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda);
-
-void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda);
-
-void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-          std::int64_t n, std::int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-          std::int64_t lda, sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, sycl::half beta,
-          sycl::buffer<sycl::half, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-          std::int64_t lda, sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a,
-          std::int64_t lda, sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-          std::int64_t ldc);
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          float alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, float beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          double alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-           std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-           float beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-           std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           double beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-          std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-          std::int64_t ldc);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-           std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-           sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-           std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-           sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-           std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-           std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                    int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                    int64_t stride_b, int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                    int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    int64_t ldb, int64_t stride_b, int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                   float beta, sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                   double beta, sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                   int64_t stride_a, std::complex<float> beta,
-                   sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                   int64_t lda, int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<float>, 1> &b, int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<double>, 1> &b, int64_t ldb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-               sycl::buffer<float, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<float, 1> &b, int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-               sycl::buffer<double, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<double, 1> &b, int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-               sycl::buffer<std::complex<float>, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-               std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-               std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-               std::int64_t strideb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
-             sycl::buffer<float, 1> &b, int64_t ldb, sycl::buffer<float, 1> &c, int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
-             sycl::buffer<double, 1> &b, int64_t ldb, sycl::buffer<double, 1> &c, int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-             std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-             std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-// USM APIs
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                     float *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                     double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha, const float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha, const double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
-                           std::int64_t *incx, float **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
-                           std::int64_t *incx, double **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, const float *x,
-                           std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, const double *x,
-                           std::int64_t incx, std::int64_t stridex, double *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha, const float *x,
-                      std::int64_t incx, const float beta, float *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, double alpha, const double *x,
-                      std::int64_t incx, const double beta, double *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                      const std::complex<float> *x, std::int64_t incx,
-                      const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                      const std::complex<double> *x, std::int64_t incx,
-                      const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                     float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                     double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const float **x,
-                           std::int64_t *incx, float **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x,
-                           std::int64_t *incx, double **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex<float> **x,
-                           std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex<double> **x,
-                           std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x,
-                           std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x,
-                           std::int64_t incx, std::int64_t stridex, double *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                    const float *y, std::int64_t incy, float *result,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                    const double *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                    const float *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                     std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                     std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                     std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                     std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                     std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                     std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                      std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                      std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                      std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                      std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                     float *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                     double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<float> *x,
-                    std::int64_t incx, std::complex<float> *y, std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<double> *x,
-                    std::int64_t incx, std::complex<double> *y, std::int64_t incy, double c,
-                    double s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                    std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
-                    std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, std::complex<float> *a, std::complex<float> *b,
-                     float *c, std::complex<float> *s,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, std::complex<double> *a, std::complex<double> *b,
-                     double *c, std::complex<double> *s,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                     std::int64_t incy, float *param,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
-                     double *y, std::int64_t incy, double *param,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1,
-                      float *param, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1,
-                      double *param, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x,
-                       std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx,
-                     double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<float> *x,
-                     std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<double> *x,
-                     std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::int64_t kl, std::int64_t ku, double alpha, const double *a,
-                     std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     double alpha, const double *a, std::int64_t lda, const double *x,
-                     std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           float alpha, const float *a, std::int64_t lda, std::int64_t stridea,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float beta,
-                           float *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           double alpha, const double *a, std::int64_t lda, std::int64_t stridea,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double beta,
-                           double *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                           std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                           std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                           std::int64_t *n, float *alpha, const float **a, std::int64_t *lda,
-                           const float **x, std::int64_t *incx, float *beta, float **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                           std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                           const double **x, std::int64_t *incx, double *beta, double **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                           std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> *beta, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m,
-                           std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                           const float *a, std::int64_t lda, std::int64_t stridea, const float *x,
-                           std::int64_t incx, std::int64_t stridex, float *c, std::int64_t ldc,
-                           std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                           const double *a, std::int64_t lda, std::int64_t stridea, const double *x,
-                           std::int64_t incx, std::int64_t stridex, double *c, std::int64_t ldc,
-                           std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                           std::int64_t *n, const float **a, std::int64_t *lda, const float **x,
-                           std::int64_t *incx, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                           std::int64_t *n, const double **a, std::int64_t *lda, const double **x,
-                           std::int64_t *incx, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                           std::int64_t *n, const std::complex<float> **a, std::int64_t *lda,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m,
-                           std::int64_t *n, const std::complex<double> **a, std::int64_t *lda,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                    const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                    std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                    const double *x, std::int64_t incx, const double *y, std::int64_t incy,
-                    double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
-                    std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
-                    std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                    const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                    const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                     float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                     double alpha, const double *a, std::int64_t lda, const double *x,
-                     std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                     const float *a, const float *x, std::int64_t incx, float beta, float *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                     const double *a, const double *x, std::int64_t incx, double beta, double *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                    const float *x, std::int64_t incx, float *a,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                    const double *x, std::int64_t incx, double *a,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
-                     double *a, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, const float *x, std::int64_t incx,
-                     float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
-                     double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                    const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                    const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-                     const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-                     const double *x, std::int64_t incx, const double *y, std::int64_t incy,
-                     double *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                     std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                     std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                     std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a,
-                     std::int64_t lda, const sycl::half *b, std::int64_t ldb, sycl::half beta,
-                     sycl::half *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                     std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                     std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                     std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                     std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                     std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
-                     std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                     std::int64_t k, float alpha, const std::complex<float> *a, std::int64_t lda,
-                     float beta, std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                     std::int64_t k, double alpha, const std::complex<double> *a, std::int64_t lda,
-                     double beta, std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                      std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, float beta,
-                      std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                      std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
-                      double beta, std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b,
-                     std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                     const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                     std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
-                     std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                     std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta,
-                     float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                     std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta,
-                     double *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans,
-                           std::int64_t *n, std::int64_t *k, float *alpha, const float **a,
-                           std::int64_t *lda, float *beta, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans,
-                           std::int64_t *n, std::int64_t *k, double *alpha, const double **a,
-                           std::int64_t *lda, double *beta, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans,
-                           std::int64_t *n, std::int64_t *k, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans,
-                           std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                           std::int64_t n, std::int64_t k, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float beta, float *c,
-                           std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                           std::int64_t n, std::int64_t k, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double beta, double *c,
-                           std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                           std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans,
-                           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                           std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                      std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b,
-                      std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                      std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                      const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                      std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                      std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                      std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
-                      std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a,
-                     std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a,
-                     std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                           float *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                           double *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<double> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, std::int64_t *m, std::int64_t *n,
-                           float *alpha, const float **a, std::int64_t *lda, float **b,
-                           std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, std::int64_t *m, std::int64_t *n,
-                           double *alpha, const double **a, std::int64_t *lda, double **b,
-                           std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, std::int64_t *m, std::int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, std::int64_t *m, std::int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                           std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha,
-                           const float **a, std::int64_t *lda, const float **b, std::int64_t *ldb,
-                           float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                           std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha,
-                           const double **a, std::int64_t *lda, const double **b, std::int64_t *ldb,
-                           double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
-                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                           std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb,
-                           std::int64_t *m, std::int64_t *n, std::int64_t *k, sycl::half *alpha,
-                           const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                           std::int64_t *ldb, sycl::half *beta, sycl::half **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m,
-                       std::int64_t *n, std::int64_t *k, float *alpha, const sycl::half **a,
-                       std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, float *beta,
-                       float **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m,
-                       std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a,
-                       std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta,
-                       float **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m,
-                       std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a,
-                       std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta,
-                       std::int32_t **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                           std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                           const float *a, std::int64_t lda, std::int64_t stride_a, const float *b,
-                           std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
-                           std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                           std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                           const double *a, std::int64_t lda, std::int64_t stride_a,
-                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                           std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                           std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb,
-                           std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                           const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                           const sycl::half *b, std::int64_t ldb, std::int64_t stride_b,
-                           sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                       std::int64_t lda, std::int64_t stride_a, const sycl::half *b,
-                       std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-                       std::int64_t lda, std::int64_t stride_a, const std::int8_t *b,
-                       std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-                       std::int64_t lda, std::int64_t stride_a, const std::int8_t *b,
-                       std::int64_t ldb, std::int64_t stride_b, float beta, std::int32_t *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      std::int64_t n, std::int64_t k, double alpha, const double *a,
-                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                          float alpha, const std::int8_t *a, std::int64_t lda, std::int8_t ao,
-                          const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, float beta,
-                          std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                          float alpha, const std::int8_t *a, std::int64_t lda, std::int8_t ao,
-                          const std::int8_t *b, std::int64_t ldb, std::int8_t bo, float beta,
-                          std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                          float alpha, const std::uint8_t *a, std::int64_t lda, std::uint8_t ao,
-                          const std::int8_t *b, std::int64_t ldb, std::int8_t bo, float beta,
-                          std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-                          float alpha, const std::uint8_t *a, std::int64_t lda, std::uint8_t ao,
-                          const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, float beta,
-                          std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                          float beta, const float *b, int64_t ldb, int64_t stride_b, float *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                          double beta, const double *b, int64_t ldb, int64_t stride_b, double *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                          int64_t lda, int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                          int64_t lda, int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                          int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     const float *a, int64_t lda, float *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     const double *a, int64_t lda, double *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                     std::complex<float> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                     std::complex<double> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                      const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                      const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<float> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<double> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    float alpha, const float *a, int64_t lda, float beta, const float *b,
-                    int64_t ldb, float *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    double alpha, const double *a, int64_t lda, double beta, const double *b,
-                    int64_t ldb, double *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                    std::complex<float> beta, const std::complex<float> *b, int64_t ldb,
-                    std::complex<float> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                    std::complex<double> beta, const std::complex<double> *b, int64_t ldb,
-                    std::complex<double> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, const std::complex<float>** a, int64_t* lda,
-                           std::complex<float>** b, int64_t* ldb, int64_t group_count,
-                           int64_t* groupsize, const std::vector<sycl::event>& dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, const std::complex<double>** a,
-                           int64_t* lda, std::complex<double>** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, float** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, double** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, std::complex<float>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, std::complex<double>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
diff --git a/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hpp b/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hpp
deleted file mode 100644
index 24b0a9c7e..000000000
--- a/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _DETAIL_MKLCPU_BLAS_CT_HPP__
-#define _DETAIL_MKLCPU_BLAS_CT_HPP__
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-
-#include "oneapi/mkl/blas/detail/blas_ct_backends.hpp"
-#include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#define MAJOR column_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace column_major
-namespace row_major {
-
-#define MAJOR row_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_DETAIL_MKLCPU_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx b/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx
deleted file mode 100644
index 1724bf5c7..000000000
--- a/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx
+++ /dev/null
@@ -1,4383 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void herk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c,
-          std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void herk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void scal(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklcpu> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklcpu> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void spr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void spr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                double beta, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void her2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void her2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void hbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void hbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void rot(backend_selector<backend::mklcpu> selector, std::int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::mklcpu> selector, std::int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void axpy(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::mklcpu> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::mklcpu> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpby(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-           sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-           sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::mklcpu> selector, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::mklcpu> selector, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-           std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void sdsdot(backend_selector<backend::mklcpu> selector, std::int64_t n, float sb,
-            sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-            std::int64_t incy, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result);
-}
-
-void gerc(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void gerc(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, float beta, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void her(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void her(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void hpr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void iamin(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void hpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void hpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void swap(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void geru(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void geru(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<bfloat16, 1> &a, std::int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void syr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void syr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a,
-          std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void ger(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-         std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void ger(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void dotu(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotu(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void hemm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hpr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void hpr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void dotc(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotc(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void syr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void syr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void rotmg(backend_selector<backend::mklcpu> selector, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void rotmg(backend_selector<backend::mklcpu> selector, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void copy(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void hemv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void hemv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, float alpha,
-           sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-           std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, double alpha,
-           sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void asum(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void sbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void sbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void spr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void spr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void iamax(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::mklcpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void rotm(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void rotm(backend_selector<backend::mklcpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void dot(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::mklcpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void trsm_batch(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void her2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void rotg(backend_selector<backend::mklcpu> selector, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<float, 1> &s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::mklcpu> selector, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<double, 1> &s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::mklcpu> selector, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::mklcpu> selector, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void symv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void symv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklcpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<double, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event syr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-                     float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-                     double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                     std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                     std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha,
-                           const float **a, std::int64_t *lda, float *beta, float **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha,
-                           const double **a, std::int64_t *lda, double *beta, double **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                           const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                           const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                    std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                    std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                    std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                    std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklcpu> selector, std::int64_t n, float *x,
-                    std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklcpu> selector, std::int64_t n, double *x,
-                    std::int64_t incx, double *y, std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-                     const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-                     const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           float *alpha, const float **x, std::int64_t *incx, float **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           double *alpha, const double **x, std::int64_t *incx, double **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **x,
-                           std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **x,
-                           std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklcpu> selector, std::int64_t n, float alpha,
-                      const float *x, std::int64_t incx, const float beta, float *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklcpu> selector, std::int64_t n, double alpha,
-                      const double *x, std::int64_t incx, const double beta, double *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                      const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                      const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, double alpha, const double *a,
-                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stridea, const float *x,
-                           std::int64_t incx, std::int64_t stridex, float beta, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stridea, const double *x,
-                           std::int64_t incx, std::int64_t stridex, double beta, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                           std::int64_t *lda, const float **x, std::int64_t *incx, float *beta,
-                           float **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                           std::int64_t *lda, const double **x, std::int64_t *incx, double *beta,
-                           double **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> *beta, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklcpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const float *a, std::int64_t lda,
-                           std::int64_t stridea, const float *x, std::int64_t incx,
-                           std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const double *a, std::int64_t lda,
-                           std::int64_t stridea, const double *x, std::int64_t incx,
-                           std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda,
-                           const float **x, std::int64_t *incx, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda,
-                           const double **x, std::int64_t *incx, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklcpu> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklcpu> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           float *alpha, const float **a, std::int64_t *lda, const float **b,
-                           std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           double *alpha, const double **a, std::int64_t *lda, const double **b,
-                           std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
-                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                           const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-                           sycl::half **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                           const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                           std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                           std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                           std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, const float *x, std::int64_t incx, float beta,
-                     float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, const double *x, std::int64_t incx, double beta,
-                     double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklcpu> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklcpu> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklcpu> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     sycl::half beta, sycl::half *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                     std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
-                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
-                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, const float *y,
-                    std::int64_t incy, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::mklcpu> selector, std::int64_t m, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, const double *y,
-                    std::int64_t incy, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                           std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                           std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, float *alpha, const float **a, std::int64_t *lda,
-                           float **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                           double **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklcpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
-                     double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklcpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::mklcpu> selector, float *d1, float *d2, float *x1,
-                      float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::mklcpu> selector, double *d1, double *d2,
-                      double *x1, double y1, double *param,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklcpu> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklcpu> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t *n,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      float alpha, const float *a, std::int64_t lda, const float *b,
-                      std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      double alpha, const double *a, std::int64_t lda, const double *b,
-                      std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                      const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-                      std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklcpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklcpu> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklcpu> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklcpu> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklcpu> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklcpu> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::mklcpu> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy, float *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::mklcpu> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy, double *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklcpu> selector, float *a, float *b, float *c,
-                     float *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklcpu> selector, double *a, double *b, double *c,
-                     double *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklcpu> selector, std::complex<float> *a,
-                     std::complex<float> *b, float *c, std::complex<float> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklcpu> selector, std::complex<double> *a,
-                     std::complex<double> *b, double *c, std::complex<double> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event sdsdot(backend_selector<backend::mklcpu> selector, std::int64_t n, float sb,
-                       const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                       float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y,
-                                                         incy, result, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::mklcpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, double beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::mklcpu> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::mklcpu> selector, std::int64_t n, const double *x,
-                    std::int64_t incx, const double *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::mklcpu> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::mklcpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, std::int64_t lda, const double *x,
-                     std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklcpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                      std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<float> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<double> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, std::complex<float> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklcpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, std::complex<double> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                    float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                    double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklcpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, const float** a,
-                           std::int64_t* lda, float** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, const double** a,
-                           std::int64_t* lda, double** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           const std::complex<float>** a, std::int64_t* lda,
-                           std::complex<float>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           const std::complex<double>** a, std::int64_t* lda,
-                           std::complex<double>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, float** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, double** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           std::complex<float>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklcpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           std::complex<double>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
diff --git a/include/oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp b/include/oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp
deleted file mode 100644
index db9e3f4d0..000000000
--- a/include/oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BLAS_MKLCPU_HPP_
-#define _ONEMKL_BLAS_MKLCPU_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-using oneapi::mkl::transpose;
-using oneapi::mkl::uplo;
-using oneapi::mkl::side;
-using oneapi::mkl::diag;
-using oneapi::mkl::offset;
-namespace blas {
-namespace mklcpu {
-namespace column_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} // namespace column_major
-namespace row_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} // namespace row_major
-} // namespace mklcpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_BLAS_MKLCPU_HPP_
diff --git a/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp b/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp
deleted file mode 100644
index 10ceb3b73..000000000
--- a/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _DETAIL_MKLGPU_BLAS_CT_HPP__
-#define _DETAIL_MKLGPU_BLAS_CT_HPP__
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-
-#include "oneapi/mkl/blas/detail/blas_ct_backends.hpp"
-#include "oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#define MAJOR column_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace column_major
-namespace row_major {
-
-#define MAJOR row_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_DETAIL_MKLGPU_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx b/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx
deleted file mode 100644
index c69257e9c..000000000
--- a/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx
+++ /dev/null
@@ -1,4383 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void herk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c,
-          std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void herk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void scal(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklgpu> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklgpu> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void spr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void spr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                double beta, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void her2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void her2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void hbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void hbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void rot(backend_selector<backend::mklgpu> selector, std::int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::mklgpu> selector, std::int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void axpy(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::mklgpu> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::mklgpu> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpby(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-           sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-           sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::mklgpu> selector, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::mklgpu> selector, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-           std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void sdsdot(backend_selector<backend::mklgpu> selector, std::int64_t n, float sb,
-            sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-            std::int64_t incy, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result);
-}
-
-void gerc(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void gerc(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, float beta, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void her(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void her(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void hpr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void iamin(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void hpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void hpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void swap(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void geru(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void geru(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<bfloat16, 1> &a, std::int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void syr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void syr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a,
-          std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void ger(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-         std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void ger(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void dotu(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotu(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void hemm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hpr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void hpr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void dotc(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotc(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void syr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void syr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void rotmg(backend_selector<backend::mklgpu> selector, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void rotmg(backend_selector<backend::mklgpu> selector, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void copy(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void hemv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void hemv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, float alpha,
-           sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-           std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, double alpha,
-           sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void asum(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void sbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void sbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void spr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void spr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void iamax(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::mklgpu> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void rotm(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void rotm(backend_selector<backend::mklgpu> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void dot(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::mklgpu> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void trsm_batch(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void her2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void rotg(backend_selector<backend::mklgpu> selector, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<float, 1> &s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::mklgpu> selector, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<double, 1> &s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::mklgpu> selector, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::mklgpu> selector, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void symv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void symv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::mklgpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<double, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event syr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-                     float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-                     double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                     std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                     std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha,
-                           const float **a, std::int64_t *lda, float *beta, float **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha,
-                           const double **a, std::int64_t *lda, double *beta, double **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                           const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                           const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                    std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                    std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                    std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                    std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklgpu> selector, std::int64_t n, float *x,
-                    std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::mklgpu> selector, std::int64_t n, double *x,
-                    std::int64_t incx, double *y, std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-                     const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-                     const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           float *alpha, const float **x, std::int64_t *incx, float **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           double *alpha, const double **x, std::int64_t *incx, double **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **x,
-                           std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **x,
-                           std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklgpu> selector, std::int64_t n, float alpha,
-                      const float *x, std::int64_t incx, const float beta, float *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklgpu> selector, std::int64_t n, double alpha,
-                      const double *x, std::int64_t incx, const double beta, double *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                      const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                      const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, double alpha, const double *a,
-                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stridea, const float *x,
-                           std::int64_t incx, std::int64_t stridex, float beta, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stridea, const double *x,
-                           std::int64_t incx, std::int64_t stridex, double beta, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                           std::int64_t *lda, const float **x, std::int64_t *incx, float *beta,
-                           float **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                           std::int64_t *lda, const double **x, std::int64_t *incx, double *beta,
-                           double **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> *beta, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::mklgpu> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const float *a, std::int64_t lda,
-                           std::int64_t stridea, const float *x, std::int64_t incx,
-                           std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const double *a, std::int64_t lda,
-                           std::int64_t stridea, const double *x, std::int64_t incx,
-                           std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda,
-                           const float **x, std::int64_t *incx, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda,
-                           const double **x, std::int64_t *incx, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklgpu> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklgpu> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                           const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-                           sycl::half **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           float *alpha, const float **a, std::int64_t *lda, const float **b,
-                           std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           double *alpha, const double **a, std::int64_t *lda, const double **b,
-                           std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
-                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                           std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                           std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                           std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                           const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, const float *x, std::int64_t incx, float beta,
-                     float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, const double *x, std::int64_t incx, double beta,
-                     double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklgpu> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklgpu> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::mklgpu> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     sycl::half beta, sycl::half *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                     std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
-                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
-                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, const float *y,
-                    std::int64_t incy, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::mklgpu> selector, std::int64_t m, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, const double *y,
-                    std::int64_t incy, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                           std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                           std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, float *alpha, const float **a, std::int64_t *lda,
-                           float **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                           double **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::mklgpu> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
-                     double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::mklgpu> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::mklgpu> selector, float *d1, float *d2, float *x1,
-                      float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::mklgpu> selector, double *d1, double *d2,
-                      double *x1, double y1, double *param,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklgpu> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklgpu> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t *n,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      float alpha, const float *a, std::int64_t lda, const float *b,
-                      std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      double alpha, const double *a, std::int64_t lda, const double *b,
-                      std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                      const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-                      std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::mklgpu> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklgpu> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::mklgpu> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklgpu> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklgpu> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::mklgpu> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::mklgpu> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy, float *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::mklgpu> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy, double *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklgpu> selector, float *a, float *b, float *c,
-                     float *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklgpu> selector, double *a, double *b, double *c,
-                     double *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklgpu> selector, std::complex<float> *a,
-                     std::complex<float> *b, float *c, std::complex<float> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::mklgpu> selector, std::complex<double> *a,
-                     std::complex<double> *b, double *c, std::complex<double> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event sdsdot(backend_selector<backend::mklgpu> selector, std::int64_t n, float sb,
-                       const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                       float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y,
-                                                         incy, result, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::mklgpu> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, double beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::mklgpu> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::mklgpu> selector, std::int64_t n, const double *x,
-                    std::int64_t incx, const double *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::mklgpu> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::mklgpu> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, std::int64_t lda, const double *x,
-                     std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::mklgpu> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                      std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<float> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<double> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, std::complex<float> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::mklgpu> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, std::complex<double> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                    float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                    double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::mklgpu> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, const float** a,
-                           std::int64_t* lda, float** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, const double** a,
-                           std::int64_t* lda, double** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           const std::complex<float>** a, std::int64_t* lda,
-                           std::complex<float>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           const std::complex<double>** a, std::int64_t* lda,
-                           std::complex<double>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, float** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, double** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           std::complex<float>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::mklgpu> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           std::complex<double>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
diff --git a/include/oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp b/include/oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp
deleted file mode 100644
index cf5a2a398..000000000
--- a/include/oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BLAS_MKLGPU_HPP_
-#define _ONEMKL_BLAS_MKLGPU_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklgpu {
-namespace column_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} //namespace column_major
-namespace row_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} //namespace row_major
-} //namespace mklgpu
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BLAS_MKLGPU_HPP_
diff --git a/include/oneapi/mkl/blas/detail/netlib/blas_ct.hpp b/include/oneapi/mkl/blas/detail/netlib/blas_ct.hpp
deleted file mode 100644
index cdfc79e7f..000000000
--- a/include/oneapi/mkl/blas/detail/netlib/blas_ct.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _DETAIL_NETLIB_BLAS_CT_HPP_
-#define _DETAIL_NETLIB_BLAS_CT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-#include "oneapi/mkl/blas/detail/blas_ct_backends.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#define MAJOR column_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace column_major
-namespace row_major {
-
-#define MAJOR row_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_DETAIL_NETLIB_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx b/include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx
deleted file mode 100644
index 404d79ae0..000000000
--- a/include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx
+++ /dev/null
@@ -1,4388 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void herk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c,
-          std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void herk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void scal(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::netlib> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::netlib> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void spr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void spr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                double beta, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, stride_a, b, ldb, stride_b, beta, c,
-                                                 ldc, stride_c, batch_size);
-}
-
-void syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a,
-                                           lda, beta, c, ldc);
-}
-
-void syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                 batch_size);
-}
-
-void her2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void her2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void hbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void hbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void rot(backend_selector<backend::netlib> selector, std::int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::netlib> selector, std::int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void axpy(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::netlib> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::netlib> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                 y, incy, stridey, batch_size);
-}
-
-void axpby(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-           sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-           sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::netlib> selector, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(backend_selector<backend::netlib> selector, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-           std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy);
-}
-
-void sdsdot(backend_selector<backend::netlib> selector, std::int64_t n, float sb,
-            sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-            std::int64_t incy, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result);
-}
-
-void gerc(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void gerc(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemv_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, float beta, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void gemv_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::netlib> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::netlib> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::netlib> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::netlib> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                 stridea, x, incx, stridex, c, ldc, stridec,
-                                                 batch_size);
-}
-
-void her(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void her(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void hpr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void iamin(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void hpmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void hpmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void spmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                           beta, y, incy);
-}
-
-void gemm_bias(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n,
-                                                k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void swap(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void geru(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void geru(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void nrm2(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<bfloat16, 1> &a, std::int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                           lda, b, ldb, beta, c, ldc);
-}
-
-void syr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void syr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a,
-          std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a, lda);
-}
-
-void ger(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-         std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void ger(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                          lda);
-}
-
-void trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void dotu(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotu(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void hemm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hpr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void hpr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                           alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void dotc(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotc(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void syr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void syr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                          lda);
-}
-
-void trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                           unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void rotmg(backend_selector<backend::netlib> selector, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void rotmg(backend_selector<backend::netlib> selector, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, x, incx);
-}
-
-void trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           a, lda, x, incx);
-}
-
-void copy(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy,
-                                                 stridey, batch_size);
-}
-
-void hemv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void hemv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void gemmt(backend_selector<backend::netlib> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, float alpha,
-           sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-           std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::netlib> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, double alpha,
-           sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::netlib> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::netlib> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void asum(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void sbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void sbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                           x, incx, beta, y, incy);
-}
-
-void tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                           k, a, lda, x, incx);
-}
-
-void spr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void spr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                           incy, a);
-}
-
-void iamax(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::netlib> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void rotm(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void rotm(backend_selector<backend::netlib> selector, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void dot(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::netlib> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void trsm_batch(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha, a, lda, stride_a, b,
-                                                 ldb, stride_b, batch_size);
-}
-
-void her2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void rotg(backend_selector<backend::netlib> selector, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<float, 1> &s) {
-    oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::netlib> selector, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<double, 1> &s) {
-    oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::netlib> selector, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::netlib> selector, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void symv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void symv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::netlib::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                           incx, beta, y, incy);
-}
-
-void omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                     lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                     lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<double, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                    alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-                                                    c, ldc, stride_c, batch_size);
-}
-
-void omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                               ldb);
-}
-
-void omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                stridea, b, ldb, strideb);
-}
-
-void imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                               ldb);
-}
-
-void omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                              lda, beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event syr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a, std::int64_t lda,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-                     float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-                     double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::netlib> selector, std::int64_t n,
-                     std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::netlib> selector, std::int64_t n,
-                     std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, double beta, double *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                     std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                     std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha,
-                           const float **a, std::int64_t *lda, float *beta, float **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha,
-                           const double **a, std::int64_t *lda, double *beta, double **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo *upper_lower,
-                           transpose *trans, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                           std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                           const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                           const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::netlib> selector, uplo upper_lower,
-                           transpose trans, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::netlib> selector, std::int64_t n,
-                    std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                    std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::netlib> selector, std::int64_t n,
-                    std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                    std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::netlib> selector, std::int64_t n, float *x,
-                    std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::netlib> selector, std::int64_t n, double *x,
-                    std::int64_t incx, double *y, std::int64_t incy, double c, double s,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                      s, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-                     const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-                     const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::netlib> selector, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::netlib> selector, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                       incy, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           float *alpha, const float **x, std::int64_t *incx, float **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           double *alpha, const double **x, std::int64_t *incx, double **y,
-                           std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **x,
-                           std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **x,
-                           std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                             incx, stridex, y, incy, stridey,
-                                                             batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::netlib> selector, std::int64_t n, float alpha,
-                      const float *x, std::int64_t incx, const float beta, float *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::netlib> selector, std::int64_t n, double alpha,
-                      const double *x, std::int64_t incx, const double beta, double *y,
-                      std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::netlib> selector, std::int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                      const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::netlib> selector, std::int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                      const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                        beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                      const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, double alpha, const double *a,
-                      std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                      std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                     std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                     std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stridea, const float *x,
-                           std::int64_t incx, std::int64_t stridex, float beta, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stridea, const double *x,
-                           std::int64_t incx, std::int64_t stridex, double beta, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                           std::int64_t *lda, const float **x, std::int64_t *incx, float *beta,
-                           float **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                           std::int64_t *lda, const double **x, std::int64_t *incx, double *beta,
-                           double **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> *beta, std::complex<float> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::netlib> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, x, incx, beta, y, incy,
-                                                             group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const float *a, std::int64_t lda,
-                           std::int64_t stridea, const float *x, std::int64_t incx,
-                           std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const double *a, std::int64_t lda,
-                           std::int64_t stridea, const double *x, std::int64_t incx,
-                           std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<float> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           std::int64_t m, std::int64_t n, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                           std::int64_t incx, std::int64_t stridex, std::complex<double> *c,
-                           std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda,
-                           const float **x, std::int64_t *incx, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda,
-                           const double **x, std::int64_t *incx, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           std::int64_t *m, std::int64_t *n, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n,
-                                                             a, lda, x, incx, c, ldc, group_count,
-                                                             group_size, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const std::complex<float> *x, std::int64_t incx,
-                    std::complex<float> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const std::complex<double> *x, std::int64_t incx,
-                    std::complex<double> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::netlib> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::netlib> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::netlib> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::netlib> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           float *alpha, const float **a, std::int64_t *lda, const float **b,
-                           std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           double *alpha, const double **a, std::int64_t *lda, const double **b,
-                           std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<float> *alpha, const std::complex<float> **a,
-                           std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
-                           std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                           std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                           transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                           sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                           const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-                           sycl::half **c, std::int64_t *ldc, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                           const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                           float *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                           const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                           double *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                           std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           std::complex<double> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                           std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                           transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                           sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                           std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                           std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                           std::int64_t stride_c, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::netlib> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, const float *x, std::int64_t incx, float beta,
-                     float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, const double *x, std::int64_t incx, double beta,
-                     double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                       a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::netlib> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::netlib> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::netlib> selector, std::int64_t n,
-                     std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::netlib> selector, std::int64_t n,
-                     std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::netlib> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::netlib> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                     std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     sycl::half beta, sycl::half *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                     const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                     float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                     std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                     std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                          std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                          std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                          std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                          float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
-                     std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
-                     std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, const float *y,
-                    std::int64_t incy, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::netlib> selector, std::int64_t m, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, const double *y,
-                    std::int64_t incy, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y,
-                                                      incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                           std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                           std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side left_right,
-                           uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                           std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, float *alpha, const float **a, std::int64_t *lda,
-                           float **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                           double **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::netlib> selector, side *left_right,
-                           uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                           std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                     const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                     std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-                     const double *a, std::int64_t lda, const double *x, std::int64_t incx,
-                     double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                     std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                     std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                     const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                     std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                     std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                     std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                               alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                     std::int64_t incy, std::complex<float> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx,
-                     const std::complex<double> *y, std::int64_t incy, std::complex<double> *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                    double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                      x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                     const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                     const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::netlib> selector, side left_right, uplo upper_lower,
-                     transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right,
-                                                       upper_lower, trans, unit_diag, m, n, alpha,
-                                                       a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::netlib> selector, float *d1, float *d2, float *x1,
-                      float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::netlib> selector, double *d1, double *d2,
-                      double *x1, double y1, double *param,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                     std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans,
-                                                       unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::netlib> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::netlib> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                     std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                           std::int64_t group_count, std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           const std::complex<float> **x, std::int64_t *incx,
-                           std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t *n,
-                           const std::complex<double> **x, std::int64_t *incx,
-                           std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-                           std::int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                           const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                           const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                           std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                           const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::netlib> selector, std::int64_t n,
-                           const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                           std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                     const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                     std::complex<float> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                     const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                     std::complex<double> *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::netlib> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      float alpha, const float *a, std::int64_t lda, const float *b,
-                      std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::netlib> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      double alpha, const double *a, std::int64_t lda, const double *b,
-                      std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::netlib> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                      const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-                      std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::netlib> selector, uplo upper_lower,
-                      transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                      std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                        transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                        c, ldc, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                     const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                               lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<float> *x, std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::netlib> selector, std::int64_t n,
-                     const std::complex<double> *x, std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::netlib> selector, std::int64_t n, const float *x,
-                     std::int64_t incx, float *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::netlib> selector, std::int64_t n, const double *x,
-                     std::int64_t incx, double *result,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                       dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const float *a,
-                     std::int64_t lda, float *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const double *a,
-                     std::int64_t lda, double *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                     diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *x, std::int64_t incx, const float *y,
-                     std::int64_t incy, float *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *x, std::int64_t incx, const double *y,
-                     std::int64_t incy, double *a,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::netlib> selector, std::int64_t n, const float *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::netlib> selector, std::int64_t n, const double *x,
-                      std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::netlib> selector, std::int64_t n,
-                      const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::netlib> selector, std::int64_t n,
-                      const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::netlib> selector, std::int64_t n, float *x,
-                     std::int64_t incx, float *y, std::int64_t incy, float *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::netlib> selector, std::int64_t n, double *x,
-                     std::int64_t incx, double *y, std::int64_t incy, double *param,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                       param, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::netlib> selector, float *a, float *b, float *c,
-                     float *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::netlib> selector, double *a, double *b, double *c,
-                     double *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::netlib> selector, std::complex<float> *a,
-                     std::complex<float> *b, float *c, std::complex<float> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::netlib> selector, std::complex<double> *a,
-                     std::complex<double> *b, double *c, std::complex<double> *s,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event sdsdot(backend_selector<backend::netlib> selector, std::int64_t n, float sb,
-                       const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                       float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y,
-                                                         incy, result, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                      std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::netlib> selector, uplo upper_lower, transpose trans,
-                      std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda,
-                      const std::complex<double> *b, std::int64_t ldb, double beta,
-                      std::complex<double> *c, std::int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::netlib> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::netlib> selector, std::int64_t n, const double *x,
-                    std::int64_t incx, const double *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::netlib> selector, std::int64_t n, const float *x,
-                    std::int64_t incx, const float *y, std::int64_t incy, double *result,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                      result, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     float alpha, const float *a, std::int64_t lda, const float *x,
-                     std::int64_t incx, float beta, float *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::netlib> selector, uplo upper_lower, std::int64_t n,
-                     double alpha, const double *a, std::int64_t lda, const double *x,
-                     std::int64_t incx, double beta, double *y, std::int64_t incy,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::netlib> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                      std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<float> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<double> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, std::complex<float> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::netlib> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, std::complex<double> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha,
-                                                           ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                    float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                    double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::netlib> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                  a, lda, beta, b, ldb, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, const float** a,
-                           std::int64_t* lda, float** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, const double** a,
-                           std::int64_t* lda, double** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           const std::complex<float>** a, std::int64_t* lda,
-                           std::complex<float>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           const std::complex<double>** a, std::int64_t* lda,
-                           std::complex<double>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, a, lda, b, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, float** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, double** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           std::complex<float>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::netlib> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           std::complex<double>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                 alpha, ab, lda, ldb, group_count,
-                                                                 groupsize, dependencies);
-    return done;
-}
diff --git a/include/oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp b/include/oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp
deleted file mode 100644
index cb96c9a27..000000000
--- a/include/oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BLAS_NETLIB_HPP_
-#define _ONEMKL_BLAS_NETLIB_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-
-using oneapi::mkl::transpose;
-using oneapi::mkl::uplo;
-using oneapi::mkl::side;
-using oneapi::mkl::diag;
-using oneapi::mkl::offset;
-
-namespace blas {
-namespace netlib {
-namespace column_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} //namespace column_major
-namespace row_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} //namespace row_major
-} //namespace netlib
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BLAS_NETLIB_HPP_
diff --git a/include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx b/include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx
deleted file mode 100644
index fbb64a6a0..000000000
--- a/include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx
+++ /dev/null
@@ -1,2946 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                        oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                        std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                        float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                        oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                        std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                        double beta, sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                        oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                        std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                        std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                        oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                        std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                        std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                        oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                        std::int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-                        std::int64_t lda, sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                        sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                        oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                        std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-                        std::int64_t lda, sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                        float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                        oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                        std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a,
-                        std::int64_t lda, sycl::buffer<bfloat16, 1> &b, std::int64_t ldb,
-                        float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                        std::int64_t ldb, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void hemm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                        std::int64_t ldb, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void hemm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-                        sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-                        sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                              oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                              float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                              oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                              double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                              oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                              oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, float beta,
-                        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-                        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                         sycl::buffer<float, 1> &a, std::int64_t lda,
-                         sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                         sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                         sycl::buffer<double, 1> &a, std::int64_t lda,
-                         sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, std::complex<float> beta,
-                         sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, std::complex<double> beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                         std::int64_t ldb, float beta, sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc);
-
-ONEMKL_EXPORT void her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                         std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                         std::int64_t ldb, double beta,
-                         sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                        std::int64_t ldb);
-
-ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb);
-
-ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                        std::int64_t ldb);
-
-ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                        oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b,
-                        std::int64_t ldb);
-
-ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                        double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                              std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                              std::int64_t lda, std::int64_t stridea, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, float beta,
-                              sycl::buffer<float, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                              std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, double beta, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                              std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                              std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                              std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                              std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stridea, sycl::buffer<double, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<double, 1> &c, std::int64_t ldc,
-                              std::int64_t stridec, std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &c,
-                              std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size);
-
-ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                              std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                              std::int64_t lda, std::int64_t stridea,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &c,
-                              std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                        std::int64_t n, std::int64_t kl, std::int64_t ku,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                       sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                       sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
-
-ONEMKL_EXPORT void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
-
-ONEMKL_EXPORT void geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
-
-ONEMKL_EXPORT void geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
-
-ONEMKL_EXPORT void hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::int64_t k, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::int64_t k, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, std::complex<float> beta,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, std::complex<double> beta,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a,
-                       std::int64_t lda);
-
-ONEMKL_EXPORT void her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda);
-
-ONEMKL_EXPORT void her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda);
-
-ONEMKL_EXPORT void hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &a);
-
-ONEMKL_EXPORT void hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                       std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a);
-
-ONEMKL_EXPORT void hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a);
-
-ONEMKL_EXPORT void hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y,
-                        std::int64_t incy, sycl::buffer<std::complex<double>, 1> &a);
-
-ONEMKL_EXPORT void sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                        std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                        std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                        double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a, std::int64_t lda);
-
-ONEMKL_EXPORT void spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        float alpha, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-                        std::int64_t incy);
-
-ONEMKL_EXPORT void spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        double alpha, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                       sycl::buffer<float, 1> &a);
-
-ONEMKL_EXPORT void spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                       double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                       sycl::buffer<double, 1> &a);
-
-ONEMKL_EXPORT void spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &a);
-
-ONEMKL_EXPORT void spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                        double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &a);
-
-ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-                        std::int64_t incx);
-
-ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void dotc(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result);
-
-ONEMKL_EXPORT void dotc(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result);
-
-ONEMKL_EXPORT void dotu(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<float>, 1> &result);
-
-ONEMKL_EXPORT void dotu(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                        sycl::buffer<std::complex<double>, 1> &result);
-
-ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                         std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         sycl::buffer<std::int64_t, 1> &result);
-
-ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-
-ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &result);
-
-ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha,
-                              sycl::buffer<float, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<float, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, float alpha,
-                         sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                         sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, double alpha,
-                         sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                         sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                         std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                         std::int64_t incy);
-
-ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                         std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                         std::int64_t incy);
-
-ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                              std::int64_t incx, std::int64_t stridex,
-                              sycl::buffer<float, 1> &y, std::int64_t incy,
-                              std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<double, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<double, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                              std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                              std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-ONEMKL_EXPORT void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<float, 1> &result);
-
-ONEMKL_EXPORT void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void sdsdot(sycl::queue &queue, std::int64_t n, float sb,
-                          sycl::buffer<float, 1> &x, std::int64_t incx,
-                          sycl::buffer<float, 1> &y, std::int64_t incy,
-                          sycl::buffer<float, 1> &result);
-
-ONEMKL_EXPORT void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                       sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void nrm2(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<float, 1> &result);
-
-ONEMKL_EXPORT void nrm2(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &result);
-
-ONEMKL_EXPORT void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &result);
-
-ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                       float s);
-
-ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n,
-                       sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                       sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-                       double s);
-
-ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                       std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c,
-                       float s);
-
-ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                       std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                       double c, double s);
-
-ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer<float, 1> &a,
-                        sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-                        sycl::buffer<float, 1> &s);
-
-ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer<double, 1> &a,
-                        sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-                        sycl::buffer<double, 1> &s);
-
-ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-                        sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-                        sycl::buffer<std::complex<float>, 1> &s);
-
-ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-                        sycl::buffer<std::complex<double>, 1> &b,
-                        sycl::buffer<double, 1> &c,
-                        sycl::buffer<std::complex<double>, 1> &s);
-
-ONEMKL_EXPORT void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                        sycl::buffer<float, 1> &param);
-
-ONEMKL_EXPORT void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                        sycl::buffer<double, 1> &param);
-
-ONEMKL_EXPORT void rotmg(sycl::queue &queue, sycl::buffer<float, 1> &d1,
-                         sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-                         sycl::buffer<float, 1> &param);
-
-ONEMKL_EXPORT void rotmg(sycl::queue &queue, sycl::buffer<double, 1> &d1,
-                         sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1,
-                         double y1, sycl::buffer<double, 1> &param);
-
-ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, float alpha,
-                        sycl::buffer<float, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, double alpha,
-                        sycl::buffer<double, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, float alpha,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, double alpha,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                        std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                        std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                        sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<float, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<double, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, double beta, sycl::buffer<double, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                              sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                              sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, sycl::half beta,
-                              sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                              std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                              oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                              std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb,
-                              std::int64_t stride_b, float beta, sycl::buffer<std::int32_t, 1> &c,
-                              std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                              oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                              oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                              float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                              oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                              oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                              double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                              oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                              oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<float> alpha,
-                              sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                              oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                              oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                              std::complex<double> alpha,
-                              sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                              std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                         std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                         std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                         float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                         std::int64_t n, std::int64_t k, double alpha,
-                         sycl::buffer<double, 1> &a, std::int64_t lda,
-                         sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                         sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                         std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                         std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                         std::int64_t ldc);
-
-ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                         oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                         std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                         std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                         std::int64_t ldc);
-
-ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                             oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                             std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                             sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-                             sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo,
-                             float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                             oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                             std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                             sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-                             sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo,
-                             float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                             oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                             std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                             sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-                             sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo,
-                             float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                             oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                             std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                             sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-                             sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo,
-                             float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-                             sycl::buffer<int32_t, 1> &co);
-
-ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                                  std::int64_t lda, std::int64_t stride_a,
-                                  sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                  std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                                  std::int64_t lda, std::int64_t stride_a,
-                                  sycl::buffer<double, 1> &b, std::int64_t ldb,
-                                  std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                  std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                                  std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-
-ONEMKL_EXPORT void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, float alpha, sycl::buffer<float, 1> &ab,
-                                  std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                  std::int64_t batch_size);
-
-ONEMKL_EXPORT void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, double alpha, sycl::buffer<double, 1> &ab,
-                                  std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                  std::int64_t batch_size);
-
-ONEMKL_EXPORT void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size);
-
-ONEMKL_EXPORT void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                  std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                                  std::int64_t ldb, std::int64_t stride, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                                 std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                                 std::int64_t ldb, std::int64_t stride_b,
-                                 sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                 std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 std::int64_t stride_a, std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                 std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                                 std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                            std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                            std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, std::complex<float> alpha,
-                            sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                            sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, std::complex<double> alpha,
-                            sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                            sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                             std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                             std::int64_t lda, std::int64_t stridea, sycl::buffer<float, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                             std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                             std::int64_t lda, std::int64_t stridea, sycl::buffer<double, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                             std::int64_t n, std::complex<float> alpha,
-                             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                             std::int64_t n, std::complex<double> alpha,
-                             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                             std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-                             std::int64_t ldb, std::int64_t strideb);
-
-ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, float alpha, sycl::buffer<float, 1> &ab,
-                            std::int64_t lda, std::int64_t ldb);
-
-ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, double alpha, sycl::buffer<double, 1> &ab,
-                            std::int64_t lda, std::int64_t ldb);
-
-ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, std::complex<float> alpha,
-                            sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb);
-
-ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                            std::int64_t n, std::complex<double> alpha,
-                            sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                            std::int64_t ldb);
-
-ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                           oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                           float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-                           sycl::buffer<float, 1> &b, std::int64_t ldb, sycl::buffer<float, 1> &c,
-                           std::int64_t ldc);
-
-ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                           oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                           double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-                           sycl::buffer<double, 1> &b, std::int64_t ldb, sycl::buffer<double, 1> &c,
-                           std::int64_t ldc);
-
-ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                           oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                           std::int64_t lda, std::complex<float> beta,
-                           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                           oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                           std::int64_t lda, std::complex<double> beta,
-                           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-// USM APIs
-
-ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                   oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                                   const float *b, std::int64_t ldb, float beta, float *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                   oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                                   const double *b, std::int64_t ldb, double beta, double *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                   oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                   oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> beta, std::complex<double> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                   oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, sycl::half alpha, const sycl::half *a,
-                                   std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                                   sycl::half beta, sycl::half *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                   oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, float alpha, const sycl::half *a,
-                                   std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                                   float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                   oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, float alpha, const bfloat16 *a, std::int64_t lda,
-                                   const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                        oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                                        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                        const std::int8_t *a, std::int64_t lda, std::int8_t ao,
-                                        const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                                        float beta, std::int32_t *c, std::int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                        oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                                        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                        const std::int8_t *a, std::int64_t lda, std::int8_t ao,
-                                        const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                                        float beta, std::int32_t *c, std::int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                        oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                                        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                        const std::uint8_t *a, std::int64_t lda, std::uint8_t ao,
-                                        const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                                        float beta, std::int32_t *c, std::int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                        oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc,
-                                        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                                        const std::uint8_t *a, std::int64_t lda, std::uint8_t ao,
-                                        const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                                        float beta, std::int32_t *c, std::int64_t ldc,
-                                        const std::int32_t *co,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   float alpha, const float *a, std::int64_t lda, const float *b,
-                                   std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   double alpha, const double *a, std::int64_t lda, const double *b,
-                                   std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *b,
-                                   std::int64_t ldb, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hemm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> beta, std::complex<float> *c,
-                                   std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hemm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *b,
-                                   std::int64_t ldb, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                   float alpha, const float *a, std::int64_t lda, float beta,
-                                   float *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                   double alpha, const double *a, std::int64_t lda, double beta,
-                                   double *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> beta,
-                                   std::complex<float> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> beta,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, float *alpha, const float **a,
-                                         std::int64_t *lda, float *beta, float **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, double *alpha, const double **a,
-                                         std::int64_t *lda, double *beta, double **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, std::complex<float> *alpha,
-                                         const std::complex<float> **a, std::int64_t *lda,
-                                         std::complex<float> *beta, std::complex<float> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *k, std::complex<double> *alpha,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         std::complex<double> *beta, std::complex<double> **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t k, float alpha, const float *a,
-                                         std::int64_t lda, std::int64_t stride_a, float beta,
-                                         float *c, std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t k, double alpha, const double *a,
-                                         std::int64_t lda, std::int64_t stride_a, double beta,
-                                         double *c, std::int64_t ldc, std::int64_t stride_c,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t k, std::complex<float> alpha,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> beta,
-                                         std::complex<float> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t k, std::complex<double> alpha,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> beta,
-                                         std::complex<double> *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                   float alpha, const std::complex<float> *a, std::int64_t lda,
-                                   float beta, std::complex<float> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                   double alpha, const std::complex<double> *a, std::int64_t lda,
-                                   double beta, std::complex<double> *c, std::int64_t ldc,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    float alpha, const float *a, std::int64_t lda, const float *b,
-                                    std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    double alpha, const double *a, std::int64_t lda,
-                                    const double *b, std::int64_t ldb, double beta, double *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<float> alpha, const std::complex<float> *a,
-                                    std::int64_t lda, const std::complex<float> *b,
-                                    std::int64_t ldb, std::complex<float> beta,
-                                    std::complex<float> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<double> alpha, const std::complex<double> *a,
-                                    std::int64_t lda, const std::complex<double> *b,
-                                    std::int64_t ldb, std::complex<double> beta,
-                                    std::complex<double> *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<float> alpha, const std::complex<float> *a,
-                                    std::int64_t lda, const std::complex<float> *b,
-                                    std::int64_t ldb, float beta, std::complex<float> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<double> alpha, const std::complex<double> *a,
-                                    std::int64_t lda, const std::complex<double> *b,
-                                    std::int64_t ldb, double beta, std::complex<double> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   float alpha, const float *a, std::int64_t lda, float *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   double alpha, const double *a, std::int64_t lda, double *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   float alpha, const float *a, std::int64_t lda, float *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   double alpha, const double *a, std::int64_t lda, double *b,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right,
-                                   oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                                         oneapi::mkl::uplo upper_lower,
-                                         oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                         std::int64_t m, std::int64_t n, float alpha,
-                                         const float *a, std::int64_t lda, std::int64_t stride_a,
-                                         float *b, std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                                         oneapi::mkl::uplo upper_lower,
-                                         oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                         std::int64_t m, std::int64_t n, double alpha,
-                                         const double *a, std::int64_t lda, std::int64_t stride_a,
-                                         double *b, std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(
-    sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-    std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(
-    sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-    std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                                         oneapi::mkl::uplo *upper_lower,
-                                         oneapi::mkl::transpose *trans,
-                                         oneapi::mkl::diag *unit_diag, std::int64_t *m,
-                                         std::int64_t *n, float *alpha, const float **a,
-                                         std::int64_t *lda, float **b, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                                         oneapi::mkl::uplo *upper_lower,
-                                         oneapi::mkl::transpose *trans,
-                                         oneapi::mkl::diag *unit_diag, std::int64_t *m,
-                                         std::int64_t *n, double *alpha, const double **a,
-                                         std::int64_t *lda, double **b, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(
-    sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-    oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n,
-    std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-    std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsm_batch(
-    sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-    oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n,
-    std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-    std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                   std::int64_t lda, const float *x, std::int64_t incx, float beta,
-                                   float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                   std::int64_t lda, const double *x, std::int64_t incx,
-                                   double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, float alpha,
-                                         const float *a, std::int64_t lda, std::int64_t stridea,
-                                         const float *x, std::int64_t incx, std::int64_t stridex,
-                                         float beta, float *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, double alpha,
-                                         const double *a, std::int64_t lda, std::int64_t stridea,
-                                         const double *x, std::int64_t incx, std::int64_t stridex,
-                                         double beta, double *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-    std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-    const std::complex<float> *x, std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-    std::complex<float> *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-    std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-    std::int64_t stridea, const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-    std::complex<double> beta, std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *m, std::int64_t *n, float *alpha,
-                                         const float **a, std::int64_t *lda, const float **x,
-                                         std::int64_t *incx, float *beta, float **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *m, std::int64_t *n, double *alpha,
-                                         const double **a, std::int64_t *lda, const double **x,
-                                         std::int64_t *incx, double *beta, double **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *m, std::int64_t *n,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         std::int64_t *lda, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> *beta,
-                                         std::complex<float> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemv_batch(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-    std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-    const std::complex<double> **x, std::int64_t *incx, std::complex<double> *beta,
-    std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-    std::int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                                         std::int64_t m, std::int64_t n, const float *a,
-                                         std::int64_t lda, std::int64_t stridea, const float *x,
-                                         std::int64_t incx, std::int64_t stridex, float *c,
-                                         std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                                         std::int64_t m, std::int64_t n, const double *a,
-                                         std::int64_t lda, std::int64_t stridea, const double *x,
-                                         std::int64_t incx, std::int64_t stridex, double *c,
-                                         std::int64_t ldc, std::int64_t stridec,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                                         std::int64_t m, std::int64_t n,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stridea, const std::complex<float> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<float> *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                                         std::int64_t m, std::int64_t n,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stridea, const std::complex<double> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<double> *c, std::int64_t ldc,
-                                         std::int64_t stridec, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                                         std::int64_t *m, std::int64_t *n, const float **a,
-                                         std::int64_t *lda, const float **x, std::int64_t *incx,
-                                         float **c, std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                                         std::int64_t *m, std::int64_t *n, const double **a,
-                                         std::int64_t *lda, const double **x, std::int64_t *incx,
-                                         double **c, std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                                         std::int64_t *m, std::int64_t *n,
-                                         const std::complex<float> **a, std::int64_t *lda,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                                         std::int64_t *m, std::int64_t *n,
-                                         const std::complex<double> **a, std::int64_t *lda,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   float alpha, const float *a, std::int64_t lda, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   double alpha, const double *a, std::int64_t lda, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   std::complex<float> alpha, const std::complex<float> *a,
-                                   std::int64_t lda, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                   std::complex<double> alpha, const std::complex<double> *a,
-                                   std::int64_t lda, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                  float alpha, const float *x, std::int64_t incx, const float *y,
-                                  std::int64_t incy, float *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                  double alpha, const double *x, std::int64_t incx, const double *y,
-                                  std::int64_t incy, double *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, const std::complex<float> *y,
-                                   std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, const std::complex<double> *y,
-                                   std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, const std::complex<float> *y,
-                                   std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, const std::complex<double> *y,
-                                   std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> beta, std::complex<float> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> beta, std::complex<double> *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, float alpha, const std::complex<float> *x,
-                                  std::int64_t incx, std::complex<float> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, double alpha, const std::complex<double> *x,
-                                  std::int64_t incx, std::complex<double> *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> beta,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> beta,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, float alpha, const std::complex<float> *x,
-                                  std::int64_t incx, std::complex<float> *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, double alpha, const std::complex<double> *x,
-                                  std::int64_t incx, std::complex<double> *a,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, float alpha, const float *a,
-                                   std::int64_t lda, const float *x, std::int64_t incx, float beta,
-                                   float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, std::int64_t k, double alpha, const double *a,
-                                   std::int64_t lda, const double *x, std::int64_t incx,
-                                   double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                                   const float *x, std::int64_t incx, float beta, float *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                                   const double *x, std::int64_t incx, double beta, double *y,
-                                   std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                  float *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                  double *a, std::int64_t lda,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                   const float *y, std::int64_t incy, float *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                   const double *y, std::int64_t incy, double *a, std::int64_t lda,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *a, const float *x,
-                                   std::int64_t incx, float beta, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *a, const double *x,
-                                   std::int64_t incx, double beta, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                  float *a, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                  double *a, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                                   const float *y, std::int64_t incy, float *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha, const double *x, std::int64_t incx,
-                                   const double *y, std::int64_t incy, double *a,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                                   float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const double *a,
-                                   std::int64_t lda, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const float *a, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const double *a, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const float *a, float *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const double *a, double *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const float *a, std::int64_t lda, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const double *a, std::int64_t lda, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const float *a, std::int64_t lda, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const double *a, std::int64_t lda, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                   std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dotc(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dotc(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dotu(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   const std::complex<float> *y, std::int64_t incy,
-                                   std::complex<float> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dotu(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   const std::complex<double> *y, std::int64_t incy,
-                                   std::complex<double> *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<double> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x,
-                                    std::int64_t incx, std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<float> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n,
-                                    const std::complex<double> *x, std::int64_t incx,
-                                    std::int64_t *result,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x,
-                                   std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x,
-                                   std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha,
-                                   const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha,
-                                   const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n,
-                                   std::complex<float> alpha, const std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n,
-                                   std::complex<double> alpha, const std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha,
-                                         const float **x, std::int64_t *incx, float **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha,
-                                         const double **x, std::int64_t *incx, double **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n,
-                                         std::complex<float> *alpha, const std::complex<float> **x,
-                                         std::int64_t *incx, std::complex<float> **y,
-                                         std::int64_t *incy, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n,
-                                         std::complex<double> *alpha,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha,
-                                         const float *x, std::int64_t incx, std::int64_t stridex,
-                                         float *y, std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha,
-                                         const double *x, std::int64_t incx, std::int64_t stridex,
-                                         double *y, std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n,
-                                         std::complex<float> alpha, const std::complex<float> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<float> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n,
-                                         std::complex<double> alpha, const std::complex<double> *x,
-                                         std::int64_t incx, std::int64_t stridex,
-                                         std::complex<double> *y, std::int64_t incy,
-                                         std::int64_t stridey, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha,
-                                    const float *x, std::int64_t incx, const float beta, float *y,
-                                    std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n, double alpha,
-                                    const double *x, std::int64_t incx, const double beta,
-                                    double *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n,
-                                    std::complex<float> alpha, const std::complex<float> *x,
-                                    std::int64_t incx, const std::complex<float> beta,
-                                    std::complex<float> *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n,
-                                    std::complex<double> alpha, const std::complex<double> *x,
-                                    std::int64_t incx, const std::complex<double> beta,
-                                    std::complex<double> *y, std::int64_t incy,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x,
-                                   std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x,
-                                   std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx,
-                                   std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx,
-                                   std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const float **x,
-                                         std::int64_t *incx, float **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x,
-                                         std::int64_t *incx, double **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t *n,
-                                         const std::complex<float> **x, std::int64_t *incx,
-                                         std::complex<float> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t *n,
-                                         const std::complex<double> **x, std::int64_t *incx,
-                                         std::complex<double> **y, std::int64_t *incy,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x,
-                                         std::int64_t incx, std::int64_t stridex, float *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x,
-                                         std::int64_t incx, std::int64_t stridex, double *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n,
-                                         const std::complex<float> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<float> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n,
-                                         const std::complex<double> *x, std::int64_t incx,
-                                         std::int64_t stridex, std::complex<double> *y,
-                                         std::int64_t incy, std::int64_t stridey,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x,
-                                  std::int64_t incx, const float *y, std::int64_t incy,
-                                  float *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x,
-                                  std::int64_t incx, const double *y, std::int64_t incy,
-                                  double *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb,
-                                     const float *x, std::int64_t incx, const float *y,
-                                     std::int64_t incy, float *result,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x,
-                                  std::int64_t incx, const float *y, std::int64_t incy,
-                                  double *result,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<float> *x, std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, std::int64_t n,
-                                   const std::complex<double> *x, std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x,
-                                   std::int64_t incx, float *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x,
-                                   std::int64_t incx, double *result,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<float> *x,
-                                  std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                                  float c, float s,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<double> *x,
-                                  std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                                  double c, double s,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, float *x,
-                                  std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, double *x,
-                                  std::int64_t incx, double *y, std::int64_t incy, double c,
-                                  double s, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c,
-                                   double *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, std::complex<float> *a,
-                                   std::complex<float> *b, float *c, std::complex<float> *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, std::complex<double> *a,
-                                   std::complex<double> *b, double *c, std::complex<double> *s,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x,
-                                   std::int64_t incx, float *y, std::int64_t incy, float *param,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x,
-                                   std::int64_t incx, double *y, std::int64_t incy, double *param,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1,
-                                    float y1, float *param,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1,
-                                    double y1, double *param,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, double *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n,
-                                   std::complex<float> alpha, std::complex<float> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n,
-                                   std::complex<double> alpha, std::complex<double> *x,
-                                   std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha,
-                                   std::complex<float> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha,
-                                   std::complex<double> *x, std::int64_t incx,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, float *x,
-                                   std::int64_t incx, float *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, double *x,
-                                   std::int64_t incx, double *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<float> *x,
-                                   std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<double> *x,
-                                   std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                                         oneapi::mkl::transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, float *alpha,
-                                         const float **a, std::int64_t *lda, const float **b,
-                                         std::int64_t *ldb, float *beta, float **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                                         oneapi::mkl::transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, double *alpha,
-                                         const double **a, std::int64_t *lda, const double **b,
-                                         std::int64_t *ldb, double *beta, double **c,
-                                         std::int64_t *ldc, std::int64_t group_count,
-                                         std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                                         oneapi::mkl::transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k,
-                                         std::complex<float> *alpha, const std::complex<float> **a,
-                                         std::int64_t *lda, const std::complex<float> **b,
-                                         std::int64_t *ldb, std::complex<float> *beta,
-                                         std::complex<float> **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(
-    sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-    std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-    const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **b,
-    std::int64_t *ldb, std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-    std::int64_t group_count, std::int64_t *group_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                                         oneapi::mkl::transpose *transb, std::int64_t *m,
-                                         std::int64_t *n, std::int64_t *k, sycl::half *alpha,
-                                         const sycl::half **a, std::int64_t *lda,
-                                         const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-                                         sycl::half **c, std::int64_t *ldc,
-                                         std::int64_t group_count, std::int64_t *group_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                                     oneapi::mkl::transpose *transb, std::int64_t *m,
-                                     std::int64_t *n, std::int64_t *k, float *alpha,
-                                     const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                                     std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                                     std::int64_t group_count, std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                                     oneapi::mkl::transpose *transb, std::int64_t *m,
-                                     std::int64_t *n, std::int64_t *k, float *alpha,
-                                     const std::int8_t **a, std::int64_t *lda,
-                                     const std::int8_t **b, std::int64_t *ldb, float *beta,
-                                     float **c, std::int64_t *ldc, std::int64_t group_count,
-                                     std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                                     oneapi::mkl::transpose *transb, std::int64_t *m,
-                                     std::int64_t *n, std::int64_t *k, float *alpha,
-                                     const std::int8_t **a, std::int64_t *lda,
-                                     const std::int8_t **b, std::int64_t *ldb, float *beta,
-                                     std::int32_t **c, std::int64_t *ldc, std::int64_t group_count,
-                                     std::int64_t *group_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                         oneapi::mkl::transpose transb, std::int64_t m,
-                                         std::int64_t n, std::int64_t k, float alpha,
-                                         const float *a, std::int64_t lda, std::int64_t stride_a,
-                                         const float *b, std::int64_t ldb, std::int64_t stride_b,
-                                         float beta, float *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                         oneapi::mkl::transpose transb, std::int64_t m,
-                                         std::int64_t n, std::int64_t k, double alpha,
-                                         const double *a, std::int64_t lda, std::int64_t stride_a,
-                                         const double *b, std::int64_t ldb, std::int64_t stride_b,
-                                         double beta, double *c, std::int64_t ldc,
-                                         std::int64_t stride_c, std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(
-    sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-    std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-    const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-    const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-    std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(
-    sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-    std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-    const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-    const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-    std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(
-    sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-    std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a,
-    std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-    std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c,
-    std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                     oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                     std::int64_t k, float alpha, const sycl::half *a,
-                                     std::int64_t lda, std::int64_t stride_a, const sycl::half *b,
-                                     std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
-                                     std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                     oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                     std::int64_t k, float alpha, const std::int8_t *a,
-                                     std::int64_t lda, std::int64_t stride_a, const std::int8_t *b,
-                                     std::int64_t ldb, std::int64_t stride_b, float beta, float *c,
-                                     std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                     oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                     std::int64_t k, float alpha, const std::int8_t *a,
-                                     std::int64_t lda, std::int64_t stride_a, const std::int8_t *b,
-                                     std::int64_t ldb, std::int64_t stride_b, float beta,
-                                     std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                                     std::int64_t batch_size,
-                                     const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                    std::int64_t n, std::int64_t k, float alpha, const float *a,
-                                    std::int64_t lda, const float *b, std::int64_t ldb, float beta,
-                                    float *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                    std::int64_t n, std::int64_t k, double alpha, const double *a,
-                                    std::int64_t lda, const double *b, std::int64_t ldb,
-                                    double beta, double *c, std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                    std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    const std::complex<float> *b, std::int64_t ldb,
-                                    std::complex<float> beta, std::complex<float> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                    std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    const std::complex<double> *b, std::int64_t ldb,
-                                    std::complex<double> beta, std::complex<double> *c,
-                                    std::int64_t ldc,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, float alpha,
-                                         const float *a, std::int64_t lda, std::int64_t stride_a,
-                                         float *b, std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, double alpha,
-                                         const double *a, std::int64_t lda, std::int64_t stride_a,
-                                         double *b, std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                         const std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                         const std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, float alpha, float *ab,
-                                         std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, double alpha, double *ab,
-                                         std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                         std::complex<float> *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                         std::complex<double> *ab, std::int64_t lda,
-                                         std::int64_t ldb, std::int64_t stride,
-                                         std::int64_t batch_size,
-                                         const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                        oneapi::mkl::transpose transb, std::int64_t m,
-                                        std::int64_t n, float alpha, const float *a,
-                                        std::int64_t lda, std::int64_t stride_a, float beta,
-                                        const float *b, std::int64_t ldb, std::int64_t stride_b,
-                                        float *c, std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                        oneapi::mkl::transpose transb, std::int64_t m,
-                                        std::int64_t n, double alpha, const double *a,
-                                        std::int64_t lda, std::int64_t stride_a, double beta,
-                                        const double *b, std::int64_t ldb, std::int64_t stride_b,
-                                        double *c, std::int64_t ldc, std::int64_t stride_c,
-                                        std::int64_t batch_size,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd_batch(
-    sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-    std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-    std::int64_t lda, std::int64_t stride_a, std::complex<float> beta, const std::complex<float> *b,
-    std::int64_t ldb, std::int64_t stride_b, std::complex<float> *c, std::int64_t ldc,
-    std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd_batch(
-    sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-    std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-    std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-    const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b, std::complex<double> *c,
-    std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                                   float *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                                   double *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   const std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   const std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *b, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, float alpha, const float *a,
-                                    std::int64_t lda, std::int64_t stridea, float *b,
-                                    std::int64_t ldb, std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, double alpha, const double *a,
-                                    std::int64_t lda, std::int64_t stridea, double *b,
-                                    std::int64_t ldb, std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                    const std::complex<float> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<float> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                    const std::complex<double> *a, std::int64_t lda,
-                                    std::int64_t stridea, std::complex<double> *b, std::int64_t ldb,
-                                    std::int64_t strideb,
-                                    const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, double alpha, double *ab, std::int64_t lda,
-                                   std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<float> alpha,
-                                   std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                   std::int64_t n, std::complex<double> alpha,
-                                   std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                                   const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                  oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                  float alpha, const float *a, std::int64_t lda, float beta,
-                                  const float *b, std::int64_t ldb, float *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                  oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                  double alpha, const double *a, std::int64_t lda, double beta,
-                                  const double *b, std::int64_t ldb, double *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                  oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                  std::complex<float> alpha, const std::complex<float> *a,
-                                  std::int64_t lda, std::complex<float> beta,
-                                  const std::complex<float> *b, std::int64_t ldb,
-                                  std::complex<float> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                  oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                  std::complex<double> alpha, const std::complex<double> *a,
-                                  std::int64_t lda, std::complex<double> beta,
-                                  const std::complex<double> *b, std::int64_t ldb,
-                                  std::complex<double> *c, std::int64_t ldc,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, float* alpha, const float** a,
-                                         std::int64_t* lda, float** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, double* alpha, const double** a,
-                                         std::int64_t* lda, double** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<float>* alpha,
-                                         const std::complex<float>** a, std::int64_t* lda,
-                                         std::complex<float>** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<double>* alpha,
-                                         const std::complex<double>** a, std::int64_t* lda,
-                                         std::complex<double>** b, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, float* alpha, float** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, double* alpha, double** ab,
-                                         std::int64_t* lda, std::int64_t* ldb,
-                                         std::int64_t group_count, std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<float>* alpha,
-                                         std::complex<float>** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
-
-ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m,
-                                         std::int64_t* n, std::complex<double>* alpha,
-                                         std::complex<double>** ab, std::int64_t* lda,
-                                         std::int64_t* ldb, std::int64_t group_count,
-                                         std::int64_t* groupsize,
-                                         const std::vector<sycl::event>& dependencies = {});
diff --git a/include/oneapi/mkl/blas/detail/portblas/blas_ct.hpp b/include/oneapi/mkl/blas/detail/portblas/blas_ct.hpp
deleted file mode 100644
index 6d3b0b2c2..000000000
--- a/include/oneapi/mkl/blas/detail/portblas/blas_ct.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _DETAIL_PORTBLAS_BLAS_CT_HPP_
-#define _DETAIL_PORTBLAS_BLAS_CT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-#include "oneapi/mkl/blas/detail/blas_ct_backends.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#define MAJOR column_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace column_major
-namespace row_major {
-
-#define MAJOR row_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_DETAIL_PORTBLAS_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx b/include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx
deleted file mode 100644
index 8a66ed707..000000000
--- a/include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx
+++ /dev/null
@@ -1,4296 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void herk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, beta, c, ldc);
-}
-
-void herk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<std::complex<double>, 1> &c,
-          std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, beta, c, ldc);
-}
-
-void scal(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::portblas> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::portblas> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void spr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                            a);
-}
-
-void spr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                            a);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                   alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                   c, ldc, stride_c, batch_size);
-}
-
-void syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, beta, c, ldc);
-}
-
-void syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                   alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                   batch_size);
-}
-
-void syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                   alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                   batch_size);
-}
-
-void syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                   alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                   batch_size);
-}
-
-void syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                   alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                   batch_size);
-}
-
-void her2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a, lda);
-}
-
-void her2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a, lda);
-}
-
-void hbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                             x, incx, beta, y, incy);
-}
-
-void hbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                             x, incx, beta, y, incy);
-}
-
-void rot(backend_selector<backend::portblas> selector, std::int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::portblas> selector, std::int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void axpy(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::portblas> selector, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::portblas> selector, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                   y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                   y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                   y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                   y, incy, stridey, batch_size);
-}
-
-void axpby(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-           std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                              incy);
-}
-
-void axpby(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-           std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                              incy);
-}
-
-void axpby(backend_selector<backend::portblas> selector, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                              incy);
-}
-
-void axpby(backend_selector<backend::portblas> selector, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                              incy);
-}
-
-void sdsdot(backend_selector<backend::portblas> selector, std::int64_t n, float sb,
-            sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-            std::int64_t incy, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy,
-                                               result);
-}
-
-void gerc(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                             lda);
-}
-
-void gerc(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                             lda);
-}
-
-void syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-           sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                              a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                              a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                              a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                              a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void gemv_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, float beta, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                   stridea, x, incx, stridex, beta, y, incy,
-                                                   stridey, batch_size);
-}
-
-void gemv_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                   stridea, x, incx, stridex, beta, y, incy,
-                                                   stridey, batch_size);
-}
-
-void gemv_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                   stridea, x, incx, stridex, beta, y, incy,
-                                                   stridey, batch_size);
-}
-
-void gemv_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                   stridea, x, incx, stridex, beta, y, incy,
-                                                   stridey, batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::portblas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                   stridea, x, incx, stridex, c, ldc, stridec,
-                                                   batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::portblas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                   stridea, x, incx, stridex, c, ldc, stridec,
-                                                   batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::portblas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                   stridea, x, incx, stridex, c, ldc, stridec,
-                                                   batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::portblas> selector, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                   stridea, x, incx, stridex, c, ldc, stridec,
-                                                   batch_size);
-}
-
-void her(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                            lda);
-}
-
-void her(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                            lda);
-}
-
-void hpr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                            a);
-}
-
-void hpr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                            a);
-}
-
-void iamin(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::portblas> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::portblas> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void hpmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x,
-                                             incx, beta, y, incy);
-}
-
-void hpmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x,
-                                             incx, beta, y, incy);
-}
-
-void spmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x,
-                                             incx, beta, y, incy);
-}
-
-void spmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x,
-                                             incx, beta, y, incy);
-}
-
-void gemm_bias(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-               std::int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-               std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                  n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                  co);
-}
-
-void gemm_bias(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-               std::int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-               std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                  n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                  co);
-}
-
-void gemm_bias(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                  n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                  co);
-}
-
-void gemm_bias(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                  n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                  co);
-}
-
-void swap(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void geru(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                             lda);
-}
-
-void geru(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                             lda);
-}
-
-void nrm2(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a,
-          std::int64_t lda, sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a, lda);
-}
-
-void syr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a, lda);
-}
-
-void ger(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                            lda);
-}
-
-void ger(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                            lda);
-}
-
-void trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void dotu(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotu(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void hemm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                             alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                             alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hpr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a);
-}
-
-void hpr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a);
-}
-
-void gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                             lda, x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                             lda, x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                             lda, x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                             lda, x, incx, beta, y, incy);
-}
-
-void tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                             alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                             alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                             alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                             alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void dotc(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotc(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void syr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                            lda);
-}
-
-void syr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda) {
-    oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                            lda);
-}
-
-void trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                             unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void rotmg(backend_selector<backend::portblas> selector, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void rotmg(backend_selector<backend::portblas> selector, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, x, incx);
-}
-
-void trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             a, lda, x, incx);
-}
-
-void copy(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                   incy, stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                   incy, stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                   incy, stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                   incy, stridey, batch_size);
-}
-
-void hemv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void hemv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                              k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, double alpha,
-           sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                              k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                              k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                              k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void asum(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::portblas> selector, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void sbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                             x, incx, beta, y, incy);
-}
-
-void sbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                             x, incx, beta, y, incy);
-}
-
-void tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                             k, a, lda, x, incx);
-}
-
-void spr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a);
-}
-
-void spr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx,
-                                             y, incy, a);
-}
-
-void iamax(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::portblas> selector, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::portblas> selector, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void rotm(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void rotm(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-          sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void dot(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::portblas> selector, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void trsm_batch(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                   trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                   b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                   trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                   b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                   trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                   b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                   trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                   b, ldb, stride_b, batch_size);
-}
-
-void her2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                              a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                              a, lda, b, ldb, beta, c, ldc);
-}
-
-void rotg(backend_selector<backend::portblas> selector, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s) {
-    oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::portblas> selector, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s) {
-    oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::portblas> selector, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::portblas> selector, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void symv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void symv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    oneapi::mkl::blas::portblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                             incx, beta, y, incy);
-}
-
-void omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                       lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                       lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                       lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                       lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                       lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                      alpha, a, lda, stride_a, beta, b, ldb,
-                                                      stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<double, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                      alpha, a, lda, stride_a, beta, b, ldb,
-                                                      stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                      alpha, a, lda, stride_a, beta, b, ldb,
-                                                      stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                      alpha, a, lda, stride_a, beta, b, ldb,
-                                                      stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 b, ldb);
-}
-
-void omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 b, ldb);
-}
-
-void omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 b, ldb);
-}
-
-void omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 b, ldb);
-}
-
-void omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, b, ldb, strideb);
-}
-
-void imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                 ldb);
-}
-
-void imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                 ldb);
-}
-
-void imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                 ldb);
-}
-
-void imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                 ldb);
-}
-
-void omatadd(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                a, lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                a, lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                a, lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha,
-                                                a, lda, beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event syr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                 float *a, std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr2(
-        selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *x, std::int64_t incx, const double *y,
-                 std::int64_t incy, double *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr2(
-        selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::portblas> selector, std::int64_t n,
-                 std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::portblas> selector, std::int64_t n,
-                 std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, float *a,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, double *a,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hpmv(
-        selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hpmv(
-        selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                 float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                 double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                 std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                 std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha,
-                       const float **a, std::int64_t *lda, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha,
-                       const double **a, std::int64_t *lda, double *beta, double **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                       std::int64_t lda, std::int64_t stride_a, float beta, float *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                       const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                       std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::portblas> selector, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                       std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::her2(
-        selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::her2(
-        selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                 lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                 lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::portblas> selector, std::int64_t n,
-                std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                std::int64_t incy, float c, float s, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy,
-                                                        c, s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::portblas> selector, std::int64_t n,
-                std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy,
-                                                        c, s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::portblas> selector, std::int64_t n, float *x,
-                std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy,
-                                                        c, s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::portblas> selector, std::int64_t n, double *x,
-                std::int64_t incx, double *y, std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy,
-                                                        c, s, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-                 const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                         incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-                 const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                         incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::portblas> selector, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                         incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::portblas> selector, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                         incy, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t *n, float *alpha,
-                       const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t *n, double *alpha,
-                       const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **x,
-                       std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **x,
-                       std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-                       const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                               incx, stridex, y, incy, stridey,
-                                                               batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                               incx, stridex, y, incy, stridey,
-                                                               batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<float> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                               incx, stridex, y, incy, stridey,
-                                                               batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<double> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                               incx, stridex, y, incy, stridey,
-                                                               batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::portblas> selector, std::int64_t n, float alpha,
-                  const float *x, std::int64_t incx, const float beta, float *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                          beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::portblas> selector, std::int64_t n, double alpha,
-                  const double *x, std::int64_t incx, const double beta, double *y,
-                  std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                          beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::portblas> selector, std::int64_t n,
-                  std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                  const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                          beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::portblas> selector, std::int64_t n,
-                  std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                  const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                          beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                         y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                         y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                  const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans,
-                                                          n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                  const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans,
-                                                          n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans,
-                                                          n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans,
-                                                          n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose trans,
-                       std::int64_t m, std::int64_t n, float alpha, const float *a,
-                       std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float beta, float *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose trans,
-                       std::int64_t m, std::int64_t n, double alpha, const double *a,
-                       std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double beta, double *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose trans,
-                       std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose trans,
-                       std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                       std::int64_t *lda, const float **x, std::int64_t *incx, float *beta,
-                       float **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                       std::int64_t *lda, const double **x, std::int64_t *incx, double *beta,
-                       double **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **a, std::int64_t *lda,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> *beta,
-                       std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **a, std::int64_t *lda,
-                       const std::complex<double> **x, std::int64_t *incx,
-                       std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       std::int64_t m, std::int64_t n, const float *a, std::int64_t lda,
-                       std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       std::int64_t m, std::int64_t n, const double *a, std::int64_t lda,
-                       std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       std::int64_t m, std::int64_t n, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       std::int64_t m, std::int64_t n, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda,
-                       const float **x, std::int64_t *incx, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda,
-                       const double **x, std::int64_t *incx, double **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const std::complex<float> **a,
-                       std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                       std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                       std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                float alpha, const std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                double alpha, const std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                float alpha, const std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                double alpha, const std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::portblas> selector, std::int64_t n, const double *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::portblas> selector, std::int64_t n,
-                  const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::portblas> selector, std::int64_t n,
-                  const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                       const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const float **a, std::int64_t *lda, const float **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       double *alpha, const double **a, std::int64_t *lda, const double **b,
-                       std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
-                       std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                       std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                       std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                       std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                       const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                       const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::complex<float> beta, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::portblas> selector, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::complex<double> beta, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *a, const float *x, std::int64_t incx, float beta,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::spmv(
-        selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *a, const double *x, std::int64_t incx, double beta,
-                 double *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::spmv(
-        selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::portblas> selector, std::int64_t n, float *x,
-                 std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::portblas> selector, std::int64_t n, double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::portblas> selector, std::int64_t n,
-                 std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::portblas> selector, std::int64_t n,
-                 std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                         y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                         y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::portblas> selector, std::int64_t n, const double *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                 std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                 std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                 const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                 sycl::half beta, sycl::half *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                 std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::portblas> selector, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                 std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::portblas> selector, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                      std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::portblas> selector, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                      std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::portblas> selector, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                      std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::portblas> selector, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                      std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
-                 std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
-                 std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                float *a, std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx,
-                                                        y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::portblas> selector, std::int64_t m, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, const double *y,
-                std::int64_t incy, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx,
-                                                        y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                       std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                       std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<double> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b,
-                       std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                       double **b, std::int64_t *ldb, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-                       std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::portblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                         result, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                         result, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right,
-                                                         upper_lower, m, n, alpha, a, lda, b, ldb,
-                                                         beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right,
-                                                         upper_lower, m, n, alpha, a, lda, b, ldb,
-                                                         beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n,
-                                                         alpha, x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n,
-                                                         alpha, x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                 std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha,
-                                                 a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double *a,
-                 std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha,
-                                                 a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                 std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha,
-                                                 a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                 std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha,
-                                                 a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                 const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right,
-                                                         upper_lower, m, n, alpha, a, lda, b, ldb,
-                                                         beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                 const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right,
-                                                         upper_lower, m, n, alpha, a, lda, b, ldb,
-                                                         beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right,
-                                                         upper_lower, m, n, alpha, a, lda, b, ldb,
-                                                         beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right,
-                                                         upper_lower, m, n, alpha, a, lda, b, ldb,
-                                                         beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                         result, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                         result, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::portblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                         upper_lower, trans, unit_diag, m, n, alpha,
-                                                         a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::portblas> selector, float *d1, float *d2, float *x1,
-                  float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1,
-                                                          param, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::portblas> selector, double *d1, double *d2, double *x1,
-                  double y1, double *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1,
-                                                          param, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                         unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                 std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::portblas> selector, std::int64_t n, const double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t *n,
-                       const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t *n,
-                       const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t *n,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t *n,
-                       const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                       std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::portblas> selector, std::int64_t n,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                  std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                          transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                          c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                  std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                          transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                          c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                          transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                          c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::portblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                          transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                          c, ldc, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                 lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                 lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::portblas> selector, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::portblas> selector, std::int64_t n, const double *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                 float *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n,
-                                                         alpha, x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *x, std::int64_t incx, const double *y,
-                 std::int64_t incy, double *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n,
-                                                         alpha, x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::portblas> selector, std::int64_t n, const double *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::portblas> selector, std::int64_t n,
-                  const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::portblas> selector, std::int64_t n,
-                  const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::portblas> selector, std::int64_t n, float *x,
-                 std::int64_t incx, float *y, std::int64_t incy, float *param,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                         param, dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::portblas> selector, std::int64_t n, double *x,
-                 std::int64_t incx, double *y, std::int64_t incy, double *param,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                         param, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::portblas> selector, float *a, float *b, float *c,
-                 float *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::portblas> selector, double *a, double *b, double *c,
-                 double *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::portblas> selector, std::complex<float> *a,
-                 std::complex<float> *b, float *c, std::complex<float> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::portblas> selector, std::complex<double> *a,
-                 std::complex<double> *b, double *c, std::complex<double> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event sdsdot(backend_selector<backend::portblas> selector, std::int64_t n, float sb,
-                   const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                   float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y,
-                                                           incy, result, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans,
-                                                          n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::portblas> selector, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, double beta, std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans,
-                                                          n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                          dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                        result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::portblas> selector, std::int64_t n, const double *x,
-                std::int64_t incx, const double *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                        result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::portblas> selector, std::int64_t n, const float *x,
-                std::int64_t incx, const float *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                        result, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx,
-                 float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::portblas> selector, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::portblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::portblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::portblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::portblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                      std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<float> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<double> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, std::complex<float> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::portblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, std::complex<double> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                             alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::portblas> selector, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, float alpha, const float *a,
-                    std::int64_t lda, float beta, const float *b, std::int64_t ldb, float *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                            n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                            dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::portblas> selector, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, double alpha, const double *a,
-                    std::int64_t lda, double beta, const double *b, std::int64_t ldb, double *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                            n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                            dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::portblas> selector, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                            n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                            dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::portblas> selector, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                            n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                            dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                           std::int64_t *lda, float **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                           std::int64_t *lda, double **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, float **ab,
-                           std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, double **ab,
-                           std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           std::complex<float> **ab, std::int64_t *lda, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::portblas> selector, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           std::complex<double> **ab, std::int64_t *lda, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
diff --git a/include/oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp b/include/oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp
deleted file mode 100644
index c8d47d742..000000000
--- a/include/oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BLAS_PORTBLAS_HPP_
-#define _ONEMKL_BLAS_PORTBLAS_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <complex>
-
-#include "oneapi/mkl/types.hpp"
-
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-
-using oneapi::mkl::transpose;
-using oneapi::mkl::uplo;
-using oneapi::mkl::side;
-using oneapi::mkl::diag;
-using oneapi::mkl::offset;
-
-namespace blas {
-namespace portblas {
-namespace column_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} //namespace column_major
-namespace row_major {
-
-#include "oneapi/mkl/blas/detail/onemkl_blas_backends.hxx"
-
-} //namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_BLAS_PORTBLAS_HPP_
diff --git a/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hpp b/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hpp
deleted file mode 100644
index 1a019b19e..000000000
--- a/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _DETAIL_ROCBLAS_BLAS_CT_HPP_
-#define _DETAIL_ROCBLAS_BLAS_CT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp"
-#include "oneapi/mkl/blas/detail/blas_ct_backends.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-
-#define MAJOR column_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace column_major
-namespace row_major {
-
-#define MAJOR row_major
-#include "blas_ct.hxx"
-#undef MAJOR
-
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_DETAIL_ROCBLAS_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx b/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx
deleted file mode 100644
index bc86929b0..000000000
--- a/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx
+++ /dev/null
@@ -1,4180 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-void herk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans, int64_t n,
-          int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda, float beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, beta, c, ldc);
-}
-
-void herk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans, int64_t n,
-          int64_t k, double alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          double beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, beta, c, ldc);
-}
-
-void scal(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void scal(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx);
-}
-
-void trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void spr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, int64_t incx, sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void spr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, int64_t incx, sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &b, int64_t ldb,
-                int64_t stride_b, float beta, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<double, 1> &b, int64_t ldb,
-                int64_t stride_b, double beta, sycl::buffer<double, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb,
-                int64_t stride_b, sycl::half beta, sycl::buffer<sycl::half, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb,
-                int64_t stride_b, float beta, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb,
-                int64_t stride_b, float beta, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb,
-                int64_t stride_b, float beta, sycl::buffer<std::int32_t, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k,
-                                                  alpha, a, lda, stride_a, b, ldb, stride_b, beta,
-                                                  c, ldc, stride_c, batch_size);
-}
-
-void syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans, int64_t n,
-          int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
-          sycl::buffer<float, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans, int64_t n,
-          int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
-          sycl::buffer<double, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans, int64_t n,
-          int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-          int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, beta, c, ldc);
-}
-
-void syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans, int64_t n,
-          int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-          int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                            a, lda, beta, c, ldc);
-}
-
-void syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                int64_t n, int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-                int64_t stride_a, float beta, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                  alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                  batch_size);
-}
-
-void syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                int64_t n, int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-                int64_t stride_a, double beta, sycl::buffer<double, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                  alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                  batch_size);
-}
-
-void syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                int64_t n, int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                  alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                  batch_size);
-}
-
-void syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                int64_t n, int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k,
-                                                  alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-                                                  batch_size);
-}
-
-void her2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a, lda);
-}
-
-void her2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a, lda);
-}
-
-void hbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                            x, incx, beta, y, incy);
-}
-
-void hbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                            x, incx, beta, y, incy);
-}
-
-void rot(backend_selector<backend::rocblas> selector, int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, int64_t incy, float c, float s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::rocblas> selector, int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, int64_t incy, double c, double s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, float c, float s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void rot(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, double c, double s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s);
-}
-
-void axpy(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, int64_t incx, sycl::buffer<float, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, int64_t incx, sycl::buffer<double, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, int64_t incx, int64_t stridex, sycl::buffer<float, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                  y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<double, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                  y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                  y, incy, stridey, batch_size);
-}
-
-void axpy_batch(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex,
-                                                  y, incy, stridey, batch_size);
-}
-
-void axpby(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, int64_t incx, float beta, sycl::buffer<float, 1> &y,
-           int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                             incy);
-}
-
-void axpby(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, int64_t incx, double beta, sycl::buffer<double, 1> &y,
-           int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                             incy);
-}
-
-void axpby(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                             incy);
-}
-
-void axpby(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y,
-                                             incy);
-}
-
-void sdsdot(backend_selector<backend::rocblas> selector, int64_t n, float sb,
-            sycl::buffer<float, 1> &x, int64_t incx, sycl::buffer<float, 1> &y, int64_t incy,
-            sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy,
-                                              result);
-}
-
-void gerc(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                            lda);
-}
-
-void gerc(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                            lda);
-}
-
-void syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-           int64_t n, int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-           int64_t n, int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-           int64_t n, int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-           int64_t n, int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx, float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx, double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<float, 1> &x, int64_t incx, int64_t stridex, float beta,
-                sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                  batch_size);
-}
-
-void gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stridex, double beta,
-                sycl::buffer<double, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                  batch_size);
-}
-
-void gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stridea, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-                int64_t stridex, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                  batch_size);
-}
-
-void gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stridea, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-                int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                  stridea, x, incx, stridex, beta, y, incy, stridey,
-                                                  batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m, int64_t n,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stridea, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stridec, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                  stridea, x, incx, stridex, c, ldc, stridec,
-                                                  batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m, int64_t n,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stridec, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                  stridea, x, incx, stridex, c, ldc, stridec,
-                                                  batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stridec,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                  stridea, x, incx, stridex, c, ldc, stridec,
-                                                  batch_size);
-}
-
-void dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stridec,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda,
-                                                  stridea, x, incx, stridex, c, ldc, stridec,
-                                                  batch_size);
-}
-
-void her(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                           lda);
-}
-
-void her(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                           lda);
-}
-
-void hpr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a);
-}
-
-void iamin(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-           int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-           int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::rocblas> selector, int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void iamin(backend_selector<backend::rocblas> selector, int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result);
-}
-
-void hpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                            beta, y, incy);
-}
-
-void hpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                            beta, y, incy);
-}
-
-void spmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                            beta, y, incy);
-}
-
-void spmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx,
-                                            beta, y, incy);
-}
-
-void gemm_bias(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-               offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-               int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                 n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                 co);
-}
-
-void gemm_bias(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-               offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-               int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                 n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                 co);
-}
-
-void gemm_bias(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-               offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, int64_t lda, uint8_t ao, sycl::buffer<int8_t, 1> &b,
-               int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                 n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                 co);
-}
-
-void gemm_bias(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-               offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, int64_t lda, uint8_t ao, sycl::buffer<uint8_t, 1> &b,
-               int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m,
-                                                 n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc,
-                                                 co);
-}
-
-void swap(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void swap(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void geru(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                            lda);
-}
-
-void geru(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                            lda);
-}
-
-void nrm2(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void nrm2(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result);
-}
-
-void gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-          int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                            lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-          int64_t m, int64_t n, int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-          int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                            lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-          int64_t m, int64_t n, int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                            lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-          int64_t m, int64_t n, int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                            lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-          int64_t m, int64_t n, int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-          int64_t lda, sycl::buffer<sycl::half, 1> &b, int64_t ldb, sycl::half beta,
-          sycl::buffer<sycl::half, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                            lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-          int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-          sycl::buffer<sycl::half, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                            lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-          int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a, int64_t lda,
-          sycl::buffer<bfloat16, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a,
-                                            lda, b, ldb, beta, c, ldc);
-}
-
-void syr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, int64_t incx, sycl::buffer<float, 1> &y, int64_t incy,
-          sycl::buffer<float, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a, lda);
-}
-
-void syr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, int64_t incx, sycl::buffer<double, 1> &y, int64_t incy,
-          sycl::buffer<double, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a, lda);
-}
-
-void ger(backend_selector<backend::rocblas> selector, int64_t m, int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, int64_t incx, sycl::buffer<float, 1> &y, int64_t incy,
-         sycl::buffer<float, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void ger(backend_selector<backend::rocblas> selector, int64_t m, int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, int64_t incx, sycl::buffer<double, 1> &y, int64_t incy,
-         sycl::buffer<double, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a,
-                                           lda);
-}
-
-void trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void dotu(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotu(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void hemm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower, int64_t m,
-          int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower, int64_t m,
-          int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hpr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a);
-}
-
-void hpr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a);
-}
-
-void gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          int64_t kl, int64_t ku, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &x, int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                            lda, x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          int64_t kl, int64_t ku, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &x, int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                            lda, x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          int64_t kl, int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                            lda, x, incx, beta, y, incy);
-}
-
-void gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-          int64_t kl, int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                            lda, x, incx, beta, y, incy);
-}
-
-void tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower, int64_t m,
-          int64_t n, float alpha, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower, int64_t m,
-          int64_t n, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-          int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower, int64_t m,
-          int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower, int64_t m,
-          int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                            alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void dotc(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dotc(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void syr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, int64_t incx, sycl::buffer<float, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                           lda);
-}
-
-void syr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, int64_t incx, sycl::buffer<double, 1> &a, int64_t lda) {
-    oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a,
-                                           lda);
-}
-
-void trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans,
-                                            unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void rotmg(backend_selector<backend::rocblas> selector, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void rotmg(backend_selector<backend::rocblas> selector, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param);
-}
-
-void tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, x, incx);
-}
-
-void trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            a, lda, x, incx);
-}
-
-void copy(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy);
-}
-
-void copy_batch(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                  incy, stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                  incy, stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::rocblas> selector, int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                  incy, stridey, batch_size);
-}
-
-void copy_batch(backend_selector<backend::rocblas> selector, int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y,
-                                                  incy, stridey, batch_size);
-}
-
-void hemv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void hemv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, int64_t n, int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                             k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, int64_t n, int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                             k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, int64_t n, int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                             k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-           transpose transb, int64_t n, int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n,
-                                             k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void asum(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::rocblas> selector, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void asum(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result);
-}
-
-void sbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, int64_t k,
-          float alpha, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx, float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                            x, incx, beta, y, incy);
-}
-
-void sbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, int64_t k,
-          double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx, double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda,
-                                            x, incx, beta, y, incy);
-}
-
-void tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-          diag unit_diag, int64_t n, int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n,
-                                            k, a, lda, x, incx);
-}
-
-void spr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, int64_t incx, sycl::buffer<float, 1> &y, int64_t incy,
-          sycl::buffer<float, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a);
-}
-
-void spr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, int64_t incx, sycl::buffer<double, 1> &y, int64_t incy,
-          sycl::buffer<double, 1> &a) {
-    oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y,
-                                            incy, a);
-}
-
-void iamax(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-           int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-           int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::rocblas> selector, int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void iamax(backend_selector<backend::rocblas> selector, int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result);
-}
-
-void rotm(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &param) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void rotm(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &param) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param);
-}
-
-void dot(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void dot(backend_selector<backend::rocblas> selector, int64_t n, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result);
-}
-
-void trsm_batch(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &b,
-                int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                  trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                  b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                  trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                  b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                  trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                  b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower,
-                                                  trans, unit_diag, m, n, alpha, a, lda, stride_a,
-                                                  b, ldb, stride_b, batch_size);
-}
-
-void her2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-           int64_t n, int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-           int64_t n, int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha,
-                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void rotg(backend_selector<backend::rocblas> selector, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::rocblas> selector, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::rocblas> selector, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void rotg(backend_selector<backend::rocblas> selector, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s);
-}
-
-void symv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void symv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    oneapi::mkl::blas::rocblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x,
-                                            incx, beta, y, incy);
-}
-
-void omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                      lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                      lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                      lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a,
-                                                      lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                      lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                      lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                      lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab,
-                                                      lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                     alpha, a, lda, stride_a, beta, b, ldb,
-                                                     stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<double, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                     alpha, a, lda, stride_a, beta, b, ldb,
-                                                     stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                     alpha, a, lda, stride_a, beta, b, ldb,
-                                                     stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n,
-                                                     alpha, a, lda, stride_a, beta, b, ldb,
-                                                     stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                                ldb);
-}
-
-void omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                                ldb);
-}
-
-void omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                                ldb);
-}
-
-void omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b,
-                                                ldb);
-}
-
-void omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, b, ldb, strideb);
-}
-
-void omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda,
-                                                 stridea, b, ldb, strideb);
-}
-
-void imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                ldb);
-}
-
-void imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                ldb);
-}
-
-void imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                ldb);
-}
-
-void imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda,
-                                                ldb);
-}
-
-void omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                               lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                               lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                               lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a,
-                                               lda, beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event syr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 float alpha, const float *x, int64_t incx, const float *y, int64_t incy, float *a,
-                 int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 double alpha, const double *x, int64_t incx, const double *y, int64_t incy,
-                 double *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::rocblas> selector, int64_t n, float alpha, float *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::rocblas> selector, int64_t n, double alpha, double *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> alpha,
-                 std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> alpha,
-                 std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-                 std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event scal(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-                 std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<float> *a, int64_t lda,
-                 std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<double> *a, int64_t lda,
-                 std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const float *a, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const double *a, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<float> *a, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<double> *a, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                float alpha, const float *x, int64_t incx, float *a,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event spr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                double alpha, const double *x, int64_t incx, double *a,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                        a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hpmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha,
-                                                        a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 int64_t n, int64_t k, float alpha, const float *a, int64_t lda, float beta,
-                 float *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 int64_t n, int64_t k, double alpha, const double *a, int64_t lda, double beta,
-                 double *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                 int64_t lda, std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                 int64_t lda, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo *upper_lower,
-                       transpose *trans, int64_t *n, int64_t *k, float *alpha, const float **a,
-                       int64_t *lda, float *beta, float **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo *upper_lower,
-                       transpose *trans, int64_t *n, int64_t *k, double *alpha, const double **a,
-                       int64_t *lda, double *beta, double **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo *upper_lower,
-                       transpose *trans, int64_t *n, int64_t *k, std::complex<float> *alpha,
-                       const std::complex<float> **a, int64_t *lda, std::complex<float> *beta,
-                       std::complex<float> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo *upper_lower,
-                       transpose *trans, int64_t *n, int64_t *k, std::complex<double> *alpha,
-                       const std::complex<double> **a, int64_t *lda, std::complex<double> *beta,
-                       std::complex<double> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower,
-                       transpose trans, int64_t n, int64_t k, float alpha, const float *a,
-                       int64_t lda, int64_t stride_a, float beta, float *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower,
-                       transpose trans, int64_t n, int64_t k, double alpha, const double *a,
-                       int64_t lda, int64_t stride_a, double beta, double *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower,
-                       transpose trans, int64_t n, int64_t k, std::complex<float> alpha,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event syrk_batch(backend_selector<backend::rocblas> selector, uplo upper_lower,
-                       transpose trans, int64_t n, int64_t k, std::complex<double> alpha,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 int64_t k, std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 int64_t k, std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> *x,
-                int64_t incx, std::complex<float> *y, int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                       s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> *x,
-                int64_t incx, std::complex<double> *y, int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                       s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::rocblas> selector, int64_t n, float *x, int64_t incx,
-                float *y, int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                       s, dependencies);
-    return done;
-}
-
-sycl::event rot(backend_selector<backend::rocblas> selector, int64_t n, double *x, int64_t incx,
-                double *y, int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c,
-                                                       s, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-                 const float *x, int64_t incx, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                        incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-                 const double *x, int64_t incx, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                        incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                        incy, dependencies);
-    return done;
-}
-
-sycl::event axpy(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y,
-                                                        incy, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t *n, float *alpha,
-                       const float **x, int64_t *incx, float **y, int64_t *incy,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t *n, double *alpha,
-                       const double **x, int64_t *incx, double **y, int64_t *incy,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **x, int64_t *incx,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **x, int64_t *incx,
-                       std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(
-        selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-                       const float *x, int64_t incx, int64_t stridex, float *y, int64_t incy,
-                       int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                              incx, stridex, y, incy, stridey,
-                                                              batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-                       const double *x, int64_t incx, int64_t stridex, double *y, int64_t incy,
-                       int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                              incx, stridex, y, incy, stridey,
-                                                              batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *x, int64_t incx,
-                       int64_t stridex, std::complex<float> *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                              incx, stridex, y, incy, stridey,
-                                                              batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpy_batch(backend_selector<backend::rocblas> selector, int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
-                       int64_t stridex, std::complex<double> *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x,
-                                                              incx, stridex, y, incy, stridey,
-                                                              batch_size, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::rocblas> selector, int64_t n, float alpha,
-                  const float *x, int64_t incx, const float beta, float *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                         beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::rocblas> selector, int64_t n, double alpha,
-                  const double *x, int64_t incx, const double beta, double *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                         beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, int64_t incx, const std::complex<float> beta,
-                  std::complex<float> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                         beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event axpby(backend_selector<backend::rocblas> selector, int64_t n,
-                  std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
-                  const std::complex<double> beta, std::complex<double> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx,
-                                                         beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                        y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event gerc(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx,
-                                                        y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                  int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
-                  int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                  int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                  int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event syr2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta,
-                 float *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                        lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 double alpha, const double *a, int64_t lda, const double *x, int64_t incx,
-                 double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                        lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                        lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a,
-                                                        lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m,
-                       int64_t n, float alpha, const float *a, int64_t lda, int64_t stridea,
-                       const float *x, int64_t incx, int64_t stridex, float beta, float *y,
-                       int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m,
-                       int64_t n, double alpha, const double *a, int64_t lda, int64_t stridea,
-                       const double *x, int64_t incx, int64_t stridex, double beta, double *y,
-                       int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m,
-                       int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                       int64_t lda, int64_t stridea, const std::complex<float> *x, int64_t incx,
-                       int64_t stridex, std::complex<float> beta, std::complex<float> *y,
-                       int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose trans, int64_t m,
-                       int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                       int64_t lda, int64_t stridea, const std::complex<double> *x, int64_t incx,
-                       int64_t stridex, std::complex<double> beta, std::complex<double> *y,
-                       int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy,
-        stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose *trans, int64_t *m,
-                       int64_t *n, float *alpha, const float **a, int64_t *lda, const float **x,
-                       int64_t *incx, float *beta, float **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose *trans, int64_t *m,
-                       int64_t *n, double *alpha, const double **a, int64_t *lda, const double **x,
-                       int64_t *incx, double *beta, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose *trans, int64_t *m,
-                       int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-                       int64_t *lda, const std::complex<float> **x, int64_t *incx,
-                       std::complex<float> *beta, std::complex<float> **y, int64_t *incy,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event gemv_batch(backend_selector<backend::rocblas> selector, transpose *trans, int64_t *m,
-                       int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-                       int64_t *lda, const std::complex<double> **x, int64_t *incx,
-                       std::complex<double> *beta, std::complex<double> **y, int64_t *incy,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count,
-        group_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m,
-                       int64_t n, const float *a, int64_t lda, int64_t stridea, const float *x,
-                       int64_t incx, int64_t stridex, float *c, int64_t ldc, int64_t stridec,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m,
-                       int64_t n, const double *a, int64_t lda, int64_t stridea, const double *x,
-                       int64_t incx, int64_t stridex, double *c, int64_t ldc, int64_t stridec,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m,
-                       int64_t n, const std::complex<float> *a, int64_t lda, int64_t stridea,
-                       const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> *c, int64_t ldc, int64_t stridec, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side left_right, int64_t m,
-                       int64_t n, const std::complex<double> *a, int64_t lda, int64_t stridea,
-                       const std::complex<double> *x, int64_t incx, int64_t stridex,
-                       std::complex<double> *c, int64_t ldc, int64_t stridec, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec,
-        batch_size, dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side *left_right, int64_t *m,
-                       int64_t *n, const float **a, int64_t *lda, const float **x, int64_t *incx,
-                       float **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side *left_right, int64_t *m,
-                       int64_t *n, const double **a, int64_t *lda, const double **x, int64_t *incx,
-                       double **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side *left_right, int64_t *m,
-                       int64_t *n, const std::complex<float> **a, int64_t *lda,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event dgmm_batch(backend_selector<backend::rocblas> selector, side *left_right, int64_t *m,
-                       int64_t *n, const std::complex<double> **a, int64_t *lda,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(
-        selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size,
-        dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                float alpha, const std::complex<float> *x, int64_t incx, std::complex<float> *a,
-                int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event her(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                double alpha, const std::complex<double> *x, int64_t incx, std::complex<double> *a,
-                int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                float alpha, const std::complex<float> *x, int64_t incx, std::complex<float> *a,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event hpr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                double alpha, const std::complex<double> *x, int64_t incx, std::complex<double> *a,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                  int64_t incx, int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::rocblas> selector, int64_t n, const double *x,
-                  int64_t incx, int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::rocblas> selector, int64_t n,
-                  const std::complex<float> *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event iamin(backend_selector<backend::rocblas> selector, int64_t n,
-                  const std::complex<double> *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha,
-                       const float **a, int64_t *lda, const float **b, int64_t *ldb, float *beta,
-                       float **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k, double *alpha,
-                       const double **a, int64_t *lda, const double **b, int64_t *ldb, double *beta,
-                       double **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                       const std::complex<float> **b, int64_t *ldb, std::complex<float> *beta,
-                       std::complex<float> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a, int64_t *lda,
-                       const std::complex<double> **b, int64_t *ldb, std::complex<double> *beta,
-                       std::complex<double> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k, sycl::half *alpha,
-                       const sycl::half **a, int64_t *lda, const sycl::half **b, int64_t *ldb,
-                       sycl::half *beta, sycl::half **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha,
-                       const sycl::half **a, int64_t *lda, const sycl::half **b, int64_t *ldb,
-                       float *beta, float **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha,
-                       const std::int8_t **a, int64_t *lda, const std::int8_t **b, int64_t *ldb,
-                       float *beta, float **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose *transa,
-                       transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha,
-                       const std::int8_t **a, int64_t *lda, const std::int8_t **b, int64_t *ldb,
-                       float *beta, std::int32_t **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k, float alpha,
-                       const float *a, int64_t lda, int64_t stride_a, const float *b, int64_t ldb,
-                       int64_t stride_b, float beta, float *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k, double alpha,
-                       const double *a, int64_t lda, int64_t stride_a, const double *b, int64_t ldb,
-                       int64_t stride_b, double beta, double *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k, std::complex<float> alpha,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                       std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, const std::complex<double> *b, int64_t ldb,
-                       int64_t stride_b, std::complex<double> beta, std::complex<double> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k, sycl::half alpha,
-                       const sycl::half *a, int64_t lda, int64_t stride_a, const sycl::half *b,
-                       int64_t ldb, int64_t stride_b, sycl::half beta, sycl::half *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k, float alpha,
-                       const sycl::half *a, int64_t lda, int64_t stride_a, const sycl::half *b,
-                       int64_t ldb, int64_t stride_b, float beta, float *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k, float alpha,
-                       const std::int8_t *a, int64_t lda, int64_t stride_a, const std::int8_t *b,
-                       int64_t ldb, int64_t stride_b, float beta, float *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event gemm_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                       transpose transb, int64_t m, int64_t n, int64_t k, float alpha,
-                       const std::int8_t *a, int64_t lda, int64_t stride_a, const std::int8_t *b,
-                       int64_t ldb, int64_t stride_b, float beta, std::int32_t *c, int64_t ldc,
-                       int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(
-        selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 float alpha, const float *a, const float *x, int64_t incx, float beta, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                        a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event spmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 double alpha, const double *a, const double *x, int64_t incx, double beta,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha,
-                                                        a, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::rocblas> selector, int64_t n, float *x, int64_t incx,
-                 float *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::rocblas> selector, int64_t n, double *x, int64_t incx,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::rocblas> selector, int64_t n, std::complex<float> *x,
-                 int64_t incx, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event swap(backend_selector<backend::rocblas> selector, int64_t n, std::complex<double> *x,
-                 int64_t incx, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                        y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event geru(backend_selector<backend::rocblas> selector, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx,
-                                                        y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<float> *x, int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<double> *x, int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                 int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event nrm2(backend_selector<backend::rocblas> selector, int64_t n, const double *x,
-                 int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                 int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
-                 const float *b, int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                 int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
-                 const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                 int64_t m, int64_t n, int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
-                 int64_t ldb, std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                 int64_t m, int64_t n, int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, const std::complex<double> *b,
-                 int64_t ldb, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                 int64_t m, int64_t n, int64_t k, sycl::half alpha, const sycl::half *a,
-                 int64_t lda, const sycl::half *b, int64_t ldb, sycl::half beta, sycl::half *c,
-                 int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                 int64_t m, int64_t n, int64_t k, float alpha, const sycl::half *a, int64_t lda,
-                 const sycl::half *b, int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                 int64_t m, int64_t n, int64_t k, float alpha, const bfloat16 *a, int64_t lda,
-                 const bfloat16 *b, int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::rocblas> selector, transpose transa,
-                      transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k,
-                      float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao,
-                      const std::uint8_t *b, int64_t ldb, std::uint8_t bo, float beta,
-                      std::int32_t *c, int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::rocblas> selector, transpose transa,
-                      transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k,
-                      float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao,
-                      const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta,
-                      std::int32_t *c, int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::rocblas> selector, transpose transa,
-                      transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k,
-                      float alpha, const std::uint8_t *a, int64_t lda, std::uint8_t ao,
-                      const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta,
-                      std::int32_t *c, int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event gemm_bias(backend_selector<backend::rocblas> selector, transpose transa,
-                      transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k,
-                      float alpha, const std::uint8_t *a, int64_t lda, std::uint8_t ao,
-                      const std::uint8_t *b, int64_t ldb, std::uint8_t bo, float beta,
-                      std::int32_t *c, int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(
-        selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta,
-        c, ldc, co, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 int64_t n, int64_t k, float alpha, const std::complex<float> *a, int64_t lda,
-                 float beta, std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event herk(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 int64_t n, int64_t k, double alpha, const std::complex<double> *a, int64_t lda,
-                 double beta, std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::herk(
-        selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::rocblas> selector, int64_t m, int64_t n, float alpha,
-                const float *x, int64_t incx, const float *y, int64_t incy, float *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event ger(backend_selector<backend::rocblas> selector, int64_t m, int64_t n, double alpha,
-                const double *x, int64_t incx, const double *y, int64_t incy, double *a,
-                int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx,
-                                                       y, incy, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, const float *a,
-                 int64_t lda, float *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha,
-                 const double *a, int64_t lda, double *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                       float alpha, const float *a, int64_t lda, int64_t stride_a, float *b,
-                       int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                       double alpha, const double *a, int64_t lda, int64_t stride_a, double *b,
-                       int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-        stride_a, b, ldb, stride_b, batch_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n,
-                       float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n,
-                       double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                       std::complex<float> **b, int64_t *ldb, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event trsm_batch(backend_selector<backend::rocblas> selector, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **a, int64_t *lda,
-                       std::complex<double> **b, int64_t *ldb, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(
-        selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                        result, dependencies);
-    return done;
-}
-
-sycl::event dotu(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy,
-                                                        result, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 int64_t m, int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                 int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hemm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 int64_t m, int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                 int64_t lda, const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event hpr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 int64_t kl, int64_t ku, float alpha, const float *a, int64_t lda, const float *x,
-                 int64_t incx, float beta, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 int64_t kl, int64_t ku, double alpha, const double *a, int64_t lda,
-                 const double *x, int64_t incx, double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 int64_t kl, int64_t ku, std::complex<float> alpha, const std::complex<float> *a,
-                 int64_t lda, const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gbmv(backend_selector<backend::rocblas> selector, transpose trans, int64_t m, int64_t n,
-                 int64_t kl, int64_t ku, std::complex<double> alpha, const std::complex<double> *a,
-                 int64_t lda, const std::complex<double> *x, int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const float *a, int64_t lda, float *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const double *a, int64_t lda, double *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const std::complex<float> *a, int64_t lda,
-                 std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const std::complex<double> *a, int64_t lda,
-                 std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 int64_t m, int64_t n, float alpha, const float *a, int64_t lda, const float *b,
-                 int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 int64_t m, int64_t n, double alpha, const double *a, int64_t lda, const double *b,
-                 int64_t ldb, double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 int64_t m, int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                 int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event symm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 int64_t m, int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                 int64_t lda, const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n,
-                                                alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                        result, dependencies);
-    return done;
-}
-
-sycl::event dotc(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy,
-                                                        result, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                float alpha, const float *x, int64_t incx, float *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event syr(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                double alpha, const double *x, int64_t incx, double *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha,
-                                                       x, incx, a, lda, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, const float *a,
-                 int64_t lda, float *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha,
-                 const double *a, int64_t lda, double *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event trmm(backend_selector<backend::rocblas> selector, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right,
-                                                        upper_lower, trans, unit_diag, m, n, alpha,
-                                                        a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::rocblas> selector, float *d1, float *d2, float *x1,
-                  float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1,
-                                                         param, dependencies);
-    return done;
-}
-
-sycl::event rotmg(backend_selector<backend::rocblas> selector, double *d1, double *d2, double *x1,
-                  double y1, double *param, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1,
-                                                         param, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const float *a, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const double *a, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<float> *a, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tpsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<double> *a, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans,
-                                                        unit_diag, n, a, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<float> *a, int64_t lda,
-                 std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event trsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, const std::complex<double> *a, int64_t lda,
-                 std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                 int64_t incx, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::rocblas> selector, int64_t n, const double *x,
-                 int64_t incx, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event copy(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t *n, const float **x,
-                       int64_t *incx, float **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t *n, const double **x,
-                       int64_t *incx, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t *n,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t *n,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                       int64_t incx, int64_t stridex, float *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t n, const double *x,
-                       int64_t incx, int64_t stridex, double *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t n,
-                       const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event copy_batch(backend_selector<backend::rocblas> selector, int64_t n,
-                       const std::complex<double> *x, int64_t incx, int64_t stridex,
-                       std::complex<double> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch(
-        selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event hemv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::hemv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
-                  const float *b, int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                         transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                         c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, int64_t n, int64_t k, double alpha, const double *a,
-                  int64_t lda, const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                         transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                         c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, int64_t n, int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, int64_t lda, const std::complex<float> *b,
-                  int64_t ldb, std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                         transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                         c, ldc, dependencies);
-    return done;
-}
-
-sycl::event gemmt(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose transa,
-                  transpose transb, int64_t n, int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, int64_t lda, const std::complex<double> *b,
-                  int64_t ldb, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa,
-                                                         transb, n, k, alpha, a, lda, b, ldb, beta,
-                                                         c, ldc, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 int64_t k, float alpha, const float *a, int64_t lda, const float *x, int64_t incx,
-                 float beta, float *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event sbmv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 int64_t k, double alpha, const double *a, int64_t lda, const double *x,
-                 int64_t incx, double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a,
-                                                lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<float> *x, int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::rocblas> selector, int64_t n,
-                 const std::complex<double> *x, int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                 int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event asum(backend_selector<backend::rocblas> selector, int64_t n, const double *x,
-                 int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result,
-                                                        dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const float *a, int64_t lda, float *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const double *a, int64_t lda, double *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const std::complex<float> *a, int64_t lda,
-                 std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event tbsv(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t n, int64_t k, const std::complex<double> *a, int64_t lda,
-                 std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv(
-        selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 float alpha, const float *x, int64_t incx, const float *y, int64_t incy, float *a,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event spr2(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 double alpha, const double *x, int64_t incx, const double *y, int64_t incy,
-                 double *a, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha,
-                                                        x, incx, y, incy, a, dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                  int64_t incx, int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::rocblas> selector, int64_t n, const double *x,
-                  int64_t incx, int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::rocblas> selector, int64_t n,
-                  const std::complex<float> *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event iamax(backend_selector<backend::rocblas> selector, int64_t n,
-                  const std::complex<double> *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result,
-                                                         dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::rocblas> selector, int64_t n, float *x, int64_t incx,
-                 float *y, int64_t incy, float *param,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                        param, dependencies);
-    return done;
-}
-
-sycl::event rotm(backend_selector<backend::rocblas> selector, int64_t n, double *x, int64_t incx,
-                 double *y, int64_t incy, double *param,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy,
-                                                        param, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::rocblas> selector, float *a, float *b, float *c,
-                 float *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::rocblas> selector, double *a, double *b, double *c,
-                 double *s, const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::rocblas> selector, std::complex<float> *a,
-                 std::complex<float> *b, float *c, std::complex<float> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event rotg(backend_selector<backend::rocblas> selector, std::complex<double> *a,
-                 std::complex<double> *b, double *c, std::complex<double> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies);
-    return done;
-}
-
-sycl::event sdsdot(backend_selector<backend::rocblas> selector, int64_t n, float sb, const float *x,
-                   int64_t incx, const float *y, int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y,
-                                                          incy, result, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, float beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event her2k(backend_selector<backend::rocblas> selector, uplo upper_lower, transpose trans,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb, double beta,
-                  std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done =
-        oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k,
-                                                 alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                int64_t incx, const float *y, int64_t incy, float *result,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::rocblas> selector, int64_t n, const double *x,
-                int64_t incx, const double *y, int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event dot(backend_selector<backend::rocblas> selector, int64_t n, const float *x,
-                int64_t incx, const float *y, int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy,
-                                                       result, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta,
-                 float *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event symv(backend_selector<backend::rocblas> selector, uplo upper_lower, int64_t n,
-                 double alpha, const double *a, int64_t lda, const double *x, int64_t incx,
-                 double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::symv(
-        selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatadd_batch(backend_selector<backend::rocblas> selector, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(
-        selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b,
-        c, ldc, stride_c, batch_size, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, a, lda, b, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                      std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<float> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event omatcopy2(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                      std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<double> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(
-        selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<float> alpha, std::complex<float> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event imatcopy(backend_selector<backend::rocblas> selector, transpose trans, std::int64_t m,
-                     std::int64_t n, std::complex<double> alpha, std::complex<double> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n,
-                                                            alpha, ab, lda, ldb, dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                    float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                           n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                           dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                    double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                           n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                           dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                           n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                           dependencies);
-    return done;
-}
-
-sycl::event omatadd(backend_selector<backend::rocblas> selector, transpose transa, transpose transb,
-                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m,
-                                                           n, alpha, a, lda, beta, b, ldb, c, ldc,
-                                                           dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, const float** a,
-                           std::int64_t* lda, float** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, const double** a,
-                           std::int64_t* lda, double** b, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           const std::complex<float>** a, std::int64_t* lda,
-                           std::complex<float>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event omatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           const std::complex<double>** a, std::int64_t* lda,
-                           std::complex<double>** b, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(
-        selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize,
-        dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, float* alpha, float** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                  alpha, ab, lda, ldb, group_count,
-                                                                  groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, double* alpha, double** ab,
-                           std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count,
-                           std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                  alpha, ab, lda, ldb, group_count,
-                                                                  groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<float>* alpha,
-                           std::complex<float>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                  alpha, ab, lda, ldb, group_count,
-                                                                  groupsize, dependencies);
-    return done;
-}
-
-sycl::event imatcopy_batch(backend_selector<backend::rocblas> selector, transpose* trans,
-                           std::int64_t* m, std::int64_t* n, std::complex<double>* alpha,
-                           std::complex<double>** ab, std::int64_t* lda, std::int64_t* ldb,
-                           std::int64_t group_count, std::int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n,
-                                                                  alpha, ab, lda, ldb, group_count,
-                                                                  groupsize, dependencies);
-    return done;
-}
diff --git a/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp b/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp
deleted file mode 100644
index a642e5609..000000000
--- a/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ONEMKL_BLAS_ROCBLAS_HPP_
-#define _ONEMKL_BLAS_ROCBLAS_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-#include <string>
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-
-namespace oneapi {
-namespace mkl {
-using oneapi::mkl::diag;
-using oneapi::mkl::offset;
-using oneapi::mkl::side;
-using oneapi::mkl::transpose;
-using oneapi::mkl::uplo;
-namespace blas {
-namespace rocblas {
-namespace column_major {
-
-#include "onemkl_blas_rocblas.hxx"
-
-} //namespace column_major
-namespace row_major {
-
-#include "onemkl_blas_rocblas.hxx"
-
-} //namespace row_major
-} //namespace rocblas
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BLAS_ROCBLAS_HPP_
diff --git a/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx b/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx
deleted file mode 100644
index 70aabaaf9..000000000
--- a/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx
+++ /dev/null
@@ -1,2160 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// Buffer APIs
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void axpy(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy);
-
-void axpy(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy);
-
-void axpy(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void axpy(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size);
-
-void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-           float beta, sycl::buffer<float, 1> &y, int64_t incy);
-
-void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-           double beta, sycl::buffer<double, 1> &y, int64_t incy);
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy);
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy);
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size);
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result);
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-         sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &result);
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<double, 1> &result);
-
-void dotc(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result);
-
-void dotc(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result);
-
-void dotu(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result);
-
-void dotu(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result);
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result);
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, int64_t incy, float c, float s);
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, int64_t incy, double c, double s);
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, float c, float s);
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-         sycl::buffer<double, 1> &y, int64_t incy, double c, double s);
-
-void rotg(sycl::queue &queue, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &b,
-          sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &b,
-          sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s);
-
-void rotm(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &param);
-
-void rotm(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &param);
-
-void rotmg(sycl::queue &queue, sycl::buffer<float, 1> &d1, sycl::buffer<float, 1> &d2,
-           sycl::buffer<float, 1> &x1, float y1, sycl::buffer<float, 1> &param);
-
-void rotmg(sycl::queue &queue, sycl::buffer<double, 1> &d1, sycl::buffer<double, 1> &d2,
-           sycl::buffer<double, 1> &x1, double y1, sycl::buffer<double, 1> &param);
-
-void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx);
-
-void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx);
-
-void scal(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx);
-
-void scal(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
-
-void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx);
-
-void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx);
-
-void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer<float, 1> &x, int64_t incx,
-            sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result);
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy);
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy);
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          float alpha, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx, float beta, sycl::buffer<float, 1> &y, int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx, double beta, sycl::buffer<double, 1> &y, int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stridea, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stridex, float beta, sycl::buffer<float, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stridex, double beta,
-                sycl::buffer<double, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stridea, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-                int64_t stridex, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stridea, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-                int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stridea, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stridec, int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stridec, int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stridec,
-                int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stridec,
-                int64_t batch_size);
-
-void ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &a,
-         int64_t lda);
-
-void ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &a,
-         int64_t lda);
-
-void gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda);
-
-void gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda);
-
-void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda);
-
-void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda);
-
-void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy);
-
-void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy);
-
-void her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, int64_t lda);
-
-void her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, int64_t lda);
-
-void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda);
-
-void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda);
-
-void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          int64_t incy);
-
-void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          int64_t incy);
-
-void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a);
-
-void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a);
-
-void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a);
-
-void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a);
-
-void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, int64_t incy);
-
-void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, int64_t incy);
-
-void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          int64_t incy);
-
-void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          int64_t incy);
-
-void spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &a);
-
-void spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &a);
-
-void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &a);
-
-void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &a);
-
-void symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          int64_t lda, sycl::buffer<float, 1> &x, int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, int64_t incy);
-
-void symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          int64_t lda, sycl::buffer<double, 1> &x, int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, int64_t incy);
-
-void syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &a, int64_t lda);
-
-void syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &a, int64_t lda);
-
-void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &a,
-          int64_t lda);
-
-void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &a,
-          int64_t lda);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b,
-          int64_t ldb, double beta, sycl::buffer<double, 1> &c, int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          sycl::half alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-          sycl::buffer<sycl::half, 1> &b, int64_t ldb, sycl::half beta,
-          sycl::buffer<sycl::half, 1> &c, int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<bfloat16, 1> &a, int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc);
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda, float beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb,
-          float beta, sycl::buffer<float, 1> &c, int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb,
-          double beta, sycl::buffer<double, 1> &c, int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, float beta, sycl::buffer<float, 1> &c,
-          int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, double beta, sycl::buffer<double, 1> &c,
-          int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, float beta,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                double beta, sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha,
-           sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb,
-           float beta, sycl::buffer<float, 1> &c, int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b,
-           int64_t ldb, double beta, sycl::buffer<double, 1> &c, int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &b, int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &b, int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &b, int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &b, int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, double beta,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                int64_t stride_b, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                int64_t ldb, int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb, int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<double, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                    int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                    int64_t stride_b, int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                    int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    int64_t ldb, int64_t stride_b, int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                   float beta, sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                   double beta, sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                   int64_t stride_a, std::complex<float> beta,
-                   sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                   int64_t lda, int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<float>, 1> &b, int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<double>, 1> &b, int64_t ldb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-               sycl::buffer<float, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<float, 1> &b, int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-               sycl::buffer<double, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<double, 1> &b, int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-               sycl::buffer<std::complex<float>, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-               std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-               std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-               std::int64_t strideb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
-             sycl::buffer<float, 1> &b, int64_t ldb, sycl::buffer<float, 1> &c, int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
-             sycl::buffer<double, 1> &b, int64_t ldb, sycl::buffer<double, 1> &c, int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-             std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-             std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, int64_t ldc);
-
-// USM APIs
-
-sycl::event asum(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x,
-                       int64_t *incx, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                       int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                       int64_t stridex, double *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *x, int64_t incx, int64_t stridex,
-                       std::complex<double> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                  const float beta, float *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                  const double beta, double *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, int64_t incx, const std::complex<float> beta,
-                  std::complex<float> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                  const std::complex<double> *x, int64_t incx, const std::complex<double> beta,
-                  std::complex<double> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<float> **x, int64_t *incx,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t stridex,
-                       float *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx,
-                       int64_t stridex, double *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                       int64_t stridex, std::complex<float> *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                       int64_t stridex, std::complex<double> *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, float *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, int64_t n, const double *x, int64_t incx, const double *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, int64_t n, std::complex<float> *x, int64_t incx,
-                std::complex<float> *y, int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, int64_t n, std::complex<double> *x, int64_t incx,
-                std::complex<double> *y, int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy,
-                float c, float s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy,
-                double c, double s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
-                 std::complex<float> *s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
-                 std::complex<double> *s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotm(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy,
-                 float *param, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotm(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy,
-                 double *param, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, int64_t n, float alpha, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, int64_t n, double alpha, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, int64_t n, std::complex<float> alpha, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, int64_t n, std::complex<double> alpha, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, int64_t n, float alpha, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event scal(sycl::queue &queue, int64_t n, double alpha, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
-                   const float *y, int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, int64_t n, std::complex<float> *x, int64_t incx,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, int64_t n, std::complex<double> *x, int64_t incx,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta,
-                 float *y, int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 double alpha, const double *a, int64_t lda, const double *x, int64_t incx,
-                 double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                 const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                 const double *a, int64_t lda, const double *x, int64_t incx, double beta,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                       const float *a, int64_t lda, int64_t stridea, const float *x, int64_t incx,
-                       int64_t stridex, float beta, float *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                       const double *a, int64_t lda, int64_t stridea, const double *x, int64_t incx,
-                       int64_t stridex, double beta, double *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stridea, const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> beta, std::complex<float> *y, int64_t incy,
-                       int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stridea, const std::complex<double> *x, int64_t incx,
-                       int64_t stridex, std::complex<double> beta, std::complex<double> *y,
-                       int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, float *alpha,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, double *alpha,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx,
-                       double *beta, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> *beta,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **a, int64_t *lda,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> *beta,
-                       std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a,
-                       int64_t lda, int64_t stridea, const float *x, int64_t incx, int64_t stridex,
-                       float *c, int64_t ldc, int64_t stridec, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a,
-                       int64_t lda, int64_t stridea, const double *x, int64_t incx, int64_t stridex,
-                       double *c, int64_t ldc, int64_t stridec, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<float> *a, int64_t lda, int64_t stridea,
-                       const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> *c, int64_t ldc, int64_t stridec, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<double> *a, int64_t lda, int64_t stridea,
-                       const std::complex<double> *x, int64_t incx, int64_t stridex,
-                       std::complex<double> *c, int64_t ldc, int64_t stridec, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx, double **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<float> **a, int64_t *lda, const std::complex<float> **x,
-                       int64_t *incx, std::complex<float> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<double> **a, int64_t *lda, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, const float *x, int64_t incx,
-                const float *y, int64_t incy, float *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, const double *x,
-                int64_t incx, const double *y, int64_t incy, double *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, const std::complex<float> *x,
-                 int64_t incx, std::complex<float> beta, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, const std::complex<double> *x,
-                 int64_t incx, std::complex<double> beta, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-                const std::complex<float> *x, int64_t incx, std::complex<float> *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-                const std::complex<double> *x, int64_t incx, std::complex<double> *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, const std::complex<float> *x, int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, const std::complex<double> *x, int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-                const std::complex<float> *x, int64_t incx, std::complex<float> *a,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-                const std::complex<double> *x, int64_t incx, std::complex<double> *a,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *a,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *a,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha,
-                 const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha,
-                 const double *a, int64_t lda, const double *x, int64_t incx, double beta,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *a,
-                 const float *x, int64_t incx, float beta, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *a,
-                 const double *x, int64_t incx, double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                int64_t incx, float *a, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                int64_t incx, double *a, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                 int64_t incx, const float *y, int64_t incy, float *a,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                 int64_t incx, const double *y, int64_t incy, double *a,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *a,
-                 int64_t lda, const float *x, int64_t incx, float beta, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *a,
-                 int64_t lda, const double *x, int64_t incx, double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                int64_t incx, float *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                int64_t incx, double *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                 int64_t incx, const float *y, int64_t incy, float *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                 int64_t incx, const double *y, int64_t incy, double *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<float> *a, int64_t lda, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<double> *a, int64_t lda, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<float> *a, int64_t lda, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<double> *a, int64_t lda, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const float *a, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const double *a, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<float> *a, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<double> *a, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const float *a, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const double *a, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<float> *a, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<double> *a, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const float *a, int64_t lda, const float *b, int64_t ldb,
-                 float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                 int64_t ldb, double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda, const sycl::half *b,
-                 int64_t ldb, sycl::half beta, sycl::half *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const sycl::half *a, int64_t lda, const sycl::half *b,
-                 int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b,
-                 int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 float alpha, const std::complex<float> *a, int64_t lda, float beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 double alpha, const std::complex<double> *a, int64_t lda, double beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                  const std::complex<float> *b, int64_t ldb, float beta, std::complex<float> *c,
-                  int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                  const std::complex<double> *b, int64_t ldb, double beta, std::complex<double> *c,
-                  int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
-                 float *c, int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 double alpha, const double *a, int64_t lda, const double *b, int64_t ldb,
-                 double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 float alpha, const float *a, int64_t lda, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 double alpha, const double *a, int64_t lda, double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta,
-                       float **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta,
-                       double **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-                       int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
-                       int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       float alpha, const float *a, int64_t lda, int64_t stride_a, float beta,
-                       float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       double alpha, const double *a, int64_t lda, int64_t stride_a, double beta,
-                       double *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, std::complex<float> beta, std::complex<float> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, std::complex<double> beta, std::complex<double> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
-                  float *c, int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  double alpha, const double *a, int64_t lda, const double *b, int64_t ldb,
-                  double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                  const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                  const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                  std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda,
-                 float *b, int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda,
-                 double *b, int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda,
-                 float *b, int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda,
-                 double *b, int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, float alpha, const float *a,
-                       int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, double alpha, const double *a,
-                       int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       std::complex<float> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       std::complex<double> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, float *alpha, const float **a,
-                       int64_t *lda, float **b, int64_t *ldb, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, double *alpha, const double **a,
-                       int64_t *lda, double **b, int64_t *ldb, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **a, int64_t *lda, std::complex<float> **b,
-                       int64_t *ldb, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **a, int64_t *lda, std::complex<double> **b,
-                       int64_t *ldb, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda,
-                       const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda,
-                       const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, std::complex<float> *alpha,
-                       const std::complex<float> **a, int64_t *lda, const std::complex<float> **b,
-                       int64_t *ldb, std::complex<float> *beta, std::complex<float> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, std::complex<double> *alpha,
-                       const std::complex<double> **a, int64_t *lda, const std::complex<double> **b,
-                       int64_t *ldb, std::complex<double> *beta, std::complex<double> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, sycl::half *alpha, const sycl::half **a,
-                       int64_t *lda, const sycl::half **b, int64_t *ldb, sycl::half *beta,
-                       sycl::half **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const sycl::half **a, int64_t *lda,
-                       const sycl::half **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda,
-                       const std::int8_t **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda,
-                       const std::int8_t **b, int64_t *ldb, float *beta, std::int32_t **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                       const float *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                       const double *b, int64_t ldb, int64_t stride_b, double beta, double *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                       int64_t lda, int64_t stride_a, const std::complex<float> *b, int64_t ldb,
-                       int64_t stride_b, std::complex<float> beta, std::complex<float> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                       int64_t lda, int64_t stride_a, const std::complex<double> *b, int64_t ldb,
-                       int64_t stride_b, std::complex<double> beta, std::complex<double> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda,
-                       int64_t stride_a, const sycl::half *b, int64_t ldb, int64_t stride_b,
-                       sycl::half beta, sycl::half *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const sycl::half *a, int64_t lda, int64_t stride_a,
-                       const sycl::half *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a,
-                       const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a,
-                       const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta,
-                       std::int32_t *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
-                  int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                  int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a,
-                      int64_t lda, std::int8_t ao, const std::uint8_t *b, int64_t ldb,
-                      std::uint8_t bo, float beta, std::int32_t *c, int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a,
-                      int64_t lda, std::int8_t ao, const std::int8_t *b, int64_t ldb,
-                      std::int8_t bo, float beta, std::int32_t *c, int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const std::uint8_t *a,
-                      int64_t lda, std::uint8_t ao, const std::int8_t *b, int64_t ldb,
-                      std::int8_t bo, float beta, std::int32_t *c, int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const std::uint8_t *a,
-                      int64_t lda, std::uint8_t ao, const std::uint8_t *b, int64_t ldb,
-                      std::uint8_t bo, float beta, std::int32_t *c, int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                          float beta, const float *b, int64_t ldb, int64_t stride_b, float *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                          double beta, const double *b, int64_t ldb, int64_t stride_b, double *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                          int64_t lda, int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                          int64_t lda, int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                          int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     const float *a, int64_t lda, float *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     const double *a, int64_t lda, double *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                     std::complex<float> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                     std::complex<double> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                      const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                      const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<float> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<double> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    float alpha, const float *a, int64_t lda, float beta, const float *b,
-                    int64_t ldb, float *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    double alpha, const double *a, int64_t lda, double beta, const double *b,
-                    int64_t ldb, double *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                    std::complex<float> beta, const std::complex<float> *b, int64_t ldb,
-                    std::complex<float> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                    std::complex<double> beta, const std::complex<double> *b, int64_t ldb,
-                    std::complex<double> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, const std::complex<float>** a, int64_t* lda,
-                           std::complex<float>** b, int64_t* ldb, int64_t group_count,
-                           int64_t* groupsize, const std::vector<sycl::event>& dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, const std::complex<double>** a,
-                           int64_t* lda, std::complex<double>** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, float** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, double** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, std::complex<float>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, std::complex<double>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies = {});
diff --git a/include/oneapi/mkl/detail/backend_selector.hpp b/include/oneapi/mkl/detail/backend_selector.hpp
deleted file mode 100644
index b0c763ae0..000000000
--- a/include/oneapi/mkl/detail/backend_selector.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BACKEND_SELECTOR_HPP_
-#define _ONEMKL_BACKEND_SELECTOR_HPP_
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-#include "oneapi/mkl/detail/backend_selector_predicates.hpp"
-
-namespace oneapi {
-namespace mkl {
-
-template <backend Backend>
-class backend_selector {
-public:
-    explicit backend_selector(sycl::queue queue) : queue_(queue) {
-        backend_selector_precondition<Backend>(queue_);
-    }
-    sycl::queue& get_queue() {
-        return queue_;
-    }
-
-private:
-    sycl::queue queue_;
-};
-
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_BACKEND_SELECTOR_HPP_
diff --git a/include/oneapi/mkl/detail/backend_selector_predicates.hpp b/include/oneapi/mkl/detail/backend_selector_predicates.hpp
deleted file mode 100644
index 4ee3f3bb1..000000000
--- a/include/oneapi/mkl/detail/backend_selector_predicates.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BACKEND_SELECTOR_PREDICATES_HPP_
-#define _ONEMKL_BACKEND_SELECTOR_PREDICATES_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-namespace oneapi {
-namespace mkl {
-
-template <backend Backend>
-inline void backend_selector_precondition(sycl::queue&) {}
-
-template <>
-inline void backend_selector_precondition<backend::netlib>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-#ifdef __HIPSYCL__
-    if (!(queue.is_host() || queue.get_device().is_cpu())) {
-#else
-    if (!queue.get_device().is_cpu()) {
-#endif
-        throw unsupported_device("",
-                                 "backend_selector<backend::" + backend_map[backend::netlib] + ">",
-                                 queue.get_device());
-    }
-#endif
-}
-
-template <>
-inline void backend_selector_precondition<backend::mklcpu>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-#ifdef __HIPSYCL__
-    if (!(queue.is_host() || queue.get_device().is_cpu())) {
-#else
-    if (!queue.get_device().is_cpu()) {
-#endif
-        throw unsupported_device("",
-                                 "backend_selector<backend::" + backend_map[backend::mklcpu] + ">",
-                                 queue.get_device());
-    }
-#endif
-}
-
-template <>
-inline void backend_selector_precondition<backend::mklgpu>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    unsigned int vendor_id =
-        static_cast<unsigned int>(queue.get_device().get_info<sycl::info::device::vendor_id>());
-    if (!(queue.get_device().is_gpu() && vendor_id == INTEL_ID)) {
-        throw unsupported_device("",
-                                 "backend_selector<backend::" + backend_map[backend::mklgpu] + ">",
-                                 queue.get_device());
-    }
-#endif
-}
-
-template <>
-inline void backend_selector_precondition<backend::cublas>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    unsigned int vendor_id =
-        static_cast<unsigned int>(queue.get_device().get_info<sycl::info::device::vendor_id>());
-    if (!(queue.get_device().is_gpu() && vendor_id == NVIDIA_ID)) {
-        throw unsupported_device("",
-                                 "backend_selector<backend::" + backend_map[backend::cublas] + ">",
-                                 queue.get_device());
-    }
-#endif
-}
-
-template <>
-inline void backend_selector_precondition<backend::cusolver>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    unsigned int vendor_id =
-        static_cast<unsigned int>(queue.get_device().get_info<sycl::info::device::vendor_id>());
-    if (!(queue.get_device().is_gpu() && vendor_id == NVIDIA_ID)) {
-        throw unsupported_device(
-            "", "backend_selector<backend::" + backend_map[backend::cusolver] + ">",
-            queue.get_device());
-    }
-#endif
-}
-
-template <>
-inline void backend_selector_precondition<backend::rocblas>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    unsigned int vendor_id =
-        static_cast<unsigned int>(queue.get_device().get_info<sycl::info::device::vendor_id>());
-    if (!(queue.get_device().is_gpu() && vendor_id == AMD_ID)) {
-        throw unsupported_device("",
-                                 "backend_selector<backend::" + backend_map[backend::rocblas] + ">",
-                                 queue.get_device());
-    }
-#endif
-}
-
-template <>
-inline void backend_selector_precondition<backend::rocrand>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    unsigned int vendor_id =
-        static_cast<unsigned int>(queue.get_device().get_info<sycl::info::device::vendor_id>());
-    if (!(queue.get_device().is_gpu() && vendor_id == AMD_ID)) {
-        throw unsupported_device("",
-                                 "backend_selector<backend::" + backend_map[backend::rocrand] + ">",
-                                 queue.get_device());
-    }
-#endif
-}
-
-template <>
-inline void backend_selector_precondition<backend::rocsolver>(sycl::queue& queue) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    unsigned int vendor_id =
-        static_cast<unsigned int>(queue.get_device().get_info<sycl::info::device::vendor_id>());
-    if (!(queue.get_device().is_gpu() && vendor_id == AMD_ID)) {
-        throw unsupported_device(
-            "", "backend_selector<backend::" + backend_map[backend::rocsolver] + ">",
-            queue.get_device());
-    }
-#endif
-}
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_BACKEND_SELECTOR_PREDICATES_HPP_
diff --git a/include/oneapi/mkl/detail/backends.hpp b/include/oneapi/mkl/detail/backends.hpp
deleted file mode 100644
index 32b7c2614..000000000
--- a/include/oneapi/mkl/detail/backends.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BACKENDS_HPP_
-#define _ONEMKL_BACKENDS_HPP_
-
-#include <map>
-#include <string>
-
-namespace oneapi {
-namespace mkl {
-
-enum class backend {
-    mklcpu,
-    mklgpu,
-    cublas,
-    rocsolver,
-    cusolver,
-    curand,
-    netlib,
-    rocblas,
-    rocrand,
-    portblas,
-    cufft,
-    rocfft,
-    portfft,
-    unsupported
-};
-
-typedef std::map<backend, std::string> backendmap;
-
-static backendmap backend_map = {
-    { backend::mklcpu, "mklcpu" },       { backend::mklgpu, "mklgpu" },
-    { backend::cublas, "cublas" },       { backend::cusolver, "cusolver" },
-    { backend::curand, "curand" },       { backend::netlib, "netlib" },
-    { backend::rocblas, "rocblas" },     { backend::rocrand, "rocrand" },
-    { backend::rocsolver, "rocsolver" }, { backend::portblas, "portblas" },
-    { backend::cufft, "cufft" },         { backend::rocfft, "rocfft" },
-    { backend::portfft, "portfft" },     { backend::unsupported, "unsupported" }
-};
-
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BACKENDS_HPP_
diff --git a/include/oneapi/mkl/detail/backends_table.hpp b/include/oneapi/mkl/detail/backends_table.hpp
deleted file mode 100644
index 8e68674cc..000000000
--- a/include/oneapi/mkl/detail/backends_table.hpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_BACKENDS_TABLE_HPP_
-#define _ONEMKL_BACKENDS_TABLE_HPP_
-
-#include <string>
-#include <vector>
-#include <map>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/config.hpp"
-
-#ifdef __linux__
-#define LIB_NAME(a) "libonemkl_" a ".so"
-#elif defined(_WIN64)
-#define LIB_NAME(a) "onemkl_" a ".dll"
-#endif
-
-namespace oneapi {
-namespace mkl {
-
-enum class device : uint16_t { x86cpu, intelgpu, nvidiagpu, amdgpu };
-enum class domain : uint16_t { blas, dft, lapack, rng, sparse_blas };
-
-static std::map<domain, std::map<device, std::vector<const char*>>> libraries = {
-    { domain::blas,
-      { { device::x86cpu,
-          {
-#ifdef ENABLE_MKLCPU_BACKEND
-              LIB_NAME("blas_mklcpu"),
-#endif
-#ifdef ENABLE_NETLIB_BACKEND
-              LIB_NAME("blas_netlib"),
-#endif
-#ifdef ENABLE_PORTBLAS_BACKEND_INTEL_CPU
-              LIB_NAME("blas_portblas"),
-#endif
-          } },
-        { device::intelgpu,
-          {
-#ifdef ENABLE_MKLGPU_BACKEND
-              LIB_NAME("blas_mklgpu"),
-#endif
-#ifdef ENABLE_PORTBLAS_BACKEND_INTEL_GPU
-              LIB_NAME("blas_portblas"),
-#endif
-          } },
-        { device::amdgpu,
-          {
-#ifdef ENABLE_ROCBLAS_BACKEND
-              LIB_NAME("blas_rocblas"),
-#endif
-#ifdef ENABLE_PORTBLAS_BACKEND_AMD_GPU
-              LIB_NAME("blas_portblas"),
-#endif
-          } },
-        { device::nvidiagpu,
-          {
-#ifdef ENABLE_CUBLAS_BACKEND
-              LIB_NAME("blas_cublas"),
-#endif
-#ifdef ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU
-              LIB_NAME("blas_portblas"),
-#endif
-          } } } },
-
-    { domain::dft,
-      { { device::x86cpu,
-          {
-#ifdef ENABLE_MKLCPU_BACKEND
-              LIB_NAME("dft_mklcpu")
-#endif
-#ifdef ENABLE_PORTFFT_BACKEND
-                  LIB_NAME("dft_portfft")
-#endif
-          } },
-        { device::intelgpu,
-          {
-#ifdef ENABLE_MKLGPU_BACKEND
-              LIB_NAME("dft_mklgpu")
-#endif
-#ifdef ENABLE_PORTFFT_BACKEND
-                  LIB_NAME("dft_portfft")
-#endif
-          } },
-        { device::amdgpu,
-          {
-#ifdef ENABLE_ROCFFT_BACKEND
-              LIB_NAME("dft_rocfft")
-#endif
-#ifdef ENABLE_PORTFFT_BACKEND
-                  LIB_NAME("dft_portfft")
-#endif
-          } },
-        { device::nvidiagpu,
-          {
-#ifdef ENABLE_CUFFT_BACKEND
-              LIB_NAME("dft_cufft")
-#endif
-#ifdef ENABLE_PORTFFT_BACKEND
-                  LIB_NAME("dft_portfft")
-#endif
-          } } } },
-
-    { domain::lapack,
-      { { device::x86cpu,
-          {
-#ifdef ENABLE_MKLCPU_BACKEND
-              LIB_NAME("lapack_mklcpu")
-#endif
-          } },
-        { device::intelgpu,
-          {
-#ifdef ENABLE_MKLGPU_BACKEND
-              LIB_NAME("lapack_mklgpu")
-#endif
-          } },
-        { device::amdgpu,
-          {
-#ifdef ENABLE_ROCSOLVER_BACKEND
-              LIB_NAME("lapack_rocsolver")
-#endif
-          } },
-        { device::nvidiagpu,
-          {
-#ifdef ENABLE_CUSOLVER_BACKEND
-              LIB_NAME("lapack_cusolver")
-#endif
-          } } } },
-
-    { domain::rng,
-      { { device::x86cpu,
-          {
-#ifdef ENABLE_MKLCPU_BACKEND
-              LIB_NAME("rng_mklcpu")
-#endif
-          } },
-        { device::intelgpu,
-          {
-#ifdef ENABLE_MKLGPU_BACKEND
-              LIB_NAME("rng_mklgpu")
-#endif
-          } },
-        { device::amdgpu,
-          {
-#ifdef ENABLE_ROCRAND_BACKEND
-              LIB_NAME("rng_rocrand")
-#endif
-          } },
-        { device::nvidiagpu,
-          {
-#ifdef ENABLE_CURAND_BACKEND
-              LIB_NAME("rng_curand")
-#endif
-          } } } },
-
-    { domain::sparse_blas,
-      { { device::x86cpu,
-          {
-#ifdef ENABLE_MKLCPU_BACKEND
-              LIB_NAME("sparse_blas_mklcpu")
-#endif
-          } },
-        { device::intelgpu,
-          {
-#ifdef ENABLE_MKLGPU_BACKEND
-              LIB_NAME("sparse_blas_mklgpu")
-#endif
-          } } } },
-};
-
-static std::map<domain, const char*> table_names = { { domain::blas, "mkl_blas_table" },
-                                                     { domain::lapack, "mkl_lapack_table" },
-                                                     { domain::dft, "mkl_dft_table" },
-                                                     { domain::rng, "mkl_rng_table" },
-                                                     { domain::sparse_blas,
-                                                       "mkl_sparse_blas_table" } };
-
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_BACKENDS_TABLE_HPP_
diff --git a/include/oneapi/mkl/detail/exceptions.hpp b/include/oneapi/mkl/detail/exceptions.hpp
deleted file mode 100644
index 7767c2ac3..000000000
--- a/include/oneapi/mkl/detail/exceptions.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// These are oneAPI Math Kernel Library (oneMKL) Interfaces specific exceptions
-
-#ifndef _ONEMKL_DETAIL_EXCEPTIONS_HPP_
-#define _ONEMKL_DETAIL_EXCEPTIONS_HPP_
-
-#include <exception>
-#include <string>
-#include "oneapi/mkl/exceptions.hpp"
-
-namespace oneapi {
-namespace mkl {
-
-class backend_not_found : public oneapi::mkl::exception {
-public:
-    backend_not_found(const std::string &info = "")
-            : oneapi::mkl::exception(
-                  "", "", ((info.length() != 0) ? info : "Couldn't load selected backend")) {}
-};
-
-class function_not_found : public oneapi::mkl::exception {
-public:
-    function_not_found(const std::string &info = "")
-            : oneapi::mkl::exception(
-                  "", "",
-                  ((info.length() != 0) ? info : "Couldn't load functions from selected backend")) {
-    }
-};
-
-class specification_mismatch : public oneapi::mkl::exception {
-public:
-    specification_mismatch(const std::string &info = "")
-            : oneapi::mkl::exception(
-                  "", "",
-                  ((info.length() != 0) ? info : "Loaded oneMKL specification version mismatch")) {}
-};
-
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_DETAIL_EXCEPTIONS_HPP_
diff --git a/include/oneapi/mkl/detail/export.hpp b/include/oneapi/mkl/detail/export.hpp
deleted file mode 100644
index cbdd0d08c..000000000
--- a/include/oneapi/mkl/detail/export.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef ONEMKL_EXPORT_H
-#define ONEMKL_EXPORT_H
-
-#include "oneapi/mkl/detail/config.hpp"
-
-#if !defined(BUILD_SHARED_LIBS) || !defined(_WIN64)
-#define ONEMKL_EXPORT
-#define ONEMKL_NO_EXPORT
-#else
-#ifndef ONEMKL_EXPORT
-#ifdef onemkl_EXPORTS
-/* We are building this library */
-#define ONEMKL_EXPORT __declspec(dllexport)
-#else
-/* We are using this library */
-#define ONEMKL_EXPORT __declspec(dllimport)
-#endif
-#endif
-#endif
-
-#endif /* ONEMKL_EXPORT_H */
diff --git a/include/oneapi/mkl/detail/get_device_id.hpp b/include/oneapi/mkl/detail/get_device_id.hpp
deleted file mode 100644
index 88b235754..000000000
--- a/include/oneapi/mkl/detail/get_device_id.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_GET_DEVICE_ID_HPP_
-#define _ONEMKL_GET_DEVICE_ID_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/backends_table.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-
-#define INTEL_ID  32902
-#define NVIDIA_ID 4318
-#ifndef __HIPSYCL__
-#define AMD_ID 4098
-#else
-#define AMD_ID 1022
-#endif
-
-namespace oneapi {
-namespace mkl {
-
-inline oneapi::mkl::device get_device_id(sycl::queue &queue) {
-    oneapi::mkl::device device_id;
-    if (queue.get_device().is_cpu())
-        device_id = device::x86cpu;
-#ifdef __HIPSYCL__
-    else if (queue.is_host())
-        device_id = device::x86cpu;
-#endif
-    else if (queue.get_device().is_gpu()) {
-        unsigned int vendor_id =
-            static_cast<unsigned int>(queue.get_device().get_info<sycl::info::device::vendor_id>());
-
-        if (vendor_id == INTEL_ID)
-            device_id = device::intelgpu;
-        else if (vendor_id == NVIDIA_ID)
-            device_id = device::nvidiagpu;
-        else if (vendor_id == AMD_ID)
-            device_id = device::amdgpu;
-        else {
-            throw unsupported_device("", "", queue.get_device());
-        }
-    }
-    else {
-        throw unsupported_device("", "", queue.get_device());
-    }
-    return device_id;
-}
-
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_GET_DEVICE_ID_HPP_
diff --git a/include/oneapi/mkl/dft.hpp b/include/oneapi/mkl/dft.hpp
deleted file mode 100644
index 17ee4e042..000000000
--- a/include/oneapi/mkl/dft.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_HPP_
-#define _ONEMKL_DFT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-#include "oneapi/mkl/dft/detail/dft_loader.hpp"
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "oneapi/mkl/dft/forward.hpp"
-#include "oneapi/mkl/dft/backward.hpp"
-
-#endif // _ONEMKL_DFT_HPP_
diff --git a/include/oneapi/mkl/dft/backward.hpp b/include/oneapi/mkl/dft/backward.hpp
deleted file mode 100644
index 3cd03e13b..000000000
--- a/include/oneapi/mkl/dft/backward.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_BACKWARD_HPP_
-#define _ONEMKL_DFT_BACKWARD_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "detail/types_impl.hpp"
-
-namespace oneapi::mkl::dft {
-//Buffer version
-
-//In-place transform
-template <typename descriptor_type, typename data_type>
-void compute_backward(descriptor_type &desc, sycl::buffer<data_type, 1> &inout) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    auto type_corrected_inout = inout.template reinterpret<fwd_type, 1>(
-        detail::reinterpret_range<data_type, fwd_type>(inout.size()));
-    get_commit(desc)->backward_ip_cc(desc, type_corrected_inout);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename data_type,
-          std::enable_if_t<detail::valid_ip_realreal_impl<descriptor_type, data_type>, bool> = true>
-void compute_backward(descriptor_type &desc, sycl::buffer<data_type, 1> &inout_re,
-                      sycl::buffer<data_type, 1> &inout_im) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    auto type_corrected_inout_re = inout_re.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<data_type, scalar_type>(inout_re.size()));
-    auto type_corrected_inout_im = inout_im.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<data_type, scalar_type>(inout_im.size()));
-    get_commit(desc)->backward_ip_rr(desc, type_corrected_inout_re, type_corrected_inout_im);
-}
-
-//Out-of-place transform
-template <typename descriptor_type, typename input_type, typename output_type>
-void compute_backward(descriptor_type &desc, sycl::buffer<input_type, 1> &in,
-                      sycl::buffer<output_type, 1> &out) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    using bwd_type = typename detail::descriptor_info<descriptor_type>::backward_type;
-    auto type_corrected_in = in.template reinterpret<bwd_type, 1>(
-        detail::reinterpret_range<input_type, bwd_type>(in.size()));
-    auto type_corrected_out = out.template reinterpret<fwd_type, 1>(
-        detail::reinterpret_range<output_type, fwd_type>(out.size()));
-    get_commit(desc)->backward_op_cc(desc, type_corrected_in, type_corrected_out);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename input_type, typename output_type>
-void compute_backward(descriptor_type &desc, sycl::buffer<input_type, 1> &in_re,
-                      sycl::buffer<input_type, 1> &in_im, sycl::buffer<output_type, 1> &out_re,
-                      sycl::buffer<output_type, 1> &out_im) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    auto type_corrected_in_re = in_re.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<input_type, scalar_type>(in_re.size()));
-    auto type_corrected_in_im = in_im.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<input_type, scalar_type>(in_im.size()));
-    auto type_corrected_out_re = out_re.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<output_type, scalar_type>(out_re.size()));
-    auto type_corrected_out_im = out_im.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<output_type, scalar_type>(out_im.size()));
-    get_commit(desc)->backward_op_rr(desc, type_corrected_in_re, type_corrected_in_im,
-                                     type_corrected_out_re, type_corrected_out_im);
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type, typename data_type>
-sycl::event compute_backward(descriptor_type &desc, data_type *inout,
-                             const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    return get_commit(desc)->backward_ip_cc(desc, reinterpret_cast<fwd_type *>(inout),
-                                            dependencies);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename data_type,
-          std::enable_if_t<detail::valid_ip_realreal_impl<descriptor_type, data_type>, bool> = true>
-sycl::event compute_backward(descriptor_type &desc, data_type *inout_re, data_type *inout_im,
-                             const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    return get_commit(desc)->backward_ip_rr(desc, reinterpret_cast<scalar_type *>(inout_re),
-                                            reinterpret_cast<scalar_type *>(inout_im),
-                                            dependencies);
-}
-
-//Out-of-place transform
-template <typename descriptor_type, typename input_type, typename output_type>
-sycl::event compute_backward(descriptor_type &desc, input_type *in, output_type *out,
-                             const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    using bwd_type = typename detail::descriptor_info<descriptor_type>::backward_type;
-    return get_commit(desc)->backward_op_cc(desc, reinterpret_cast<bwd_type *>(in),
-                                            reinterpret_cast<fwd_type *>(out), dependencies);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename input_type, typename output_type>
-sycl::event compute_backward(descriptor_type &desc, input_type *in_re, input_type *in_im,
-                             output_type *out_re, output_type *out_im,
-                             const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    return get_commit(desc)->backward_op_rr(desc, reinterpret_cast<scalar_type *>(in_re),
-                                            reinterpret_cast<scalar_type *>(in_im),
-                                            reinterpret_cast<scalar_type *>(out_re),
-                                            reinterpret_cast<scalar_type *>(out_im), dependencies);
-}
-} // namespace oneapi::mkl::dft
-
-#endif // _ONEMKL_DFT_BACKWARD_HPP_
diff --git a/include/oneapi/mkl/dft/descriptor.hpp b/include/oneapi/mkl/dft/descriptor.hpp
deleted file mode 100644
index fb618fd42..000000000
--- a/include/oneapi/mkl/dft/descriptor.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_DESCRIPTOR_HPP_
-#define _ONEMKL_DFT_DESCRIPTOR_HPP_
-
-#include "detail/descriptor_impl.hpp"
-#include "types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-/** The detail namespace is required since the MKLGPU backend uses identical 
-names and function signatures in many places. **/
-
-template <precision prec, domain dom>
-using descriptor = detail::descriptor<prec, dom>;
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_DFT_DESCRIPTOR_HPP_
diff --git a/include/oneapi/mkl/dft/detail/commit_impl.hpp b/include/oneapi/mkl/dft/detail/commit_impl.hpp
deleted file mode 100644
index 9e827f357..000000000
--- a/include/oneapi/mkl/dft/detail/commit_impl.hpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_COMMIT_IMPL_HPP_
-#define _ONEMKL_DFT_COMMIT_IMPL_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "descriptor_impl.hpp"
-#include "external_workspace_helper.hpp"
-
-namespace oneapi::mkl {
-enum class backend;
-}
-
-namespace oneapi::mkl::dft::detail {
-
-template <precision prec, domain dom>
-class dft_values;
-
-template <precision prec, domain dom>
-class commit_impl {
-    sycl::queue queue_;
-    mkl::backend backend_;
-
-public:
-    using descriptor_type = typename oneapi::mkl::dft::detail::descriptor<prec, dom>;
-    using fwd_type = typename descriptor_info<descriptor_type>::forward_type;
-    using bwd_type = typename descriptor_info<descriptor_type>::backward_type;
-    using scalar_type = typename descriptor_info<descriptor_type>::scalar_type;
-
-protected:
-    external_workspace_helper<prec, dom> external_workspace_helper_;
-
-public:
-    commit_impl(sycl::queue queue, mkl::backend backend,
-                const dft::detail::dft_values<prec, dom> &config_values)
-            : queue_(queue),
-              backend_(backend),
-              external_workspace_helper_(config_values.workspace_placement ==
-                                         dft::detail::config_value::WORKSPACE_EXTERNAL) {}
-
-    // rule of three
-    commit_impl(const commit_impl &other) = delete;
-    commit_impl &operator=(const commit_impl &other) = delete;
-    virtual ~commit_impl() = default;
-
-    sycl::queue &get_queue() noexcept {
-        return queue_;
-    }
-
-    mkl::backend get_backend() const noexcept {
-        return backend_;
-    }
-
-    virtual void *get_handle() noexcept = 0;
-
-    virtual void commit(const dft_values<prec, dom> &) = 0;
-
-    inline std::int64_t get_workspace_external_bytes() {
-        return external_workspace_helper_.get_rqd_workspace_bytes(*this);
-    }
-
-    // set_workspace should be overridden for any backend that enables external workspaces.
-    // If these are overridden, get_workspace_external_bytes_impl must also be overridden.
-    // For backends that do not support external workspaces, these functions do not need to be overridden.
-    // When not overridden, external workspace support is faked: an external workspace can be set,
-    // and errors will be generated according to the specificiation,
-    // but the required workspace size will always be zero, and any given workspace will not actually be used.
-    virtual void set_workspace(scalar_type *usm_workspace) {
-        external_workspace_helper_.set_workspace_throw(*this, usm_workspace);
-    }
-    virtual void set_workspace(sycl::buffer<scalar_type> &buffer_workspace) {
-        external_workspace_helper_.set_workspace_throw(*this, buffer_workspace);
-    }
-
-    virtual void forward_ip_cc(descriptor_type &desc, sycl::buffer<fwd_type, 1> &inout) = 0;
-    virtual void forward_ip_rr(descriptor_type &desc, sycl::buffer<scalar_type, 1> &inout_re,
-                               sycl::buffer<scalar_type, 1> &inout_im) = 0;
-    virtual void forward_op_cc(descriptor_type &desc, sycl::buffer<fwd_type, 1> &in,
-                               sycl::buffer<bwd_type, 1> &out) = 0;
-    virtual void forward_op_rr(descriptor_type &desc, sycl::buffer<scalar_type, 1> &in_re,
-                               sycl::buffer<scalar_type, 1> &in_im,
-                               sycl::buffer<scalar_type, 1> &out_re,
-                               sycl::buffer<scalar_type, 1> &out_im) = 0;
-
-    virtual sycl::event forward_ip_cc(descriptor_type &desc, fwd_type *inout,
-                                      const std::vector<sycl::event> &dependencies) = 0;
-    virtual sycl::event forward_ip_rr(descriptor_type &desc, scalar_type *inout_re,
-                                      scalar_type *inout_im,
-                                      const std::vector<sycl::event> &dependencies) = 0;
-    virtual sycl::event forward_op_cc(descriptor_type &desc, fwd_type *in, bwd_type *out,
-                                      const std::vector<sycl::event> &dependencies) = 0;
-    virtual sycl::event forward_op_rr(descriptor_type &desc, scalar_type *in_re, scalar_type *in_im,
-                                      scalar_type *out_re, scalar_type *out_im,
-                                      const std::vector<sycl::event> &dependencies) = 0;
-
-    virtual void backward_ip_cc(descriptor_type &desc, sycl::buffer<fwd_type, 1> &inout) = 0;
-    virtual void backward_ip_rr(descriptor_type &desc, sycl::buffer<scalar_type, 1> &inout_re,
-                                sycl::buffer<scalar_type, 1> &inout_im) = 0;
-    virtual void backward_op_cc(descriptor_type &desc, sycl::buffer<bwd_type, 1> &in,
-                                sycl::buffer<fwd_type, 1> &out) = 0;
-    virtual void backward_op_rr(descriptor_type &desc, sycl::buffer<scalar_type, 1> &in_re,
-                                sycl::buffer<scalar_type, 1> &in_im,
-                                sycl::buffer<scalar_type, 1> &out_re,
-                                sycl::buffer<scalar_type, 1> &out_im) = 0;
-
-    virtual sycl::event backward_ip_cc(descriptor_type &desc, fwd_type *inout,
-                                       const std::vector<sycl::event> &dependencies) = 0;
-    virtual sycl::event backward_ip_rr(descriptor_type &desc, scalar_type *inout_re,
-                                       scalar_type *inout_im,
-                                       const std::vector<sycl::event> &dependencies) = 0;
-    virtual sycl::event backward_op_cc(descriptor_type &desc, bwd_type *in, fwd_type *out,
-                                       const std::vector<sycl::event> &dependencies) = 0;
-    virtual sycl::event backward_op_rr(descriptor_type &desc, scalar_type *in_re,
-                                       scalar_type *in_im, scalar_type *out_re, scalar_type *out_im,
-                                       const std::vector<sycl::event> &dependencies) = 0;
-
-    /** For compute calls, throw errors for the external workspace as required.
-     * @tparam ArgTs The non-descriptor arg(s) for the compute call. First one is used to check
-     * buffer or USM call.
-     * @param function_name The function name to user in generated exceptions.
-    */
-    template <typename... ArgTs>
-    void compute_call_throw(const char *function_name) {
-        external_workspace_helper_.template compute_call_throw<ArgTs...>(function_name);
-    }
-
-    /** Create an accessor out of the workspace buffer when required, to ensure correct dependency
-     *  management for the buffer. To be used by backends that don't natively support sycl::buffers.
-     * @param function_name The function name to user in generated exceptions.
-     * @param cgh The command group handler to associate the accessor with.
-    */
-    void add_buffer_workspace_dependency_if_rqd(const char *function_name, sycl::handler &cgh) {
-        external_workspace_helper_.add_buffer_dependency_if_rqd(function_name, cgh);
-    }
-
-    /** If WORKSPACE_EXTERNAL is set, depend on the last USM workspace event added via set_last_usm_workspace_event.
-     * @param cgh The command group handler to associate the accessor with.
-    */
-    void depend_on_last_usm_workspace_event_if_rqd(sycl::handler &cgh) {
-        external_workspace_helper_.depend_on_last_usm_workspace_event_if_rqd(cgh);
-    }
-
-    /** If WORKSPACE_EXTERNAL is set, store the given event internally to allow it to be depended upon by
-     * subsequent calls to depend_on_last_usm_workspace_event.
-     * @param sycl_event The last usage of the USM workspace.
-    */
-    void set_last_usm_workspace_event_if_rqd(sycl::event &sycl_event) {
-        external_workspace_helper_.set_last_usm_workspace_event_if_rqd(sycl_event);
-    }
-
-protected:
-    friend class external_workspace_helper<prec, dom>;
-
-    // This must be reimplemented for backends that support external workspaces.
-    virtual std::int64_t get_workspace_external_bytes_impl() {
-        return 0;
-    }
-};
-
-} // namespace oneapi::mkl::dft::detail
-
-#endif //_ONEMKL_DFT_COMMIT_IMPL_HPP_
diff --git a/include/oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp b/include/oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp
deleted file mode 100644
index 4e4ad2030..000000000
--- a/include/oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_CUFFT_HPP_
-#define _ONEMKL_DFT_CUFFT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-namespace oneapi::mkl::dft::cufft {
-
-#include "oneapi/mkl/dft/detail/dft_ct.hxx"
-
-} // namespace oneapi::mkl::dft::cufft
-
-#endif // _ONEMKL_DFT_CUFFT_HPP_
diff --git a/include/oneapi/mkl/dft/detail/descriptor_impl.hpp b/include/oneapi/mkl/dft/detail/descriptor_impl.hpp
deleted file mode 100644
index a9c3f946c..000000000
--- a/include/oneapi/mkl/dft/detail/descriptor_impl.hpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_DETAIL_DESCRIPTOR_IMPL_HPP_
-#define _ONEMKL_DFT_DETAIL_DESCRIPTOR_IMPL_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/detail/export.hpp"
-
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace detail {
-// Forward declaration:
-template <precision prec, domain dom>
-class descriptor;
-
-template <precision prec, domain dom>
-inline commit_impl<prec, dom>* get_commit(descriptor<prec, dom>& desc);
-
-template <precision prec, domain dom>
-class descriptor {
-private:
-    using scalar_type = typename descriptor_info<descriptor>::scalar_type;
-
-public:
-    // Syntax for 1-dimensional DFT
-    descriptor(std::int64_t length);
-
-    // Syntax for d-dimensional DFT
-    descriptor(std::vector<std::int64_t> dimensions);
-
-    // Copy operations are included in the oneAPI oneMKL specification, but not yet
-    // implemented here. If you need copies, please open an issue at
-    // https://github.com/oneapi-src/oneMKL/issues
-
-    descriptor(descriptor&&);
-
-    descriptor& operator=(descriptor&&);
-
-    ~descriptor();
-
-    void set_value(config_param param, ...);
-
-    void get_value(config_param param, ...) const;
-
-    void commit(sycl::queue& queue);
-
-#ifdef ENABLE_MKLCPU_BACKEND
-    void commit(backend_selector<backend::mklcpu> selector);
-#endif
-
-#ifdef ENABLE_MKLGPU_BACKEND
-    void commit(backend_selector<backend::mklgpu> selector);
-#endif
-
-#ifdef ENABLE_CUFFT_BACKEND
-    void commit(backend_selector<backend::cufft> selector);
-#endif
-
-#ifdef ENABLE_ROCFFT_BACKEND
-    void commit(backend_selector<backend::rocfft> selector);
-#endif
-
-#ifdef ENABLE_PORTFFT_BACKEND
-    void commit(backend_selector<backend::portfft> selector);
-#endif
-
-    const dft_values<prec, dom>& get_values() const noexcept {
-        return values_;
-    }
-
-    void set_workspace(scalar_type* usm_workspace);
-
-    void set_workspace(sycl::buffer<scalar_type>& buffer_workspace);
-
-private:
-    // Has a value when the descriptor is committed.
-    std::unique_ptr<commit_impl<prec, dom>> pimpl_;
-
-    // descriptor configuration values_ and structs
-    dft_values<prec, dom> values_;
-
-    friend commit_impl<prec, dom>* get_commit<prec, dom>(descriptor<prec, dom>&);
-
-    using real_t = typename precision_t<prec>::real_t;
-};
-
-template <precision prec, domain dom>
-inline commit_impl<prec, dom>* get_commit(descriptor<prec, dom>& desc) {
-    return desc.pimpl_.get();
-}
-
-} // namespace detail
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_DFT_DETAIL_DESCRIPTOR_IMPL_HPP_
diff --git a/include/oneapi/mkl/dft/detail/dft_ct.hxx b/include/oneapi/mkl/dft/detail/dft_ct.hxx
deleted file mode 100644
index 20cd537d8..000000000
--- a/include/oneapi/mkl/dft/detail/dft_ct.hxx
+++ /dev/null
@@ -1,138 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Commit
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-ONEMKL_EXPORT dft::detail::commit_impl<prec, dom> *create_commit(
-    const dft::detail::descriptor<prec, dom> &desc, sycl::queue &sycl_queue);
-
-// BUFFER version
-
-template <typename descriptor_type>
-using scalar = typename detail::descriptor_info<descriptor_type>::scalar_type;
-template <typename descriptor_type>
-using fwd = typename detail::descriptor_info<descriptor_type>::forward_type;
-template <typename descriptor_type>
-using bwd = typename detail::descriptor_info<descriptor_type>::backward_type;
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<fwd<descriptor_type>, 1> &inout);
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &inout_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &inout_im);
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer<fwd<descriptor_type>, 1> &in,
-                                   sycl::buffer<bwd<descriptor_type>, 1> &out);
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &in_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &in_im,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &out_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &out_im);
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                          const std::vector<sycl::event> &dependencies);
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar<descriptor_type> *inout_re,
-                                          scalar<descriptor_type> *inout_im,
-                                          const std::vector<sycl::event> &dependencies);
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *in,
-                                          bwd<descriptor_type> *out,
-                                          const std::vector<sycl::event> &dependencies);
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar<descriptor_type> *in_re,
-                                          scalar<descriptor_type> *in_im,
-                                          scalar<descriptor_type> *out_re,
-                                          scalar<descriptor_type> *out_im,
-                                          const std::vector<sycl::event> &dependencies);
-
-// BUFFER version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &inout);
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &inout_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &inout_im);
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<bwd<descriptor_type>, 1> &in,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &out);
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &in_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &in_im,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &out_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &out_im);
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                           const std::vector<sycl::event> &dependencies);
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar<descriptor_type> *inout_re,
-                                           scalar<descriptor_type> *inout_im,
-                                           const std::vector<sycl::event> &dependencies);
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd<descriptor_type> *in,
-                                           fwd<descriptor_type> *out,
-                                           const std::vector<sycl::event> &dependencies);
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar<descriptor_type> *in_re,
-                                           scalar<descriptor_type> *in_im,
-                                           scalar<descriptor_type> *out_re,
-                                           scalar<descriptor_type> *out_im,
-                                           const std::vector<sycl::event> &dependencies);
diff --git a/include/oneapi/mkl/dft/detail/dft_loader.hpp b/include/oneapi/mkl/dft/detail/dft_loader.hpp
deleted file mode 100644
index f84a4e01c..000000000
--- a/include/oneapi/mkl/dft/detail/dft_loader.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_LOADER_HPP_
-#define _ONEMKL_DFT_LOADER_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace detail {
-
-template <precision prec, domain dom>
-class commit_impl;
-
-template <precision prec, domain dom>
-class descriptor;
-
-template <precision prec, domain dom>
-ONEMKL_EXPORT commit_impl<prec, dom>* create_commit(const descriptor<prec, dom>& desc,
-                                                    sycl::queue& queue);
-
-} // namespace detail
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_DFT_LOADER_HPP_
diff --git a/include/oneapi/mkl/dft/detail/external_workspace_helper.hpp b/include/oneapi/mkl/dft/detail/external_workspace_helper.hpp
deleted file mode 100644
index b41dffc4c..000000000
--- a/include/oneapi/mkl/dft/detail/external_workspace_helper.hpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_EXTERNAL_WORKSPACE_HELPER_HPP_
-#define _ONEMKL_DFT_EXTERNAL_WORKSPACE_HELPER_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace detail {
-
-template <precision prec, domain dom>
-class external_workspace_helper {
-public:
-    using commit_impl_t = commit_impl<prec, dom>;
-    using scalar_t = typename commit_impl_t::scalar_type;
-
-private:
-    // Enum to represent whatever the workspace was set as.
-    enum class ext_workspace_type {
-        not_set,
-        usm,
-        buffer,
-    };
-
-    // Is an external workspace required?
-    bool m_ext_workspace_rqd;
-
-    // Set workspace type, with optional workspaces.
-    ext_workspace_type m_workspace_type;
-
-    // Minimum size of workspace in bytes. -1 indicates not set.
-    std::int64_t m_workspace_bytes_rqd;
-
-    // Needed for adding dependencies to the SYCL runtime for backends that don't take
-    // the buffer as an argument.
-    std::optional<sycl::buffer<scalar_t>> m_workspace_buffer;
-
-    // Needed for creating dependencies between forward and backward calls in some backends.
-    sycl::event m_usm_workspace_last_dependency;
-
-public:
-    /** Constructor.
-     *  @param ext_workspace_rqd True if WORKSPACE_PLACEMENT is set to WORKSPACE_EXTERNAL.
-    */
-    constexpr external_workspace_helper(bool ext_workspace_rqd)
-            : m_ext_workspace_rqd(ext_workspace_rqd),
-              m_workspace_type(ext_workspace_type::not_set),
-              m_workspace_bytes_rqd(-1) {}
-
-    /** Get the required workspace bytes for the backend's external workspace.
-     *  @param committed_desc The backend's native descriptor.
-    */
-    std::int64_t get_rqd_workspace_bytes(commit_impl_t& committed_desc) {
-        if (m_workspace_bytes_rqd == -1) {
-            m_workspace_bytes_rqd = committed_desc.get_workspace_external_bytes_impl();
-        }
-        return m_workspace_bytes_rqd;
-    }
-
-    /** Throw according to spec for setting the workspace. USM version.
-     *  @param committed_desc The backend's native descriptor.
-     *  @param usm_workspace A USM allocation for the workspace. Assumed to be sufficeintly large.
-    */
-    void set_workspace_throw(commit_impl_t& committed_desc, scalar_t* usm_workspace) {
-        if (get_rqd_workspace_bytes(committed_desc) > 0 && usm_workspace == nullptr) {
-            throw mkl::invalid_argument("DFT", "set_workspace",
-                                        "Backend expected a non-null workspace pointer.");
-        }
-        m_ext_workspace_rqd = true;
-        m_workspace_type = ext_workspace_type::usm;
-    }
-
-    /** Throw according to spec for setting the workspace. Buffer version.
-     *  @param committed_desc The backend's native descriptor.
-     *  @param buffer_workspace A buffer for the workspace
-    */
-    void set_workspace_throw(commit_impl_t& committed_desc,
-                             sycl::buffer<scalar_t>& buffer_workspace) {
-        if (static_cast<std::size_t>(get_rqd_workspace_bytes(committed_desc)) / sizeof(scalar_t) >
-            buffer_workspace.size()) {
-            throw mkl::invalid_argument("DFT", "set_workspace", "Provided workspace is too small");
-            return;
-        }
-        if (buffer_workspace.is_sub_buffer()) {
-            throw mkl::invalid_argument("DFT", "set_workspace",
-                                        "Cannot use sub-buffers for workspace");
-            return;
-        }
-        m_ext_workspace_rqd = true;
-        m_workspace_type = ext_workspace_type::buffer;
-        m_workspace_buffer = buffer_workspace;
-    }
-
-    template <typename FirstArgT, typename... ArgTs>
-    void compute_call_throw(const char* function_name) const {
-        constexpr bool is_pointer = std::is_pointer_v<std::remove_reference_t<FirstArgT>>;
-        if constexpr (is_pointer) {
-            usm_compute_call_throw(function_name);
-        }
-        else {
-            buffer_compute_call_throw(function_name);
-        }
-    }
-
-    void add_buffer_dependency_if_rqd(const char* function_name, sycl::handler& cgh) {
-        if (m_ext_workspace_rqd) {
-            if (m_workspace_buffer) {
-                if (m_workspace_buffer->size()) {
-                    m_workspace_buffer->template get_access<sycl::access::mode::read_write>(cgh);
-                }
-            }
-            else {
-                throw mkl::invalid_argument(
-                    "DFT", function_name,
-                    "Buffer external workspace must be used with buffer compute calls");
-            }
-        }
-    }
-
-    /** If WORKSPACE_EXTERNAL is set, depend on the last USM workspace event added via set_last_usm_workspace_event.
-     * @param cgh The command group handler to associate the accessor with.
-    */
-    void depend_on_last_usm_workspace_event_if_rqd(sycl::handler& cgh) {
-        if (m_ext_workspace_rqd) {
-            cgh.depends_on(m_usm_workspace_last_dependency);
-        }
-    }
-
-    /** If WORKSPACE_EXTERNAL is set, store the given event internally to allow it to be depended upon by
-     * subsequent calls to depend_on_last_usm_workspace_event.
-     * @param sycl_event The last usage of the USM workspace.
-    */
-    void set_last_usm_workspace_event_if_rqd(sycl::event& sycl_event) {
-        if (m_ext_workspace_rqd) {
-            m_usm_workspace_last_dependency = sycl_event;
-        }
-    }
-
-private:
-    /** When a compute function using USM arguments is called, throw an exception if an incorrect workspace has been set.
-     *  @param function_name The name of the function to use in the error.
-    */
-    void usm_compute_call_throw(const char* function_name) const {
-        if (m_ext_workspace_rqd && m_workspace_type != ext_workspace_type::usm) {
-            throw mkl::invalid_argument(
-                "DFT", function_name, "USM external workspace must be used with usm compute calls");
-        }
-    }
-
-    /** When a compute function using buffer arguments is called, throw an exception if an incorrect workspace has been set.
-     *  @param function_name The name of the function to use in the error.
-    */
-    void buffer_compute_call_throw(const char* function_name) const {
-        if (m_ext_workspace_rqd && m_workspace_type != ext_workspace_type::buffer) {
-            throw mkl::invalid_argument(
-                "DFT", function_name,
-                "Buffer external workspace must be used with buffer compute calls");
-        }
-    }
-};
-
-} // namespace detail
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_DFT_EXTERNAL_WORKSPACE_HELPER_HPP_
diff --git a/include/oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp b/include/oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp
deleted file mode 100644
index 00d4dd47b..000000000
--- a/include/oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_MKLCPU_HPP_
-#define _ONEMKL_DFT_MKLCPU_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-namespace oneapi::mkl::dft::mklcpu {
-
-#include "oneapi/mkl/dft/detail/dft_ct.hxx"
-
-} // namespace oneapi::mkl::dft::mklcpu
-
-#endif // _ONEMKL_DFT_MKLCPU_HPP_
diff --git a/include/oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp b/include/oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp
deleted file mode 100644
index 56a55a9f7..000000000
--- a/include/oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_MKLGPU_HPP_
-#define _ONEMKL_DFT_MKLGPU_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-namespace oneapi::mkl::dft::mklgpu {
-
-#include "oneapi/mkl/dft/detail/dft_ct.hxx"
-
-} // namespace oneapi::mkl::dft::mklgpu
-
-#endif // _ONEMKL_DFT_MKLGPU_HPP_
diff --git a/include/oneapi/mkl/dft/detail/portfft/onemkl_dft_portfft.hpp b/include/oneapi/mkl/dft/detail/portfft/onemkl_dft_portfft.hpp
deleted file mode 100644
index 4617e8a5c..000000000
--- a/include/oneapi/mkl/dft/detail/portfft/onemkl_dft_portfft.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_PORTFFT_HPP_
-#define _ONEMKL_DFT_PORTFFT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-namespace oneapi::mkl::dft::portfft {
-
-// We don't need the forward declarations of compute_xxxward templates (just need the create_commit template), but it doesn't hurt and keeps things simple.
-#include "oneapi/mkl/dft/detail/dft_ct.hxx"
-
-} // namespace oneapi::mkl::dft::portfft
-
-#endif // _ONEMKL_DFT_PORTFFT_HPP_
diff --git a/include/oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp b/include/oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp
deleted file mode 100644
index fe3305680..000000000
--- a/include/oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_ROCFFT_HPP_
-#define _ONEMKL_DFT_ROCFFT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-namespace oneapi::mkl::dft::rocfft {
-
-#include "oneapi/mkl/dft/detail/dft_ct.hxx"
-
-} // namespace oneapi::mkl::dft::rocfft
-
-#endif // _ONEMKL_DFT_ROCFFT_HPP_
diff --git a/include/oneapi/mkl/dft/detail/types_impl.hpp b/include/oneapi/mkl/dft/detail/types_impl.hpp
deleted file mode 100644
index 60eb922ab..000000000
--- a/include/oneapi/mkl/dft/detail/types_impl.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DETAIL_TYPES_IMPL_HPP_
-#define _ONEMKL_DETAIL_TYPES_IMPL_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <cstdint>
-#include <vector>
-#include <type_traits>
-#include <complex>
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace detail {
-
-typedef long DFT_ERROR;
-
-enum class precision { SINGLE, DOUBLE };
-
-template <precision prec>
-struct precision_t {
-    using real_t = std::conditional_t<prec == precision::SINGLE, float, double>;
-};
-
-enum class domain { REAL, COMPLEX };
-
-// Forward declarations
-template <precision prec, domain dom>
-class commit_impl;
-
-template <precision prec, domain dom>
-class descriptor;
-
-template <class... T>
-constexpr bool always_false = false;
-
-template <typename descriptor_type>
-struct descriptor_info {
-    static_assert(always_false<descriptor_type>, "Not a valid descriptor type");
-};
-
-template <>
-struct descriptor_info<descriptor<precision::SINGLE, domain::REAL>> {
-    using scalar_type = float;
-    using forward_type = float;
-    using backward_type = std::complex<float>;
-};
-template <>
-struct descriptor_info<descriptor<precision::SINGLE, domain::COMPLEX>> {
-    using scalar_type = float;
-    using forward_type = std::complex<float>;
-    using backward_type = std::complex<float>;
-};
-template <>
-struct descriptor_info<descriptor<precision::DOUBLE, domain::REAL>> {
-    using scalar_type = double;
-    using forward_type = double;
-    using backward_type = std::complex<double>;
-};
-template <>
-struct descriptor_info<descriptor<precision::DOUBLE, domain::COMPLEX>> {
-    using scalar_type = double;
-    using forward_type = std::complex<double>;
-    using backward_type = std::complex<double>;
-};
-
-// Get the scalar type associated with a descriptor.
-template <class descriptor_t>
-using descriptor_scalar_t = typename descriptor_info<descriptor_t>::scalar_type;
-
-template <typename T>
-constexpr bool is_complex_dft = false;
-template <precision Prec>
-constexpr bool is_complex_dft<descriptor<Prec, domain::COMPLEX>> = true;
-
-template <typename T>
-constexpr bool is_complex = false;
-template <typename T>
-constexpr bool is_complex<std::complex<T>> = true;
-
-template <typename T, typename... Ts>
-using is_one_of = typename std::bool_constant<(std::is_same_v<T, Ts> || ...)>;
-
-template <typename descriptor_type, typename T>
-using valid_compute_arg = typename std::bool_constant<
-    (std::is_same_v<descriptor_scalar_t<descriptor_type>, float> &&
-     is_one_of<T, float, sycl::float2, sycl::float4, std::complex<float>>::value) ||
-    (std::is_same_v<descriptor_scalar_t<descriptor_type>, double> &&
-     is_one_of<T, double, sycl::double2, sycl::double4, std::complex<double>>::value)>;
-
-template <class descriptor_t, typename data_t>
-constexpr bool valid_ip_realreal_impl =
-    is_complex_dft<descriptor_t>&& std::is_same_v<descriptor_scalar_t<descriptor_t>, data_t>;
-
-// compute the range of a reinterpreted buffer
-template <typename In, typename Out>
-std::size_t reinterpret_range(std::size_t size) {
-    if constexpr (sizeof(In) >= sizeof(Out)) {
-        static_assert(sizeof(In) % sizeof(Out) == 0);
-        return size * (sizeof(In) / sizeof(Out));
-    }
-    else {
-        static_assert(sizeof(Out) % sizeof(In) == 0);
-        if (size % (sizeof(Out) / sizeof(In))) {
-            throw std::runtime_error("buffer cannot be evenly divived into the expected type");
-        }
-        return size / (sizeof(Out) / sizeof(In));
-    }
-}
-
-enum class config_param {
-    FORWARD_DOMAIN,
-    DIMENSION,
-    LENGTHS,
-    PRECISION,
-
-    FORWARD_SCALE,
-    BACKWARD_SCALE,
-
-    NUMBER_OF_TRANSFORMS,
-
-    COMPLEX_STORAGE,
-    REAL_STORAGE,
-    CONJUGATE_EVEN_STORAGE,
-
-    PLACEMENT,
-
-    INPUT_STRIDES [[deprecated("Use FWD/BWD_STRIDES")]],
-    OUTPUT_STRIDES [[deprecated("Use FWD/BWD_STRIDES")]],
-
-    FWD_DISTANCE,
-    BWD_DISTANCE,
-
-    WORKSPACE,
-    WORKSPACE_PLACEMENT,
-    WORKSPACE_EXTERNAL_BYTES,
-    ORDERING,
-    TRANSPOSE,
-    PACKED_FORMAT,
-    COMMIT_STATUS,
-
-    FWD_STRIDES,
-    BWD_STRIDES
-};
-
-enum class config_value {
-    // for config_param::COMMIT_STATUS
-    COMMITTED,
-    UNCOMMITTED,
-
-    // for config_param::COMPLEX_STORAGE,
-    //     config_param::REAL_STORAGE and
-    //     config_param::CONJUGATE_EVEN_STORAGE
-    COMPLEX_COMPLEX,
-    REAL_COMPLEX,
-    REAL_REAL,
-
-    // for config_param::PLACEMENT
-    INPLACE,
-    NOT_INPLACE,
-
-    // for config_param::ORDERING
-    ORDERED,
-    BACKWARD_SCRAMBLED,
-
-    // Allow/avoid certain usages
-    ALLOW,
-    AVOID,
-    NONE,
-
-    // for config_param::PACKED_FORMAT for storing conjugate-even finite sequence in real containers
-    CCE_FORMAT,
-
-    // For config_param::WORKSPACE_PLACEMENT
-    WORKSPACE_AUTOMATIC,
-    WORKSPACE_EXTERNAL
-};
-
-template <precision prec, domain dom>
-class dft_values {
-private:
-    using real_t = typename precision_t<prec>::real_t;
-
-public:
-    std::vector<std::int64_t> input_strides;
-    std::vector<std::int64_t> output_strides;
-    std::vector<std::int64_t> fwd_strides;
-    std::vector<std::int64_t> bwd_strides;
-    real_t bwd_scale;
-    real_t fwd_scale;
-    std::int64_t number_of_transforms;
-    std::int64_t fwd_dist;
-    std::int64_t bwd_dist;
-    config_value placement;
-    config_value complex_storage;
-    config_value real_storage;
-    config_value conj_even_storage;
-    config_value workspace;
-    config_value workspace_placement;
-    config_value ordering;
-    bool transpose;
-    config_value packed_format;
-    std::vector<std::int64_t> dimensions;
-};
-
-} // namespace detail
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_DETAIL_TYPES_IMPL_HPP_
diff --git a/include/oneapi/mkl/dft/forward.hpp b/include/oneapi/mkl/dft/forward.hpp
deleted file mode 100644
index e43c39ce0..000000000
--- a/include/oneapi/mkl/dft/forward.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_FORWARD_HPP_
-#define _ONEMKL_DFT_FORWARD_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "detail/types_impl.hpp"
-
-namespace oneapi::mkl::dft {
-
-//Buffer version
-
-//In-place transform
-template <typename descriptor_type, typename data_type>
-void compute_forward(descriptor_type &desc, sycl::buffer<data_type, 1> &inout) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    auto type_corrected_inout = inout.template reinterpret<fwd_type, 1>(
-        detail::reinterpret_range<data_type, fwd_type>(inout.size()));
-    get_commit(desc)->forward_ip_cc(desc, type_corrected_inout);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename data_type,
-          std::enable_if_t<detail::valid_ip_realreal_impl<descriptor_type, data_type>, bool> = true>
-void compute_forward(descriptor_type &desc, sycl::buffer<data_type, 1> &inout_re,
-                     sycl::buffer<data_type, 1> &inout_im) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    auto type_corrected_inout_re = inout_re.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<data_type, scalar_type>(inout_re.size()));
-    auto type_corrected_inout_im = inout_im.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<data_type, scalar_type>(inout_im.size()));
-    get_commit(desc)->forward_ip_rr(desc, type_corrected_inout_re, type_corrected_inout_im);
-}
-
-//Out-of-place transform
-template <typename descriptor_type, typename input_type, typename output_type>
-void compute_forward(descriptor_type &desc, sycl::buffer<input_type, 1> &in,
-                     sycl::buffer<output_type, 1> &out) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    using bwd_type = typename detail::descriptor_info<descriptor_type>::backward_type;
-    auto type_corrected_in = in.template reinterpret<fwd_type, 1>(
-        detail::reinterpret_range<input_type, fwd_type>(in.size()));
-    auto type_corrected_out = out.template reinterpret<bwd_type, 1>(
-        detail::reinterpret_range<output_type, bwd_type>(out.size()));
-    get_commit(desc)->forward_op_cc(desc, type_corrected_in, type_corrected_out);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename input_type, typename output_type>
-void compute_forward(descriptor_type &desc, sycl::buffer<input_type, 1> &in_re,
-                     sycl::buffer<input_type, 1> &in_im, sycl::buffer<output_type, 1> &out_re,
-                     sycl::buffer<output_type, 1> &out_im) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    auto type_corrected_in_re = in_re.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<input_type, scalar_type>(in_re.size()));
-    auto type_corrected_in_im = in_im.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<input_type, scalar_type>(in_im.size()));
-    auto type_corrected_out_re = out_re.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<output_type, scalar_type>(out_re.size()));
-    auto type_corrected_out_im = out_im.template reinterpret<scalar_type, 1>(
-        detail::reinterpret_range<output_type, scalar_type>(out_im.size()));
-    get_commit(desc)->forward_op_rr(desc, type_corrected_in_re, type_corrected_in_im,
-                                    type_corrected_out_re, type_corrected_out_im);
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type, typename data_type>
-sycl::event compute_forward(descriptor_type &desc, data_type *inout,
-                            const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    return get_commit(desc)->forward_ip_cc(desc, reinterpret_cast<fwd_type *>(inout), dependencies);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename data_type,
-          std::enable_if_t<detail::valid_ip_realreal_impl<descriptor_type, data_type>, bool> = true>
-sycl::event compute_forward(descriptor_type &desc, data_type *inout_re, data_type *inout_im,
-                            const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, data_type>::value,
-                  "unexpected type for data_type");
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    return get_commit(desc)->forward_ip_rr(desc, reinterpret_cast<scalar_type *>(inout_re),
-                                           reinterpret_cast<scalar_type *>(inout_im), dependencies);
-}
-
-//Out-of-place transform
-template <typename descriptor_type, typename input_type, typename output_type>
-sycl::event compute_forward(descriptor_type &desc, input_type *in, output_type *out,
-                            const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-    using fwd_type = typename detail::descriptor_info<descriptor_type>::forward_type;
-    using bwd_type = typename detail::descriptor_info<descriptor_type>::backward_type;
-    return get_commit(desc)->forward_op_cc(desc, reinterpret_cast<fwd_type *>(in),
-                                           reinterpret_cast<bwd_type *>(out), dependencies);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type, typename input_type, typename output_type>
-sycl::event compute_forward(descriptor_type &desc, input_type *in_re, input_type *in_im,
-                            output_type *out_re, output_type *out_im,
-                            const std::vector<sycl::event> &dependencies = {}) {
-    static_assert(detail::valid_compute_arg<descriptor_type, input_type>::value,
-                  "unexpected type for input_type");
-    static_assert(detail::valid_compute_arg<descriptor_type, output_type>::value,
-                  "unexpected type for output_type");
-
-    using scalar_type = typename detail::descriptor_info<descriptor_type>::scalar_type;
-    return get_commit(desc)->forward_op_rr(desc, reinterpret_cast<scalar_type *>(in_re),
-                                           reinterpret_cast<scalar_type *>(in_im),
-                                           reinterpret_cast<scalar_type *>(out_re),
-                                           reinterpret_cast<scalar_type *>(out_im), dependencies);
-}
-} // namespace oneapi::mkl::dft
-
-#endif // _ONEMKL_DFT_FORWARD_HPP_
diff --git a/include/oneapi/mkl/dft/types.hpp b/include/oneapi/mkl/dft/types.hpp
deleted file mode 100644
index dfbcd3c28..000000000
--- a/include/oneapi/mkl/dft/types.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_TYPES_HPP_
-#define _ONEMKL_DFT_TYPES_HPP_
-
-#include "detail/types_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-
-/** The detail namespace is required since the MKLGPU backend uses identical 
-names and function signatures in many places. **/
-
-using precision = detail::precision;
-using domain = detail::domain;
-using config_param = detail::config_param;
-using config_value = detail::config_value;
-using DFT_ERROR = detail::DFT_ERROR;
-
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_TYPES_HPP_
diff --git a/include/oneapi/mkl/exceptions.hpp b/include/oneapi/mkl/exceptions.hpp
deleted file mode 100644
index 244c8c61d..000000000
--- a/include/oneapi/mkl/exceptions.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_EXCEPTIONS_HPP_
-#define _ONEMKL_EXCEPTIONS_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <exception>
-#include <string>
-
-#include "oneapi/mkl/types.hpp"
-
-// These are oneAPI oneMKL Specification exceptions
-
-namespace oneapi {
-namespace mkl {
-class exception : public std::exception {
-    std::string msg_;
-
-public:
-    exception(const std::string &domain, const std::string &function, const std::string &info = "")
-            : std::exception() {
-        msg_ = std::string("oneMKL: ") + domain +
-               ((domain.length() != 0 && function.length() != 0) ? "/" : "") + function +
-               ((info.length() != 0)
-                    ? (((domain.length() + function.length() != 0) ? ": " : "") + info)
-                    : "");
-    }
-
-    const char *what() const noexcept override {
-        return msg_.c_str();
-    }
-};
-
-class unsupported_device : public oneapi::mkl::exception {
-public:
-    unsupported_device(const std::string &domain, const std::string &function,
-                       const sycl::device &device)
-            : oneapi::mkl::exception(
-                  domain, function,
-                  device.get_info<sycl::info::device::name>() + " is not supported") {}
-};
-
-class host_bad_alloc : public oneapi::mkl::exception {
-public:
-    host_bad_alloc(const std::string &domain, const std::string &function)
-            : oneapi::mkl::exception(domain, function, "cannot allocate memory on host") {}
-};
-
-class device_bad_alloc : public oneapi::mkl::exception {
-public:
-    device_bad_alloc(const std::string &domain, const std::string &function,
-                     const sycl::device &device)
-            : oneapi::mkl::exception(
-                  domain, function,
-                  "cannot allocate memory on " + device.get_info<sycl::info::device::name>()) {}
-};
-
-class unimplemented : public oneapi::mkl::exception {
-public:
-    unimplemented(const std::string &domain, const std::string &function,
-                  const std::string &info = "")
-            : oneapi::mkl::exception(domain, function, "function is not implemented " + info) {}
-};
-
-class invalid_argument : public oneapi::mkl::exception {
-public:
-    invalid_argument(const std::string &domain, const std::string &function,
-                     const std::string &info = "")
-            : oneapi::mkl::exception(domain, function, "invalid argument " + info) {}
-};
-
-class uninitialized : public oneapi::mkl::exception {
-public:
-    uninitialized(const std::string &domain, const std::string &function,
-                  const std::string &info = "")
-            : oneapi::mkl::exception(domain, function,
-                                     "handle/descriptor is not initialized " + info) {}
-};
-
-class computation_error : public oneapi::mkl::exception {
-public:
-    computation_error(const std::string &domain, const std::string &function,
-                      const std::string &info = "")
-            : oneapi::mkl::exception(
-                  domain, function,
-                  "computation error" + ((info.length() != 0) ? (": " + info) : "")) {}
-};
-
-class batch_error : public oneapi::mkl::exception {
-public:
-    batch_error(const std::string &domain, const std::string &function,
-                const std::string &info = "")
-            : oneapi::mkl::exception(domain, function,
-                                     "batch error" + ((info.length() != 0) ? (": " + info) : "")) {}
-};
-
-class library_not_found : public oneapi::mkl::exception {
-public:
-    library_not_found(const std::string &domain, const std::string &function,
-                      const std::string &info = "")
-            : oneapi::mkl::exception(
-                  domain, function,
-                  "library not found" + ((info.length() != 0) ? (": " + info) : "")) {}
-};
-
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_EXCEPTIONS_HPP_
diff --git a/include/oneapi/mkl/lapack.hpp b/include/oneapi/mkl/lapack.hpp
deleted file mode 100644
index 2340c16ba..000000000
--- a/include/oneapi/mkl/lapack.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*******************************************************************************
-* Copyright 2021-2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include "oneapi/mkl/detail/config.hpp"
-
-#ifdef ENABLE_MKLCPU_BACKEND
-#include "oneapi/mkl/lapack/detail/mklcpu/lapack_ct.hpp"
-#endif
-#ifdef ENABLE_MKLGPU_BACKEND
-#include "oneapi/mkl/lapack/detail/mklgpu/lapack_ct.hpp"
-#endif
-#ifdef ENABLE_CUSOLVER_BACKEND
-#include "oneapi/mkl/lapack/detail/cusolver/lapack_ct.hpp"
-#endif
-#ifdef ENABLE_ROCSOLVER_BACKEND
-#include "oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hpp"
-#endif
-
-#include "oneapi/mkl/lapack/detail/lapack_rt.hpp"
diff --git a/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hpp b/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hpp
deleted file mode 100644
index 1be0e5895..000000000
--- a/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _DETAIL_CUSOLVER_LAPACK_CT_HPP_
-#define _DETAIL_CUSOLVER_LAPACK_CT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-
-#define LAPACK_BACKEND cusolver
-#include "lapack_ct.hxx"
-#undef LAPACK_BACKEND
-
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_DETAIL_CUSOLVER_LAPACK_CT_HPP_
diff --git a/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx b/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx
deleted file mode 100644
index cd1d76765..000000000
--- a/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx
+++ /dev/null
@@ -1,2627 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// Buffer APIs
-
-static inline void gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tauq, sycl::buffer<double> &taup,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tauq,
-                         sycl::buffer<float> &taup, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::cusolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::cusolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::cusolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::cusolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::cusolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &s,
-                         sycl::buffer<double> &u, std::int64_t ldu, sycl::buffer<double> &vt,
-                         std::int64_t ldvt, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::cusolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &s,
-                         sycl::buffer<float> &u, std::int64_t ldu, sycl::buffer<float> &vt,
-                         std::int64_t ldvt, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::cusolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<std::complex<float>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::cusolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<std::complex<double>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<double>> &vt,
-                         std::int64_t ldvt, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void heevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void heevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void hegvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void hegvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void hetrd(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void hetrd(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void hetrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void hetrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgbr(backend_selector<backend::cusolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void orgbr(backend_selector<backend::cusolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void orgqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgtr(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgtr(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ormtr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormtr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormrq(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void ormrq(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void ormqr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void ormqr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void syevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void syevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sygvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void sygvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void sytrd(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void sytrd(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void ungbr(backend_selector<backend::cusolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void ungbr(backend_selector<backend::cusolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void ungqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ungqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ungtr(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ungtr(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void unmrq(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmrq(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmqr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmqr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmtr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void unmtr(backend_selector<backend::cusolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::cusolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::cusolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::cusolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::cusolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void orgqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void orgqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::cusolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void ungqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void ungqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-
-// USM APIs
-
-static inline sycl::event gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *d,
-                                float *e, std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *d, double *e,
-                                double *tauq, double *taup, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *d, float *e,
-                                float *tauq, float *taup, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *d, double *e, std::complex<double> *tauq,
-                                std::complex<double> *taup, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *s, double *u,
-                                std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *s, float *u,
-                                std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *s, std::complex<double> *u, std::int64_t ldu,
-                                std::complex<double> *vt, std::int64_t ldvt,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event heevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event heevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hegvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event hegvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event hetrd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, float *d, float *e, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, double *d, double *e, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormrq(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormrq(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormqr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormqr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event syevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *w, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event syevd(backend_selector<backend::cusolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *w, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sygvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb, double *w,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event sygvd(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *b, std::int64_t ldb, float *w,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event sytrd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *d, double *e, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrd(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *d, float *e, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ungbr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungbr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmrq(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmrq(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmqr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmqr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmtr(backend_selector<backend::cusolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                      float *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                      double *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::cusolver> selector, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(
-    backend_selector<backend::cusolver> selector, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<float> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size, std::complex<float> *scratchpad,
-    std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(
-    backend_selector<backend::cusolver> selector, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<double> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size, std::complex<double> *scratchpad,
-    std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(
-    backend_selector<backend::cusolver> selector, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-    std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-    const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, float **a,
-                                      std::int64_t *lda, float **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, double **a,
-                                      std::int64_t *lda, double **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      float **a, std::int64_t *lda, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      double **a, std::int64_t *lda, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::cusolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<float> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<double> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::cusolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-
-// SCRATCHPAD APIs
-template <typename fp_type>
-std::int64_t gebrd_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::gebrd_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t gerqf_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::gerqf_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t geqrf_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::geqrf_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t gesvd_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldu, std::int64_t ldvt) {
-    return oneapi::mkl::lapack::cusolver::gesvd_scratchpad_size<fp_type>(
-        selector.get_queue(), jobu, jobvt, m, n, lda, ldu, ldvt);
-}
-template <typename fp_type>
-std::int64_t getrf_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::getrf_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t getri_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::getri_scratchpad_size<fp_type>(selector.get_queue(), n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t getrs_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::cusolver::getrs_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                         trans, n, nrhs, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t heevd_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::heevd_scratchpad_size<fp_type>(selector.get_queue(), jobz,
-                                                                         uplo, n, lda);
-}
-template <typename fp_type>
-std::int64_t hegvd_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::cusolver::hegvd_scratchpad_size<fp_type>(
-        selector.get_queue(), itype, jobz, uplo, n, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t hetrd_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::hetrd_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t hetrf_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::hetrf_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t orgbr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::orgbr_scratchpad_size<fp_type>(selector.get_queue(), vect,
-                                                                         m, n, k, lda);
-}
-template <typename fp_type>
-std::int64_t orgtr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::orgtr_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t orgqr_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::orgqr_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         k, lda);
-}
-template <typename fp_type>
-std::int64_t ormrq_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::cusolver::ormrq_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t ormqr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::cusolver::ormqr_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t ormtr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc) {
-    return oneapi::mkl::lapack::cusolver::ormtr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, uplo, trans, m, n, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t potrf_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::potrf_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t potrs_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::cusolver::potrs_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, nrhs, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t potri_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::potri_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t sytrf_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::sytrf_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t syevd_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::syevd_scratchpad_size<fp_type>(selector.get_queue(), jobz,
-                                                                         uplo, n, lda);
-}
-template <typename fp_type>
-std::int64_t sygvd_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t itype,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::cusolver::sygvd_scratchpad_size<fp_type>(
-        selector.get_queue(), itype, jobz, uplo, n, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t sytrd_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::sytrd_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t trtrs_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::cusolver::trtrs_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, trans, diag, n, nrhs, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t ungbr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::ungbr_scratchpad_size<fp_type>(selector.get_queue(), vect,
-                                                                         m, n, k, lda);
-}
-template <typename fp_type>
-std::int64_t ungqr_scratchpad_size(backend_selector<backend::cusolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::ungqr_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         k, lda);
-}
-template <typename fp_type>
-std::int64_t ungtr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::cusolver::ungtr_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t unmrq_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::cusolver::unmrq_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t unmqr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::cusolver::unmqr_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t unmtr_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc) {
-    return oneapi::mkl::lapack::cusolver::unmtr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, uplo, trans, m, n, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t getrf_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type>
-std::int64_t getri_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type>
-std::int64_t getrs_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b,
-        batch_size);
-}
-template <typename fp_type>
-std::int64_t geqrf_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_tau,
-                                         std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type>
-std::int64_t potrf_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, lda, stride_a, batch_size);
-}
-template <typename fp_type>
-std::int64_t potrs_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                         std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <typename fp_type>
-std::int64_t orgqr_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::orgqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type>
-std::int64_t ungqr_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::cusolver::ungqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type>
-std::int64_t getrf_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t getri_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t getrs_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), trans, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t geqrf_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t orgqr_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::orgqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t potrf_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t potrs_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t ungqr_batch_scratchpad_size(backend_selector<backend::cusolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::cusolver::ungqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, group_count, group_sizes);
-}
diff --git a/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp b/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp
deleted file mode 100644
index 6d31a05af..000000000
--- a/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ONEMKL_LAPACK_CUSOLVER_HPP_
-#define _ONEMKL_LAPACK_CUSOLVER_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-#include <string>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-#include "onemkl_lapack_cusolver.hxx"
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_LAPACK_CUSOLVER_HPP_
diff --git a/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx b/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx
deleted file mode 100644
index ffa9c3007..000000000
--- a/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx
+++ /dev/null
@@ -1,1830 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// Buffer APIs
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tauq,
-                         sycl::buffer<double> &taup, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<float> &tauq, sycl::buffer<float> &taup,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<double> &u, std::int64_t ldu,
-                         sycl::buffer<double> &vt, std::int64_t ldvt,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<float> &u, std::int64_t ldu,
-                         sycl::buffer<float> &vt, std::int64_t ldvt,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<float> &s,
-                         sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<double> &s,
-                         sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &d,
-                         sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-// USM APIs
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *d, double *e, double *tauq, double *taup,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *d, float *e, float *tauq, float *taup,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tauq, std::complex<double> *taup,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu,
-                                double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt,
-                                std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *s,
-                                std::complex<double> *u, std::int64_t ldu, std::complex<double> *vt,
-                                std::int64_t ldvt, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *w,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, double *a, std::int64_t lda, double *w,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, float *a, std::int64_t lda, float *w,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *w, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *w, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *d, double *e, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *d, float *e, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, float *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, double *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      float *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      double *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-// SCRATCHPAD APIs
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                                 oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                 std::int64_t n, std::int64_t lda, std::int64_t ldu,
-                                                 std::int64_t ldvt);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                 oneapi::mkl::uplo uplo, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype,
-                                                 oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                 std::int64_t m, std::int64_t n, std::int64_t k,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t k, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::uplo uplo,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                 oneapi::mkl::uplo uplo, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype,
-                                                 oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 oneapi::mkl::transpose trans,
-                                                 oneapi::mkl::diag diag, std::int64_t n,
-                                                 std::int64_t nrhs, std::int64_t lda,
-                                                 std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                 std::int64_t m, std::int64_t n, std::int64_t k,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t k, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::uplo uplo,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t lda,
-                                                       std::int64_t stride_a,
-                                                       std::int64_t stride_ipiv,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t stride_ipiv,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t lda,
-                                                       std::int64_t stride_a,
-                                                       std::int64_t stride_tau,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                       std::int64_t n, std::int64_t lda,
-                                                       std::int64_t stride_a,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                       std::int64_t n, std::int64_t nrhs,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t ldb, std::int64_t stride_b,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t k,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t stride_tau,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t k,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t stride_tau,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *lda,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n,
-                                                       std::int64_t *lda, std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *lda,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *k,
-                                                       std::int64_t *lda, std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                       std::int64_t *n, std::int64_t *lda,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                       std::int64_t *n, std::int64_t *nrhs,
-                                                       std::int64_t *lda, std::int64_t *ldb,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *k,
-                                                       std::int64_t *lda, std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
diff --git a/include/oneapi/mkl/lapack/detail/lapack_loader.hpp b/include/oneapi/mkl/lapack/detail/lapack_loader.hpp
deleted file mode 100644
index 2bb49364e..000000000
--- a/include/oneapi/mkl/lapack/detail/lapack_loader.hpp
+++ /dev/null
@@ -1,2382 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace detail {
-
-ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tauq, sycl::buffer<double> &taup,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tauq,
-                         sycl::buffer<float> &taup, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &s,
-                         sycl::buffer<double> &u, std::int64_t ldu, sycl::buffer<double> &vt,
-                         std::int64_t ldvt, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &s,
-                         sycl::buffer<float> &u, std::int64_t ldu, sycl::buffer<float> &vt,
-                         std::int64_t ldvt, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<std::complex<float>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<std::complex<double>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<double>> &vt,
-                         std::int64_t ldvt, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *d,
-                                float *e, std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *d, double *e,
-                                double *tauq, double *taup, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *d, float *e,
-                                float *tauq, float *taup, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *d, double *e, std::complex<double> *tauq,
-                                std::complex<double> *taup, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *s, double *u,
-                                std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *s, float *u,
-                                std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *s, std::complex<double> *u, std::int64_t ldu,
-                                std::complex<double> *vt, std::int64_t ldvt,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *w,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *w,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, float *d, float *e, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, double *d, double *e, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syevd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *w, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syevd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *w, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb, double *w,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *b, std::int64_t ldb, float *w,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *d, double *e, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *d, float *e, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<float> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<double> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, float **a,
-                                      std::int64_t *lda, float **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, double **a,
-                                      std::int64_t *lda, double **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *n, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *n, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<float> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size, std::complex<float> *scratchpad,
-    std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<double> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size, std::complex<double> *scratchpad,
-    std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-    std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-    const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, float *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, double *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k, float **a,
-                                      std::int64_t *lda, float **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k, double **a,
-                                      std::int64_t *lda, double **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      float **a, std::int64_t *lda, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      double **a, std::int64_t *lda, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t m, std::int64_t n, std::int64_t k,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::complex<double> *a, std::int64_t lda, std::int64_t stride_a, std::complex<double> *tau,
-    std::int64_t stride_tau, std::int64_t batch_size, std::complex<double> *scratchpad,
-    std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldu, std::int64_t ldvt);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldu, std::int64_t ldvt);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t heevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hegvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_tau,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                         std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue,
-                                                        oneapi::mkl::jobsvd jobu,
-                                                        oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda,
-                                                        std::int64_t ldu, std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue,
-                                                         oneapi::mkl::jobsvd jobu,
-                                                         oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda,
-                                                         std::int64_t ldu, std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-    oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu,
-    std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-    oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu,
-    std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t n,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t n,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t heevd_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-    std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t heevd_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-    std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-    oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-    oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue,
-                                                        oneapi::mkl::generate vect, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue,
-                                                         oneapi::mkl::generate vect, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t syevd_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t syevd_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t itype,
-                                                        oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t itype,
-                                                         oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        oneapi::mkl::diag diag, std::int64_t n,
-                                                        std::int64_t nrhs, std::int64_t lda,
-                                                        std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         oneapi::mkl::diag diag, std::int64_t n,
-                                                         std::int64_t nrhs, std::int64_t lda,
-                                                         std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-    oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-    oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                                      sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                                       sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv,
-    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv,
-    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv,
-    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv,
-    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                              sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                               sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                              sycl::queue &queue, std::int64_t *n,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                               sycl::queue &queue, std::int64_t *n,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey,
-                                                              sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey,
-                                                               sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<float>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<double>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes);
-} //namespace detail
-} //namespace lapack
-} //namespace mkl
-} //namespace oneapi
diff --git a/include/oneapi/mkl/lapack/detail/lapack_rt.hpp b/include/oneapi/mkl/lapack/detail/lapack_rt.hpp
deleted file mode 100644
index a96efe8d1..000000000
--- a/include/oneapi/mkl/lapack/detail/lapack_rt.hpp
+++ /dev/null
@@ -1,2392 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/lapack/exceptions.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-#include "oneapi/mkl/lapack/detail/lapack_loader.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-
-static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                  scratchpad_size);
-}
-static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tauq,
-                         sycl::buffer<double> &taup, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                  scratchpad_size);
-}
-static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<float> &tauq, sycl::buffer<float> &taup,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                  scratchpad_size);
-}
-static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                  scratchpad_size);
-}
-static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<double> &u, std::int64_t ldu,
-                         sycl::buffer<double> &vt, std::int64_t ldvt,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                  scratchpad, scratchpad_size);
-}
-static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<float> &u, std::int64_t ldu,
-                         sycl::buffer<float> &vt, std::int64_t ldvt,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                  scratchpad, scratchpad_size);
-}
-static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<float> &s,
-                         sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                  scratchpad, scratchpad_size);
-}
-static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<double> &s,
-                         sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                  scratchpad, scratchpad_size);
-}
-static inline void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                  scratchpad_size);
-}
-static inline void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                  scratchpad_size);
-}
-static inline void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &d,
-                         sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                  scratchpad, scratchpad_size);
-}
-static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                  scratchpad, scratchpad_size);
-}
-static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                  scratchpad, scratchpad_size);
-}
-static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                  scratchpad, scratchpad_size);
-}
-static inline void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                  scratchpad_size);
-}
-static inline void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-static inline void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::unmqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::unmqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                  scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                        stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                        stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                        stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                        stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                        scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                        scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                        scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                        scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                        stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                        stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                        stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                        stride_b, batch_size, scratchpad, scratchpad_size);
-}
-static inline void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                        batch_size, scratchpad, scratchpad_size);
-}
-static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *d, double *e, double *tauq, double *taup,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *d, float *e, float *tauq, float *taup,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tauq, std::complex<double> *taup,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu,
-                                double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                         ldvt, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt,
-                                std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                         ldvt, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                         ldvt, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *s,
-                                std::complex<double> *u, std::int64_t ldu, std::complex<double> *vt,
-                                std::int64_t ldvt, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                         ldvt, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *w,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                         dependencies);
-}
-static inline sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, double *a, std::int64_t lda, double *w,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, float *a, std::int64_t lda, float *w,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *w, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *w, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *d, double *e, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *d, float *e, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad,
-                         scratchpad_size, dependencies);
-}
-static inline sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::unmqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::unmqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                         scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, float *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, double *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv,
-                               stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv,
-                               stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv,
-                               stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv,
-                               stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                               batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                               stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                               scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                               stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                               scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                               stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                               scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                               stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                               scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau,
-                               stride_tau, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau,
-                               stride_tau, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      float *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                               scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      double *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                               scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                               scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size,
-                               scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                               stride_b, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                               stride_b, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                               stride_b, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                               stride_b, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb,
-                               group_count, group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau,
-                               stride_tau, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau,
-                               stride_tau, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count,
-                               group_sizes, scratchpad, scratchpad_size, dependencies);
-}
-
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::gebrd_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::gerqf_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::geqrf_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) {
-    return detail::gesvd_scratchpad_size<fp_type>(get_device_id(queue), queue, jobu, jobvt, m, n,
-                                                  lda, ldu, ldvt);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) {
-    return detail::gesvd_scratchpad_size<fp_type>(get_device_id(queue), queue, jobu, jobvt, m, n,
-                                                  lda, ldu, ldvt);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::getrf_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda) {
-    return detail::getri_scratchpad_size<fp_type>(get_device_id(queue), queue, n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                   std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) {
-    return detail::getrs_scratchpad_size<fp_type>(get_device_id(queue), queue, trans, n, nrhs, lda,
-                                                  ldb);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return detail::heevd_scratchpad_size<fp_type>(get_device_id(queue), queue, jobz, uplo, n, lda);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb) {
-    return detail::hegvd_scratchpad_size<fp_type>(get_device_id(queue), queue, itype, jobz, uplo, n,
-                                                  lda, ldb);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::hetrd_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::hetrf_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return detail::orgbr_scratchpad_size<fp_type>(get_device_id(queue), queue, vect, m, n, k, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::orgtr_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return detail::orgqr_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, k, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc) {
-    return detail::ormrq_scratchpad_size<fp_type>(get_device_id(queue), queue, side, trans, m, n, k,
-                                                  lda, ldc);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc) {
-    return detail::ormqr_scratchpad_size<fp_type>(get_device_id(queue), queue, side, trans, m, n, k,
-                                                  lda, ldc);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return detail::ormtr_scratchpad_size<fp_type>(get_device_id(queue), queue, side, uplo, trans, m,
-                                                  n, lda, ldc);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::potrf_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) {
-    return detail::potrs_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, nrhs, lda,
-                                                  ldb);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::potri_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::sytrf_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return detail::syevd_scratchpad_size<fp_type>(get_device_id(queue), queue, jobz, uplo, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb) {
-    return detail::sygvd_scratchpad_size<fp_type>(get_device_id(queue), queue, itype, jobz, uplo, n,
-                                                  lda, ldb);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::sytrd_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                   std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                   std::int64_t ldb) {
-    return detail::trtrs_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, trans, diag, n,
-                                                  nrhs, lda, ldb);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return detail::ungbr_scratchpad_size<fp_type>(get_device_id(queue), queue, vect, m, n, k, lda);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return detail::ungqr_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, k, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return detail::ungtr_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc) {
-    return detail::unmrq_scratchpad_size<fp_type>(get_device_id(queue), queue, side, trans, m, n, k,
-                                                  lda, ldc);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc) {
-    return detail::unmqr_scratchpad_size<fp_type>(get_device_id(queue), queue, side, trans, m, n, k,
-                                                  lda, ldc);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return detail::unmtr_scratchpad_size<fp_type>(get_device_id(queue), queue, side, uplo, trans, m,
-                                                  n, lda, ldc);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return detail::getrf_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda,
-                                                        stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size) {
-    return detail::getri_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, n, lda,
-                                                        stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size) {
-    return detail::getrs_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, trans, n, nrhs,
-                                                        lda, stride_a, stride_ipiv, ldb, stride_b,
-                                                        batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return detail::geqrf_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda,
-                                                        stride_a, stride_tau, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t batch_size) {
-    return detail::potrf_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda,
-                                                        stride_a, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size) {
-    return detail::potrs_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, nrhs,
-                                                        lda, stride_a, ldb, stride_b, batch_size);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return detail::orgqr_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, k, lda,
-                                                        stride_a, stride_tau, batch_size);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return detail::ungqr_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, k, lda,
-                                                        stride_a, stride_tau, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return detail::getrf_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda,
-                                                        group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return detail::getri_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, n, lda,
-                                                        group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return detail::getrs_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, trans, n, nrhs,
-                                                        lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return detail::geqrf_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, lda,
-                                                        group_count, group_sizes);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return detail::orgqr_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, k, lda,
-                                                        group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return detail::potrf_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, lda,
-                                                        group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return detail::potrs_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, uplo, n, nrhs,
-                                                        lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return detail::ungqr_batch_scratchpad_size<fp_type>(get_device_id(queue), queue, m, n, k, lda,
-                                                        group_count, group_sizes);
-}
-
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx b/include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx
deleted file mode 100644
index 1ebe97527..000000000
--- a/include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx
+++ /dev/null
@@ -1,2694 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-static inline void gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                               scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tauq, sycl::buffer<double> &taup,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                               scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tauq,
-                         sycl::buffer<float> &taup, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                               scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                               scratchpad, scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                               b, ldb, scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                               b, ldb, scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                               b, ldb, scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                               b, ldb, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<double> &u, std::int64_t ldu,
-                         sycl::buffer<double> &vt, std::int64_t ldvt,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                               u, ldu, vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<float> &u, std::int64_t ldu,
-                         sycl::buffer<float> &vt, std::int64_t ldvt,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                               u, ldu, vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<std::complex<float>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                               u, ldu, vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<std::complex<double>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<double>> &vt,
-                         std::int64_t ldvt, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                               u, ldu, vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void heevd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                               scratchpad, scratchpad_size);
-}
-static inline void heevd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                               scratchpad, scratchpad_size);
-}
-static inline void hegvd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                               b, ldb, w, scratchpad, scratchpad_size);
-}
-static inline void hegvd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                               b, ldb, w, scratchpad, scratchpad_size);
-}
-static inline void hetrd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void hetrd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void hetrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                               scratchpad, scratchpad_size);
-}
-static inline void hetrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                               scratchpad, scratchpad_size);
-}
-static inline void orgbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void orgbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void orgqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void orgqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void orgtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void orgtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void ormtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                               lda, tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                               lda, tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormrq(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormrq(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormqr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormqr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potri(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potri(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potri(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potri(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                               scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                               scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                               scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                               scratchpad, scratchpad_size);
-}
-static inline void syevd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                               scratchpad, scratchpad_size);
-}
-static inline void syevd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                               scratchpad, scratchpad_size);
-}
-static inline void sygvd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                               b, ldb, w, scratchpad, scratchpad_size);
-}
-static inline void sygvd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                               b, ldb, w, scratchpad, scratchpad_size);
-}
-static inline void sytrd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void sytrd(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                               scratchpad, scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                               scratchpad, scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                               scratchpad, scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                               scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                               lda, b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                               lda, b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                               lda, b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                               lda, b, ldb, scratchpad, scratchpad_size);
-}
-static inline void ungbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void ungbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void ungqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void ungqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void ungtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void ungtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                               scratchpad, scratchpad_size);
-}
-static inline void unmrq(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void unmrq(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void unmqr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void unmqr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                               tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void unmtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                               lda, tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void unmtr(backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                               lda, tau, c, ldc, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     tau, stride_tau, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     tau, stride_tau, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     tau, stride_tau, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     tau, stride_tau, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                     stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                                     batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                     stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                                     batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                     stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                                     batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                     stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                                     batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                     ipiv, stride_ipiv, batch_size, scratchpad,
-                                                     scratchpad_size);
-}
-static inline void orgqr_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                     stride_a, tau, stride_tau, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline void orgqr_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                     stride_a, tau, stride_tau, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(
-        selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(
-        selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(
-        selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(
-        selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                     stride_a, b, ldb, stride_b, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                     stride_a, b, ldb, stride_b, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                     stride_a, b, ldb, stride_b, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                     stride_a, b, ldb, stride_b, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline void ungqr_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                     stride_a, tau, stride_tau, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline void ungqr_batch(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                     stride_a, tau, stride_tau, batch_size,
-                                                     scratchpad, scratchpad_size);
-}
-static inline sycl::event gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *d,
-                                float *e, std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e,
-                                                      tauq, taup, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *d, double *e,
-                                double *tauq, double *taup, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e,
-                                                      tauq, taup, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *d, float *e,
-                                float *tauq, float *taup, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e,
-                                                      tauq, taup, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *d, double *e, std::complex<double> *tauq,
-                                std::complex<double> *taup, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e,
-                                                      tauq, taup, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *s, double *u,
-                                std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a,
-                                                      lda, s, u, ldu, vt, ldvt, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *s, float *u,
-                                std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a,
-                                                      lda, s, u, ldu, vt, ldvt, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a,
-                                                      lda, s, u, ldu, vt, ldvt, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *s, std::complex<double> *u, std::int64_t ldu,
-                                std::complex<double> *vt, std::int64_t ldvt,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a,
-                                                      lda, s, u, ldu, vt, ldvt, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event heevd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *w,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda,
-                                                      w, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event heevd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *w,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda,
-                                                      w, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hegvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *b, std::int64_t ldb, float *w,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a,
-                                                      lda, b, ldb, w, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event hegvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *b, std::int64_t ldb, double *w,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a,
-                                                      lda, b, ldb, w, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event hetrd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, float *d, float *e, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e,
-                                                      tau, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event hetrd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, double *d, double *e, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e,
-                                                      tau, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event hetrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgbr(
-        selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgbr(
-        selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n,
-                                                      a, lda, tau, c, ldc, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event ormtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n,
-                                                      a, lda, tau, c, ldc, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event ormrq(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event ormrq(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event ormqr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event ormqr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event syevd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *w, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda,
-                                                      w, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event syevd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *w, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda,
-                                                      w, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sygvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, double *a, std::int64_t lda, double *b,
-                                std::int64_t ldb, double *w, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a,
-                                                      lda, b, ldb, w, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event sygvd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, float *a, std::int64_t lda, float *b,
-                                std::int64_t ldb, float *w, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a,
-                                                      lda, b, ldb, w, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event sytrd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *d, double *e, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e,
-                                                      tau, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event sytrd(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *d, float *e, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e,
-                                                      tau, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n,
-                                                      nrhs, a, lda, b, ldb, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n,
-                                                      nrhs, a, lda, b, ldb, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n,
-                                                      nrhs, a, lda, b, ldb, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n,
-                                                      nrhs, a, lda, b, ldb, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event ungbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungbr(
-        selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungbr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungbr(
-        selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(backend_selector<backend::LAPACK_BACKEND> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmrq(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event unmrq(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event unmqr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event unmqr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a,
-                                                      lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                      dependencies);
-}
-static inline sycl::event unmtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n,
-                                                      a, lda, tau, c, ldc, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event unmtr(backend_selector<backend::LAPACK_BACKEND> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n,
-                                                      a, lda, tau, c, ldc, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<float> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<double> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, float **a,
-                                      std::int64_t *lda, float **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, double **a,
-                                      std::int64_t *lda, double **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(
-        selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(
-        selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(
-        selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(
-        selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(
-        selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *n, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *n, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(
-    backend_selector<backend::LAPACK_BACKEND> selector, oneapi::mkl::transpose *trans,
-    std::int64_t *n, std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda,
-    std::int64_t **ipiv, std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-    const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, float *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(
-        selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, double *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(
-        selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k, float **a,
-                                      std::int64_t *lda, float **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(
-        selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k, double **a,
-                                      std::int64_t *lda, double **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(
-        selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            stride_a, batch_size, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            stride_a, batch_size, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            stride_a, batch_size, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            stride_a, batch_size, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                            group_count, group_sizes, scratchpad,
-                                                            scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      float **a, std::int64_t *lda, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      double **a, std::int64_t *lda, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(
-        selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::int64_t k,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(
-        selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t m, std::int64_t n, std::int64_t k,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(
-        selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(
-        selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::LAPACK_BACKEND> selector,
-                                      std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(
-        selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad,
-        scratchpad_size, dependencies);
-}
-
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gebrd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               m, n, lda);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gerqf_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               m, n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               m, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldu, std::int64_t ldvt) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd_scratchpad_size<fp_type>(
-        selector.get_queue(), jobu, jobvt, m, n, lda, ldu, ldvt);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldu, std::int64_t ldvt) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd_scratchpad_size<fp_type>(
-        selector.get_queue(), jobu, jobvt, m, n, lda, ldu, ldvt);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               m, n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_scratchpad_size<fp_type>(
-        selector.get_queue(), trans, n, nrhs, lda, ldb);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t heevd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::heevd_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               jobz, uplo, n, lda);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hegvd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hegvd_scratchpad_size<fp_type>(
-        selector.get_queue(), itype, jobz, uplo, n, lda, ldb);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hetrd_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrf_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::hetrf_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgbr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgbr_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               vect, m, n, k, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgtr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgtr_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t m, std::int64_t n, std::int64_t k,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               m, n, k, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormrq_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormrq_scratchpad_size<fp_type>(
-        selector.get_queue(), side, trans, m, n, k, lda, ldc);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormqr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormqr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, trans, m, n, k, lda, ldc);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormtr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ormtr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, uplo, trans, m, n, lda, ldc);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, nrhs, lda, ldb);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potri_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potri_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t sytrf_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t syevd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::syevd_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               jobz, uplo, n, lda);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sygvd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sygvd_scratchpad_size<fp_type>(
-        selector.get_queue(), itype, jobz, uplo, n, lda, ldb);
-}
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sytrd_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::sytrd_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t trtrs_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, trans, diag, n, nrhs, lda, ldb);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungbr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungbr_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               vect, m, n, k, lda);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   std::int64_t m, std::int64_t n, std::int64_t k,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               m, n, k, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungtr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungtr_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                               uplo, n, lda);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmrq_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmrq_scratchpad_size<fp_type>(
-        selector.get_queue(), side, trans, m, n, k, lda, ldc);
-}
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmqr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmqr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, trans, m, n, k, lda, ldc);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmtr_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::unmtr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, uplo, trans, m, n, lda, ldc);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b,
-        batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_tau,
-                                         std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, lda, stride_a, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                         std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), n, lda, group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), trans, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, group_count, group_sizes);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, lda, group_count, group_sizes);
-}
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(backend_selector<backend::LAPACK_BACKEND> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, group_count, group_sizes);
-}
diff --git a/include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx b/include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx
deleted file mode 100644
index 372e2646b..000000000
--- a/include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx
+++ /dev/null
@@ -1,2139 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tauq,
-                         sycl::buffer<double> &taup, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<float> &tauq, sycl::buffer<float> &taup,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<double> &u, std::int64_t ldu,
-                         sycl::buffer<double> &vt, std::int64_t ldvt,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<float> &u, std::int64_t ldu,
-                         sycl::buffer<float> &vt, std::int64_t ldvt,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<float> &s,
-                         sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<double> &s,
-                         sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &d,
-                         sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *d, double *e, double *tauq, double *taup,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *d, float *e, float *tauq, float *taup,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tauq, std::complex<double> *taup,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu,
-                                double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt,
-                                std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *s,
-                                std::complex<double> *u, std::int64_t ldu, std::complex<double> *vt,
-                                std::int64_t ldvt, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *w,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, double *a, std::int64_t lda, double *w,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, float *a, std::int64_t lda, float *w,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *w, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *w, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *d, double *e, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *d, float *e, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, float *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, double *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      float *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      double *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldu, std::int64_t ldvt);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldu, std::int64_t ldvt);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                   std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                   std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<float>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<double>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<float>(sycl::queue &queue,
-                                                        oneapi::mkl::jobsvd jobu,
-                                                        oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda,
-                                                        std::int64_t ldu, std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<double>(sycl::queue &queue,
-                                                         oneapi::mkl::jobsvd jobu,
-                                                         oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda,
-                                                         std::int64_t ldu, std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-    std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-    std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      std::int64_t m,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       std::int64_t m,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<float>(sycl::queue &queue, std::int64_t n,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<double>(sycl::queue &queue, std::int64_t n,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<float>(sycl::queue &queue,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<double>(sycl::queue &queue,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t heevd_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      oneapi::mkl::job jobz,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t heevd_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       oneapi::mkl::job jobz,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-    std::int64_t n, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-    std::int64_t n, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size<float>(sycl::queue &queue,
-                                                        oneapi::mkl::generate vect, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size<double>(sycl::queue &queue,
-                                                         oneapi::mkl::generate vect, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size<float>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size<double>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t syevd_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t syevd_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size<float>(sycl::queue &queue, std::int64_t itype,
-                                                        oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size<double>(sycl::queue &queue, std::int64_t itype,
-                                                         oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        oneapi::mkl::diag diag, std::int64_t n,
-                                                        std::int64_t nrhs, std::int64_t lda,
-                                                        std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         oneapi::mkl::diag diag, std::int64_t n,
-                                                         std::int64_t nrhs, std::int64_t lda,
-                                                         std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-    oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-    oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <>
-ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                                      oneapi::mkl::uplo uplo,
-                                                                      std::int64_t n,
-                                                                      std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                                       oneapi::mkl::uplo uplo,
-                                                                       std::int64_t n,
-                                                                       std::int64_t lda);
-template <>
-ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-    std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-    oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda,
-    std::int64_t ldc);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_ipiv,
-                                                              std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_ipiv,
-                                                               std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t n,
-                                                              std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_ipiv,
-                                                              std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t n,
-                                                               std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_ipiv,
-                                                               std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t stride_ipiv, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<float>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<double>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_tau,
-                                                              std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_tau,
-                                                               std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<float>(sycl::queue &queue,
-                                                              oneapi::mkl::uplo uplo,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<double>(sycl::queue &queue,
-                                                               oneapi::mkl::uplo uplo,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<float>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<double>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<float>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<double>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *n,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *n,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t *n, std::int64_t *lda, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t *n, std::int64_t *lda, std::int64_t group_count,
-    std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<float>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<double>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *k,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *k,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<float>(sycl::queue &queue,
-                                                              oneapi::mkl::uplo *uplo,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<double>(sycl::queue &queue,
-                                                               oneapi::mkl::uplo *uplo,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<float>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<double>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
-template <>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, std::int64_t *lda,
-    std::int64_t group_count, std::int64_t *group_sizes);
diff --git a/include/oneapi/mkl/lapack/detail/mklcpu/lapack_ct.hpp b/include/oneapi/mkl/lapack/detail/mklcpu/lapack_ct.hpp
deleted file mode 100644
index 1a6c088d6..000000000
--- a/include/oneapi/mkl/lapack/detail/mklcpu/lapack_ct.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/lapack/detail/mklcpu/onemkl_lapack_mklcpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-
-#define LAPACK_BACKEND mklcpu
-#include "oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx"
-#undef LAPACK_BACKEND
-
-} //namespace lapack
-} //namespace mkl
-} //namespace oneapi
diff --git a/include/oneapi/mkl/lapack/detail/mklcpu/onemkl_lapack_mklcpu.hpp b/include/oneapi/mkl/lapack/detail/mklcpu/onemkl_lapack_mklcpu.hpp
deleted file mode 100644
index fc52ce1db..000000000
--- a/include/oneapi/mkl/lapack/detail/mklcpu/onemkl_lapack_mklcpu.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace mklcpu {
-
-#include "oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx"
-
-} //namespace mklcpu
-} //namespace lapack
-} //namespace mkl
-} //namespace oneapi
diff --git a/include/oneapi/mkl/lapack/detail/mklgpu/lapack_ct.hpp b/include/oneapi/mkl/lapack/detail/mklgpu/lapack_ct.hpp
deleted file mode 100644
index e344966a0..000000000
--- a/include/oneapi/mkl/lapack/detail/mklgpu/lapack_ct.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/lapack/detail/mklgpu/onemkl_lapack_mklgpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-
-#define LAPACK_BACKEND mklgpu
-#include "oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx"
-#undef LAPACK_BACKEND
-
-} //namespace lapack
-} //namespace mkl
-} //namespace oneapi
diff --git a/include/oneapi/mkl/lapack/detail/mklgpu/onemkl_lapack_mklgpu.hpp b/include/oneapi/mkl/lapack/detail/mklgpu/onemkl_lapack_mklgpu.hpp
deleted file mode 100644
index 132431b7c..000000000
--- a/include/oneapi/mkl/lapack/detail/mklgpu/onemkl_lapack_mklgpu.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace mklgpu {
-
-#include "oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx"
-
-} //namespace mklgpu
-} //namespace lapack
-} //namespace mkl
-} //namespace oneapi
diff --git a/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hpp b/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hpp
deleted file mode 100644
index 5e98b7c47..000000000
--- a/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _DETAIL_ROCSOLVER_LAPACK_CT_HPP_
-#define _DETAIL_ROCSOLVER_LAPACK_CT_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-#include "oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-
-#define LAPACK_BACKEND rocsolver
-#include "lapack_ct.hxx"
-#undef LAPACK_BACKEND
-
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_DETAIL_ROCSOLVER_LAPACK_CT_HPP_
diff --git a/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx b/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx
deleted file mode 100644
index 774441409..000000000
--- a/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx
+++ /dev/null
@@ -1,2629 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// Buffer APIs
-
-static inline void gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tauq, sycl::buffer<double> &taup,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tauq,
-                         sycl::buffer<float> &taup, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void getrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::transpose trans,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::rocsolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &s,
-                         sycl::buffer<double> &u, std::int64_t ldu, sycl::buffer<double> &vt,
-                         std::int64_t ldvt, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::rocsolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &s,
-                         sycl::buffer<float> &u, std::int64_t ldu, sycl::buffer<float> &vt,
-                         std::int64_t ldvt, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::rocsolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<std::complex<float>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void gesvd(backend_selector<backend::rocsolver> selector, oneapi::mkl::jobsvd jobu,
-                         oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<std::complex<double>> &u,
-                         std::int64_t ldu, sycl::buffer<std::complex<double>> &vt,
-                         std::int64_t ldvt, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu,
-                                         vt, ldvt, scratchpad, scratchpad_size);
-}
-static inline void heevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void heevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void hegvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void hegvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void hetrd(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void hetrd(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void hetrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void hetrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgbr(backend_selector<backend::rocsolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void orgbr(backend_selector<backend::rocsolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void orgqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void orgtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ormtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void ormrq(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void ormrq(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void ormqr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void ormqr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potri(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                         scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void potrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                         scratchpad, scratchpad_size);
-}
-static inline void syevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void syevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sygvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &w,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void sygvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb,
-                                         w, scratchpad, scratchpad_size);
-}
-static inline void sytrd(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void sytrd(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void sytrf(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad,
-                                         scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void trtrs(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda,
-                                         b, ldb, scratchpad, scratchpad_size);
-}
-static inline void ungbr(backend_selector<backend::rocsolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void ungbr(backend_selector<backend::rocsolver> selector, oneapi::mkl::generate vec,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                         scratchpad, scratchpad_size);
-}
-static inline void ungqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ungqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ungtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void ungtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad,
-                                         scratchpad_size);
-}
-static inline void unmrq(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmrq(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmqr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmqr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c,
-                                         ldc, scratchpad, scratchpad_size);
-}
-static inline void unmtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void unmtr(backend_selector<backend::rocsolver> selector, oneapi::mkl::side side,
-                         oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau,
-                                         c, ldc, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::rocsolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::rocsolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::rocsolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrs_batch(backend_selector<backend::rocsolver> selector,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                               stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv,
-                                               stride_ipiv, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void orgqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void orgqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrf_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a,
-                                               batch_size, scratchpad, scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void potrs_batch(backend_selector<backend::rocsolver> selector, oneapi::mkl::uplo uplo,
-                               std::int64_t n, std::int64_t nrhs,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &b,
-                               std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                               stride_a, b, ldb, stride_b, batch_size, scratchpad,
-                                               scratchpad_size);
-}
-static inline void ungqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-static inline void ungqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                               std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size) {
-    oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau,
-                                               stride_tau, batch_size, scratchpad, scratchpad_size);
-}
-
-// USM APIs
-
-static inline sycl::event gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *d,
-                                float *e, std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *d, double *e,
-                                double *tauq, double *taup, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *d, float *e,
-                                float *tauq, float *taup, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gebrd(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *d, double *e, std::complex<double> *tauq,
-                                std::complex<double> *taup, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq,
-                                                taup, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event gerqf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrf(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getri(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv,
-                                                b, ldb, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, double *a, std::int64_t lda, double *s, double *u,
-                                std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, float *a, std::int64_t lda, float *s, float *u,
-                                std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event gesvd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *s, std::complex<double> *u, std::int64_t ldu,
-                                std::complex<double> *vt, std::int64_t ldvt,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s,
-                                                u, ldu, vt, ldvt, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event heevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event heevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hegvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event hegvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event hetrd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, float *d, float *e, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, double *d, double *e, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event hetrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgbr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ormtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormrq(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormrq(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormqr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                                std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ormqr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                                std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potri(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad,
-                                                scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event syevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *w, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event syevd(backend_selector<backend::rocsolver> selector, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *w, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sygvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb, double *w,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event sygvd(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *b, std::int64_t ldb, float *w,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda,
-                                                b, ldb, w, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event sytrd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *d, double *e, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrd(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *d, float *e, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event sytrf(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                                std::int64_t ldb, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                                std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event trtrs(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                                std::int64_t ldb, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a,
-                                                lda, b, ldb, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event ungbr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungbr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau,
-                                                scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event unmrq(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmrq(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmqr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmqr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda,
-                                                tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event unmtr(backend_selector<backend::rocsolver> selector,
-                                oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a,
-                                                lda, tau, c, ldc, scratchpad, scratchpad_size,
-                                                dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      tau, stride_tau, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event geqrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrf_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                      float *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                      double *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a,
-                                                      ipiv, stride_ipiv, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getri_batch(backend_selector<backend::rocsolver> selector, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::transpose trans, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(
-    backend_selector<backend::rocsolver> selector, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<float> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size, std::complex<float> *scratchpad,
-    std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(
-    backend_selector<backend::rocsolver> selector, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-    std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<double> *b, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size, std::complex<double> *scratchpad,
-    std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(
-        selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b,
-        batch_size, scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::transpose *trans, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event getrs_batch(
-    backend_selector<backend::rocsolver> selector, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-    std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-    const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda,
-                                                      ipiv, b, ldb, group_count, group_sizes,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, float **a,
-                                      std::int64_t *lda, float **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event orgqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, double **a,
-                                      std::int64_t *lda, double **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      stride_a, batch_size, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrf_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      stride_a, b, ldb, stride_b, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      float **a, std::int64_t *lda, float **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      double **a, std::int64_t *lda, double **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event potrs_batch(backend_selector<backend::rocsolver> selector,
-                                      oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda,
-                                                      b, ldb, group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<float> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                      std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a,
-                                      std::complex<double> *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda,
-                                                      stride_a, tau, stride_tau, batch_size,
-                                                      scratchpad, scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-static inline sycl::event ungqr_batch(backend_selector<backend::rocsolver> selector, std::int64_t *m,
-                                      std::int64_t *n, std::int64_t *k, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {}) {
-    return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau,
-                                                      group_count, group_sizes, scratchpad,
-                                                      scratchpad_size, dependencies);
-}
-
-// SCRATCHPAD APIs
-template <typename fp_type>
-std::int64_t gebrd_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::gebrd_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t gerqf_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::gerqf_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t geqrf_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t gesvd_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldu, std::int64_t ldvt) {
-    return oneapi::mkl::lapack::rocsolver::gesvd_scratchpad_size<fp_type>(
-        selector.get_queue(), jobu, jobvt, m, n, lda, ldu, ldvt);
-}
-template <typename fp_type>
-std::int64_t getrf_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::getrf_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t getri_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::getri_scratchpad_size<fp_type>(selector.get_queue(), n,
-                                                                         lda);
-}
-template <typename fp_type>
-std::int64_t getrs_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::rocsolver::getrs_scratchpad_size<fp_type>(selector.get_queue(),
-                                                                         trans, n, nrhs, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t heevd_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::heevd_scratchpad_size<fp_type>(selector.get_queue(), jobz,
-                                                                         uplo, n, lda);
-}
-template <typename fp_type>
-std::int64_t hegvd_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::rocsolver::hegvd_scratchpad_size<fp_type>(
-        selector.get_queue(), itype, jobz, uplo, n, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t hetrd_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::hetrd_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t hetrf_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::hetrf_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t orgbr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::orgbr_scratchpad_size<fp_type>(selector.get_queue(), vect,
-                                                                         m, n, k, lda);
-}
-template <typename fp_type>
-std::int64_t orgtr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::orgtr_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t orgqr_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::orgqr_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         k, lda);
-}
-template <typename fp_type>
-std::int64_t ormrq_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::rocsolver::ormrq_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t ormqr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::rocsolver::ormqr_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t ormtr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc) {
-    return oneapi::mkl::lapack::rocsolver::ormtr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, uplo, trans, m, n, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t potrf_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::potrf_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t potrs_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::rocsolver::potrs_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, nrhs, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t potri_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::potri_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t sytrf_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::sytrf_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t syevd_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::syevd_scratchpad_size<fp_type>(selector.get_queue(), jobz,
-                                                                         uplo, n, lda);
-}
-template <typename fp_type>
-std::int64_t sygvd_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t itype,
-                                   oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::rocsolver::sygvd_scratchpad_size<fp_type>(
-        selector.get_queue(), itype, jobz, uplo, n, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t sytrd_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::sytrd_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t trtrs_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                   std::int64_t lda, std::int64_t ldb) {
-    return oneapi::mkl::lapack::rocsolver::trtrs_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, trans, diag, n, nrhs, lda, ldb);
-}
-template <typename fp_type>
-std::int64_t ungbr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::generate vect, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::ungbr_scratchpad_size<fp_type>(selector.get_queue(), vect,
-                                                                         m, n, k, lda);
-}
-template <typename fp_type>
-std::int64_t ungqr_scratchpad_size(backend_selector<backend::rocsolver> selector, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::ungqr_scratchpad_size<fp_type>(selector.get_queue(), m, n,
-                                                                         k, lda);
-}
-template <typename fp_type>
-std::int64_t ungtr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) {
-    return oneapi::mkl::lapack::rocsolver::ungtr_scratchpad_size<fp_type>(selector.get_queue(), uplo,
-                                                                         n, lda);
-}
-template <typename fp_type>
-std::int64_t unmrq_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::rocsolver::unmrq_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t unmqr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                   std::int64_t ldc) {
-    return oneapi::mkl::lapack::rocsolver::unmqr_scratchpad_size<fp_type>(selector.get_queue(), side,
-                                                                         trans, m, n, k, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t unmtr_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                   oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldc) {
-    return oneapi::mkl::lapack::rocsolver::unmtr_scratchpad_size<fp_type>(
-        selector.get_queue(), side, uplo, trans, m, n, lda, ldc);
-}
-template <typename fp_type>
-std::int64_t getrf_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type>
-std::int64_t getri_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t n, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <typename fp_type>
-std::int64_t getrs_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         oneapi::mkl::transpose trans, std::int64_t n,
-                                         std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b,
-        batch_size);
-}
-template <typename fp_type>
-std::int64_t geqrf_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_tau,
-                                         std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type>
-std::int64_t potrf_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, lda, stride_a, batch_size);
-}
-template <typename fp_type>
-std::int64_t potrs_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                         std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <typename fp_type>
-std::int64_t orgqr_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::orgqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type>
-std::int64_t ungqr_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t m, std::int64_t n, std::int64_t k,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size) {
-    return oneapi::mkl::lapack::rocsolver::ungqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <typename fp_type>
-std::int64_t getrf_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t getri_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t getrs_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         oneapi::mkl::transpose *trans, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), trans, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t geqrf_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t orgqr_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::orgqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t potrf_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, lda, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t potrs_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                         std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), uplo, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <typename fp_type>
-std::int64_t ungqr_batch_scratchpad_size(backend_selector<backend::rocsolver> selector,
-                                         std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes) {
-    return oneapi::mkl::lapack::rocsolver::ungqr_batch_scratchpad_size<fp_type>(
-        selector.get_queue(), m, n, k, lda, group_count, group_sizes);
-}
diff --git a/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp b/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp
deleted file mode 100644
index f7e83f9a9..000000000
--- a/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_LAPACK_ROCSOLVER_HPP_
-#define _ONEMKL_LAPACK_ROCSOLVER_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-#include <string>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/export.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-#include "onemkl_lapack_rocsolver.hxx"
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_LAPACK_ROCSOLVER_HPP_
diff --git a/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx b/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx
deleted file mode 100644
index c68009e54..000000000
--- a/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx
+++ /dev/null
@@ -1,1835 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// Buffer APIs
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tauq,
-                         sycl::buffer<std::complex<float>> &taup,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tauq,
-                         sycl::buffer<double> &taup, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<float> &tauq, sycl::buffer<float> &taup,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tauq,
-                         sycl::buffer<std::complex<double>> &taup,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-                         std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &s, sycl::buffer<double> &u, std::int64_t ldu,
-                         sycl::buffer<double> &vt, std::int64_t ldvt,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &s, sycl::buffer<float> &u, std::int64_t ldu,
-                         sycl::buffer<float> &vt, std::int64_t ldvt,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<float> &s,
-                         sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                         std::int64_t m, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<double> &s,
-                         sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-                         sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<float> &d, sycl::buffer<float> &e,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<double> &d, sycl::buffer<double> &e,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &c, std::int64_t ldc,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                         std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                         std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                         std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                         std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                         sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                         oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                         std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                         sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                         sycl::buffer<double> &e, sycl::buffer<double> &tau,
-                         sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &d,
-                         sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<double> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::int64_t> &ipiv,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                         std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                         std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                         oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                         std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                         std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                         std::int64_t m, std::int64_t n, std::int64_t k,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<float>> &tau,
-                         sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<float>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                         sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                         sycl::buffer<std::complex<double>> &tau,
-                         sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                         sycl::buffer<std::complex<double>> &scratchpad,
-                         std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                               std::int64_t stride_ipiv, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<float> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<double> &tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<float> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<double> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                               std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                               std::int64_t lda, std::int64_t stride_a,
-                               sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<float>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                               sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                               std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                               std::int64_t stride_tau, std::int64_t batch_size,
-                               sycl::buffer<std::complex<double>> &scratchpad,
-                               std::int64_t scratchpad_size);
-
-// USM APIs
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tauq, std::complex<float> *taup,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *d, double *e, double *tauq, double *taup,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *d, float *e, float *tauq, float *taup,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tauq, std::complex<double> *taup,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                                std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                                std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::int64_t *ipiv, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu,
-                                double *vt, std::int64_t ldvt, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a,
-                                std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt,
-                                std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *s,
-                                std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                                std::int64_t ldvt, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *s,
-                                std::complex<double> *u, std::int64_t ldu, std::complex<double> *vt,
-                                std::int64_t ldvt, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<float> *a, std::int64_t lda, float *w,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                float *w, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                double *w, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                std::complex<float> *tau, std::complex<float> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, double *a, std::int64_t lda, double *tau, double *c,
-                                std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, float *a, std::int64_t lda, float *b,
-                                std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, double *a, std::int64_t lda, double *b,
-                                std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, double *a, std::int64_t lda, double *w,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                std::int64_t n, float *a, std::int64_t lda, float *w,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *w, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *w, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, double *d, double *e, double *tau,
-                                double *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, float *d, float *e, float *tau,
-                                float *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda,
-                                double *b, std::int64_t ldb, double *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                float *b, std::int64_t ldb, float *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                std::int64_t lda, std::complex<double> *tau,
-                                std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                                std::complex<float> *c, std::int64_t ldc,
-                                std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                std::complex<double> *a, std::int64_t lda,
-                                std::complex<double> *tau, std::complex<double> *c,
-                                std::int64_t ldc, std::complex<double> *scratchpad,
-                                std::int64_t scratchpad_size,
-                                const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, float *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, double *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t **ipiv, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, double *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                      std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                                      std::int64_t stride_ipiv, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, float **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, double **a,
-                                      std::int64_t *lda, std::int64_t **ipiv, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                      std::int64_t *n, std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::int64_t **ipiv,
-                                      std::complex<double> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, float **a, std::int64_t *lda, float **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, double **a, std::int64_t *lda, double **tau,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      float *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      double *a, std::int64_t lda, std::int64_t stride_a,
-                                      std::int64_t batch_size, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      float **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      double **a, std::int64_t *lda, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<float> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::complex<double> **a, std::int64_t *lda,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, float *a, std::int64_t lda,
-                                      std::int64_t stride_a, float *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      float *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, double *a, std::int64_t lda,
-                                      std::int64_t stride_a, double *b, std::int64_t ldb,
-                                      std::int64_t stride_b, std::int64_t batch_size,
-                                      double *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                      std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *b,
-                                      std::int64_t ldb, std::int64_t stride_b,
-                                      std::int64_t batch_size, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, float **a, std::int64_t *lda, float **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, float *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, double **a, std::int64_t *lda, double **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, double *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<float> **a,
-                                      std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                                      std::int64_t group_count, std::int64_t *group_sizes,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                      std::int64_t *nrhs, std::complex<double> **a,
-                                      std::int64_t *lda, std::complex<double> **b,
-                                      std::int64_t *ldb, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<float> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                      std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                      std::int64_t stride_a, std::complex<double> *tau,
-                                      std::int64_t stride_tau, std::int64_t batch_size,
-                                      std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<float> **a, std::int64_t *lda,
-                                      std::complex<float> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                      std::int64_t *k, std::complex<double> **a, std::int64_t *lda,
-                                      std::complex<double> **tau, std::int64_t group_count,
-                                      std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                                      std::int64_t scratchpad_size,
-                                      const std::vector<sycl::event> &dependencies = {});
-
-// SCRATCHPAD APIs
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                                 oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                 std::int64_t n, std::int64_t lda, std::int64_t ldu,
-                                                 std::int64_t ldvt);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                 oneapi::mkl::uplo uplo, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype,
-                                                 oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                 std::int64_t m, std::int64_t n, std::int64_t k,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t k, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::uplo uplo,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                 oneapi::mkl::uplo uplo, std::int64_t n,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype,
-                                                 oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 oneapi::mkl::transpose trans,
-                                                 oneapi::mkl::diag diag, std::int64_t n,
-                                                 std::int64_t nrhs, std::int64_t lda,
-                                                 std::int64_t ldb);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                 std::int64_t m, std::int64_t n, std::int64_t k,
-                                                 std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t k, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                                 oneapi::mkl::uplo uplo,
-                                                 oneapi::mkl::transpose trans, std::int64_t m,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t ldc);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t lda,
-                                                       std::int64_t stride_a,
-                                                       std::int64_t stride_ipiv,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t stride_ipiv,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue,
-	      	                                       oneapi::mkl::transpose trans,
-                                                       std::int64_t n, std::int64_t nrhs,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t stride_ipiv, std::int64_t ldb,
-                                                       std::int64_t stride_b,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t lda,
-                                                       std::int64_t stride_a,
-                                                       std::int64_t stride_tau,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                       std::int64_t n, std::int64_t lda,
-                                                       std::int64_t stride_a,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                       std::int64_t n, std::int64_t nrhs,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t ldb, std::int64_t stride_b,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t k,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t stride_tau,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m,
-                                                       std::int64_t n, std::int64_t k,
-                                                       std::int64_t lda, std::int64_t stride_a,
-                                                       std::int64_t stride_tau,
-                                                       std::int64_t batch_size);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *lda,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n,
-                                                       std::int64_t *lda, std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *lda,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *k,
-                                                       std::int64_t *lda, std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                       std::int64_t *n, std::int64_t *lda,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                       std::int64_t *n, std::int64_t *nrhs,
-                                                       std::int64_t *lda, std::int64_t *ldb,
-                                                       std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
-
-template <typename T>
-ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m,
-                                                       std::int64_t *n, std::int64_t *k,
-                                                       std::int64_t *lda, std::int64_t group_count,
-                                                       std::int64_t *group_sizes);
diff --git a/include/oneapi/mkl/lapack/exceptions.hpp b/include/oneapi/mkl/lapack/exceptions.hpp
deleted file mode 100644
index da205cc1a..000000000
--- a/include/oneapi/mkl/lapack/exceptions.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-
-class exception {
-public:
-    exception(oneapi::mkl::exception *_ex, std::int64_t info, std::int64_t detail = 0)
-            : _info(info),
-              _detail(detail),
-              _ex(_ex) {}
-    std::int64_t info() const {
-        return _info;
-    }
-    std::int64_t detail() const {
-        return _detail;
-    }
-    const char *what() const {
-        return _ex->what();
-    }
-
-private:
-    std::int64_t _info;
-    std::int64_t _detail;
-    mkl::exception *_ex;
-};
-
-class computation_error : public oneapi::mkl::computation_error,
-                          public oneapi::mkl::lapack::exception {
-public:
-    computation_error(const std::string &function, const std::string &info, std::int64_t code)
-            : oneapi::mkl::computation_error("LAPACK", function, info),
-              oneapi::mkl::lapack::exception(this, code) {}
-    using oneapi::mkl::computation_error::what;
-};
-
-class batch_error : public oneapi::mkl::batch_error, public oneapi::mkl::lapack::exception {
-public:
-    batch_error(const std::string &function, const std::string &info, std::int64_t num_errors,
-                std::vector<std::int64_t> ids = {}, std::vector<std::exception_ptr> exceptions = {})
-            : oneapi::mkl::batch_error("LAPACK", function, info),
-              oneapi::mkl::lapack::exception(this, num_errors),
-              _ids(ids),
-              _exceptions(exceptions) {}
-    using oneapi::mkl::batch_error::what;
-    const std::vector<std::int64_t> &ids() const {
-        return _ids;
-    }
-    const std::vector<std::exception_ptr> &exceptions() const {
-        return _exceptions;
-    }
-
-private:
-    std::vector<std::int64_t> _ids;
-    std::vector<std::exception_ptr> _exceptions;
-};
-
-class invalid_argument : public oneapi::mkl::invalid_argument,
-                         public oneapi::mkl::lapack::exception {
-public:
-    invalid_argument(const std::string &function, const std::string &info,
-                     std::int64_t arg_position = 0, std::int64_t detail = 0)
-            : oneapi::mkl::invalid_argument("LAPACK", function, info),
-              oneapi::mkl::lapack::exception(this, arg_position, detail) {}
-    using oneapi::mkl::invalid_argument::what;
-};
-
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/include/oneapi/mkl/lapack/types.hpp b/include/oneapi/mkl/lapack/types.hpp
deleted file mode 100644
index 8dbe19e2e..000000000
--- a/include/oneapi/mkl/lapack/types.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace internal {
-
-// auxilary type aliases and forward declarations
-template <bool, typename T = void>
-struct enable_if;
-template <typename T>
-struct is_fp;
-template <typename T>
-struct is_rfp;
-template <typename T>
-struct is_cfp;
-
-// auxilary typechecking templates
-template <typename T>
-struct enable_if<true, T> {
-    using type = T;
-};
-
-template <>
-struct is_fp<float> {
-    static constexpr bool value{ true };
-};
-template <>
-struct is_fp<double> {
-    static constexpr bool value{ true };
-};
-template <>
-struct is_fp<std::complex<float>> {
-    static constexpr bool value{ true };
-};
-template <>
-struct is_fp<std::complex<double>> {
-    static constexpr bool value{ true };
-};
-
-template <>
-struct is_rfp<float> {
-    static constexpr bool value{ true };
-};
-template <>
-struct is_rfp<double> {
-    static constexpr bool value{ true };
-};
-
-template <>
-struct is_cfp<std::complex<float>> {
-    static constexpr bool value{ true };
-};
-template <>
-struct is_cfp<std::complex<double>> {
-    static constexpr bool value{ true };
-};
-
-template <typename fp>
-using is_floating_point = typename enable_if<is_fp<fp>::value>::type*;
-template <typename fp>
-using is_real_floating_point = typename enable_if<is_rfp<fp>::value>::type*;
-template <typename fp>
-using is_complex_floating_point = typename enable_if<is_cfp<fp>::value>::type*;
-
-} // namespace internal
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/include/oneapi/mkl/rng.hpp b/include/oneapi/mkl/rng.hpp
deleted file mode 100644
index 5dfbdac81..000000000
--- a/include/oneapi/mkl/rng.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_HPP_
-#define _ONEMKL_RNG_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-#include "oneapi/mkl/rng/predicates.hpp"
-#include "oneapi/mkl/rng/detail/rng_loader.hpp"
-
-#include "oneapi/mkl/rng/functions.hpp"
-#include "oneapi/mkl/rng/distributions.hpp"
-#include "oneapi/mkl/rng/engines.hpp"
-
-#endif // _ONEMKL_RNG_HPP_
diff --git a/include/oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp b/include/oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp
deleted file mode 100644
index 062d21b61..000000000
--- a/include/oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*******************************************************************************
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#ifndef _ONEMKL_RNG_CURAND_HPP_
-#define _ONEMKL_RNG_CURAND_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <cstdint>
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace curand {
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue,
-                                                                          std::uint64_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                                     std::uint32_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(
-    sycl::queue queue, std::initializer_list<std::uint32_t> seed);
-
-} // namespace curand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_CURAND_HPP_
diff --git a/include/oneapi/mkl/rng/detail/engine_impl.hpp b/include/oneapi/mkl/rng/detail/engine_impl.hpp
deleted file mode 100644
index e76181e4c..000000000
--- a/include/oneapi/mkl/rng/detail/engine_impl.hpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_ENGINE_IMPL_HPP_
-#define _ONEMKL_RNG_ENGINE_IMPL_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-#include "oneapi/mkl/rng/distributions.hpp"
-#include "oneapi/mkl/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace detail {
-
-class engine_impl {
-public:
-    engine_impl(sycl::queue queue) : queue_(queue) {}
-
-    engine_impl(const engine_impl& other) : queue_(other.queue_) {}
-
-    // Buffers API
-    virtual void generate(const uniform<float, uniform_method::standard>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) = 0;
-
-    virtual void generate(const uniform<double, uniform_method::standard>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) = 0;
-
-    virtual void generate(const uniform<std::int32_t, uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) = 0;
-
-    virtual void generate(const uniform<float, uniform_method::accurate>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) = 0;
-
-    virtual void generate(const uniform<double, uniform_method::accurate>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) = 0;
-
-    virtual void generate(const gaussian<float, gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) = 0;
-
-    virtual void generate(const gaussian<double, gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) = 0;
-
-    virtual void generate(const gaussian<float, gaussian_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) = 0;
-
-    virtual void generate(const gaussian<double, gaussian_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) = 0;
-
-    virtual void generate(const lognormal<float, lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) = 0;
-
-    virtual void generate(const lognormal<double, lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) = 0;
-
-    virtual void generate(const lognormal<float, lognormal_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) = 0;
-
-    virtual void generate(const lognormal<double, lognormal_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) = 0;
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) = 0;
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) = 0;
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) = 0;
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) = 0;
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) = 0;
-
-    // USM APIs
-    virtual sycl::event generate(const uniform<float, uniform_method::standard>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const uniform<double, uniform_method::standard>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const uniform<std::int32_t, uniform_method::standard>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const uniform<float, uniform_method::accurate>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const uniform<double, uniform_method::accurate>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const gaussian<float, gaussian_method::box_muller2>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const gaussian<double, gaussian_method::box_muller2>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const gaussian<float, gaussian_method::icdf>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const gaussian<double, gaussian_method::icdf>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const lognormal<float, lognormal_method::box_muller2>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const lognormal<double, lognormal_method::box_muller2>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const lognormal<float, lognormal_method::icdf>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const lognormal<double, lognormal_method::icdf>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) = 0;
-
-    virtual engine_impl* copy_state() = 0;
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) = 0;
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) = 0;
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) = 0;
-
-    virtual ~engine_impl() {}
-
-    sycl::queue& get_queue() {
-        return queue_;
-    }
-
-protected:
-    sycl::queue queue_;
-};
-
-} // namespace detail
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_ENGINE_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp b/include/oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp
deleted file mode 100644
index e13b70148..000000000
--- a/include/oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_MKLCPU_HPP_
-#define _ONEMKL_RNG_MKLCPU_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace mklcpu {
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue,
-                                                                          std::uint64_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                                     std::uint32_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(
-    sycl::queue queue, std::initializer_list<std::uint32_t> seed);
-
-} // namespace mklcpu
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_MKLCPU_HPP_
diff --git a/include/oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp b/include/oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp
deleted file mode 100644
index 4dd55f19b..000000000
--- a/include/oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_MKLGPU_HPP_
-#define _ONEMKL_RNG_MKLGPU_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace mklgpu {
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue,
-                                                                          std::uint64_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                                     std::uint32_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(
-    sycl::queue queue, std::initializer_list<std::uint32_t> seed);
-
-} // namespace mklgpu
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_MKLGPU_HPP_
diff --git a/include/oneapi/mkl/rng/detail/rng_loader.hpp b/include/oneapi/mkl/rng/detail/rng_loader.hpp
deleted file mode 100644
index dc85df5d6..000000000
--- a/include/oneapi/mkl/rng/detail/rng_loader.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_LOADER_HPP_
-#define _ONEMKL_RNG_LOADER_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace detail {
-
-ONEMKL_EXPORT engine_impl* create_philox4x32x10(oneapi::mkl::device libkey, sycl::queue queue,
-                                                std::uint64_t seed);
-
-ONEMKL_EXPORT engine_impl* create_philox4x32x10(oneapi::mkl::device libkey, sycl::queue queue,
-                                                std::initializer_list<std::uint64_t> seed);
-
-ONEMKL_EXPORT engine_impl* create_mrg32k3a(oneapi::mkl::device libkey, sycl::queue queue,
-                                           std::uint32_t seed);
-
-ONEMKL_EXPORT engine_impl* create_mrg32k3a(oneapi::mkl::device libkey, sycl::queue queue,
-                                           std::initializer_list<std::uint32_t> seed);
-
-} // namespace detail
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_LOADER_HPP_
diff --git a/include/oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp b/include/oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp
deleted file mode 100644
index 791bcc13b..000000000
--- a/include/oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) 
- * and Computing Centre (URZ)
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#ifndef _ONEMKL_RNG_ROCRAND_HPP_
-#define _ONEMKL_RNG_ROCRAND_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <cstdint>
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace rocrand {
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue,
-                                                                          std::uint64_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                                     std::uint32_t seed);
-
-ONEMKL_EXPORT oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(
-    sycl::queue queue, std::initializer_list<std::uint32_t> seed);
-
-} // namespace rocrand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_ROCRAND_HPP_
diff --git a/include/oneapi/mkl/rng/device.hpp b/include/oneapi/mkl/rng/device.hpp
deleted file mode 100644
index a628395d2..000000000
--- a/include/oneapi/mkl/rng/device.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_SYCL_DEVICE_HPP__
-#define _MKL_RNG_SYCL_DEVICE_HPP__
-
-#include "oneapi/mkl/rng/device/types.hpp"
-#include "oneapi/mkl/rng/device/functions.hpp"
-#include "oneapi/mkl/rng/device/distributions.hpp"
-#include "oneapi/mkl/rng/device/engines.hpp"
-
-#endif // _MKL_RNG_SYCL_DEVICE_HPP__
diff --git a/include/oneapi/mkl/rng/device/detail/bernoulli_impl.hpp b/include/oneapi/mkl/rng/device/detail/bernoulli_impl.hpp
deleted file mode 100644
index 83bb92f2d..000000000
--- a/include/oneapi/mkl/rng/device/detail/bernoulli_impl.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_BERNOULLI_IMPL_HPP_
-#define _MKL_RNG_DEVICE_BERNOULLI_IMPL_HPP_
-
-namespace oneapi::mkl::rng::device::detail {
-
-template <typename IntType, typename Method>
-class distribution_base<oneapi::mkl::rng::device::bernoulli<IntType, Method>> {
-public:
-    struct param_type {
-        param_type(float p) : p_(p) {}
-        float p_;
-    };
-
-    distribution_base(float p) : p_(p) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if ((p > 1.0f) || (p < 0.0f)) {
-            throw oneapi::mkl::invalid_argument("rng", "bernoulli", "p < 0 || p > 1");
-        }
-#endif
-    }
-
-    float p() const {
-        return p_;
-    }
-
-    param_type param() const {
-        return param_type(p_);
-    }
-
-    void param(const param_type& pt) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if ((pt.p_ > 1.0f) || (pt.p_ < 0.0f)) {
-            throw oneapi::mkl::invalid_argument("rng", "bernoulli", "p < 0 || p > 1");
-        }
-#endif
-        p_ = pt.p_;
-    }
-
-protected:
-    template <typename EngineType>
-    auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, IntType,
-                                  sycl::vec<IntType, EngineType::vec_size>>::type {
-        auto uni_res = engine.generate(0.0f, 1.0f);
-        if constexpr (EngineType::vec_size == 1) {
-            return IntType{ uni_res < p_ };
-        }
-        else {
-            sycl::vec<IntType, EngineType::vec_size> vec_out(IntType{ 0 });
-            for (int i = 0; i < EngineType::vec_size; ++i) {
-                if (uni_res[i] < p_) {
-                    vec_out[i] = IntType{ 1 };
-                }
-            }
-            return vec_out;
-        }
-    }
-
-    template <typename EngineType>
-    IntType generate_single(EngineType& engine) {
-        auto uni_res = engine.generate_single(0.0f, 1.0f);
-        return IntType{ uni_res < p_ };
-    }
-
-    float p_;
-};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_BERNOULLI_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/bits_impl.hpp b/include/oneapi/mkl/rng/device/detail/bits_impl.hpp
deleted file mode 100644
index aa68956d6..000000000
--- a/include/oneapi/mkl/rng/device/detail/bits_impl.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_BITS_IMPL_HPP_
-#define _MKL_RNG_DEVICE_BITS_IMPL_HPP_
-
-#include "engine_base.hpp"
-
-namespace oneapi::mkl::rng::device::detail {
-
-template <typename UIntType>
-class distribution_base<oneapi::mkl::rng::device::bits<UIntType>> {
-protected:
-    template <typename EngineType>
-    auto generate(EngineType& engine) -> typename std::enable_if<
-        !std::is_same<EngineType, mcg59<EngineType::vec_size>>::value,
-        typename std::conditional<EngineType::vec_size == 1, UIntType,
-                                  sycl::vec<UIntType, EngineType::vec_size>>::type>::type {
-        static_assert(std::is_same<UIntType, uint32_t>::value,
-                      "oneMKL: bits works only with std::uint32_t");
-        return engine.generate();
-    }
-
-    template <typename EngineType>
-    auto generate(EngineType& engine) -> typename std::enable_if<
-        std::is_same<EngineType, mcg59<EngineType::vec_size>>::value,
-        typename std::conditional<EngineType::vec_size == 1, UIntType,
-                                  sycl::vec<UIntType, EngineType::vec_size>>::type>::type {
-        static_assert(std::is_same<UIntType, uint64_t>::value,
-                      "oneMKL: bits for mcg59 works only with std::uint64_t");
-        return engine.generate_bits();
-    }
-
-    template <typename EngineType>
-    typename std::enable_if<!std::is_same<EngineType, mcg59<EngineType::vec_size>>::value,
-                            UIntType>::type
-    generate_single(EngineType& engine) {
-        static_assert(std::is_same<UIntType, uint32_t>::value,
-                      "oneMKL: bits works only with std::uint32_t");
-        return engine.generate_single();
-    }
-
-    template <typename EngineType>
-    typename std::enable_if<std::is_same<EngineType, mcg59<EngineType::vec_size>>::value,
-                            UIntType>::type
-    generate_single(EngineType& engine) {
-        static_assert(std::is_same<UIntType, uint64_t>::value,
-                      "oneMKL: bits for mcg59 works only with std::uint64_t");
-        return engine.generate_single();
-    }
-};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_BITS_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/distribution_base.hpp b/include/oneapi/mkl/rng/device/detail/distribution_base.hpp
deleted file mode 100644
index e728a564c..000000000
--- a/include/oneapi/mkl/rng/device/detail/distribution_base.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DISTRIBUTION_BASE_HPP_
-#define _MKL_RNG_DISTRIBUTION_BASE_HPP_
-
-#include <sycl/sycl.hpp>
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/device/types.hpp"
-
-namespace oneapi::mkl::rng::device {
-
-namespace detail {
-
-template <typename DistrType>
-class distribution_base {};
-
-} // namespace detail
-
-// declarations of distribution classes
-template <typename Type = float, typename Method = uniform_method::by_default>
-class uniform;
-
-template <typename RealType = float, typename Method = gaussian_method::by_default>
-class gaussian;
-
-template <typename RealType = float, typename Method = lognormal_method::by_default>
-class lognormal;
-
-template <typename UIntType = std::uint32_t>
-class uniform_bits;
-
-template <typename UIntType = std::uint32_t>
-class bits;
-
-template <typename RealType = float, typename Method = exponential_method::by_default>
-class exponential;
-
-template <typename IntType = std::int32_t, typename Method = poisson_method::by_default>
-class poisson;
-
-template <typename IntType = std::uint32_t, typename Method = bernoulli_method::by_default>
-class bernoulli;
-
-} // namespace oneapi::mkl::rng::device
-
-#include "oneapi/mkl/rng/device/detail/uniform_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/gaussian_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/lognormal_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/bits_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/uniform_bits_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/exponential_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/poisson_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/bernoulli_impl.hpp"
-
-#endif // _MKL_RNG_DISTRIBUTION_BASE_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/engine_base.hpp b/include/oneapi/mkl/rng/device/detail/engine_base.hpp
deleted file mode 100644
index fc1aee16a..000000000
--- a/include/oneapi/mkl/rng/device/detail/engine_base.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_ENGINE_BASE_HPP_
-#define _MKL_RNG_DEVICE_ENGINE_BASE_HPP_
-
-#include <cstdint>
-
-#include <sycl/sycl.hpp>
-
-namespace oneapi::mkl::rng::device::detail {
-
-// internal structure to specify state of engine
-template <typename EngineType>
-struct engine_state {};
-
-template <typename EngineType>
-class engine_base {};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#include "oneapi/mkl/rng/device/detail/philox4x32x10_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/mrg32k3a_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/mcg31m1_impl.hpp"
-#include "oneapi/mkl/rng/device/detail/mcg59_impl.hpp"
-
-#endif // _MKL_RNG_DEVICE_ENGINE_BASE_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/exponential_impl.hpp b/include/oneapi/mkl/rng/device/detail/exponential_impl.hpp
deleted file mode 100644
index cf712f0e5..000000000
--- a/include/oneapi/mkl/rng/device/detail/exponential_impl.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_EXPONENTIAL_IMPL_HPP_
-#define _MKL_RNG_DEVICE_EXPONENTIAL_IMPL_HPP_
-
-#include "vm_wrappers.hpp"
-
-namespace oneapi::mkl::rng::device::detail {
-
-template <typename RealType, typename Method>
-class distribution_base<oneapi::mkl::rng::device::exponential<RealType, Method>> {
-public:
-    struct param_type {
-        param_type(RealType a, RealType beta) : a_(a), beta_(beta) {}
-        RealType a_;
-        RealType beta_;
-    };
-
-    distribution_base(RealType a, RealType beta) : a_(a), beta_(beta) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (beta <= static_cast<RealType>(0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "exponential", "beta <= 0");
-        }
-#endif
-    }
-
-    RealType a() const {
-        return a_;
-    }
-
-    RealType beta() const {
-        return beta_;
-    }
-
-    param_type param() const {
-        return param_type(a_, beta_);
-    }
-
-    void param(const param_type& pt) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (pt.beta_ <= static_cast<RealType>(0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "exponential", "beta <= 0");
-        }
-#endif
-        a_ = pt.a_;
-        beta_ = pt.beta_;
-    }
-
-protected:
-    template <typename EngineType>
-    auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, RealType,
-                                  sycl::vec<RealType, EngineType::vec_size>>::type {
-        using OutType = typename std::conditional<EngineType::vec_size == 1, RealType,
-                                                  sycl::vec<RealType, EngineType::vec_size>>::type;
-
-        OutType res = engine.generate(RealType(0), RealType(1));
-        if constexpr (EngineType::vec_size == 1) {
-            res = ln_wrapper(res);
-        }
-        else {
-            for (int i = 0; i < EngineType::vec_size; ++i) {
-                res[i] = ln_wrapper(res[i]);
-            }
-        }
-        res = a_ - res * beta_;
-        if constexpr (std::is_same<Method, exponential_method::icdf_accurate>::value) {
-            res = sycl::fmax(res, OutType{ a_ });
-        }
-        return res;
-    }
-
-    template <typename EngineType>
-    RealType generate_single(EngineType& engine) {
-        RealType res = engine.generate_single(RealType(0), RealType(1));
-        res = ln_wrapper(res);
-        res = a_ - res * beta_;
-        if constexpr (std::is_same<Method, exponential_method::icdf_accurate>::value) {
-            res = sycl::fmax(res, a_);
-        }
-        return res;
-    }
-
-    RealType a_;
-    RealType beta_;
-
-    friend class distribution_base<
-        oneapi::mkl::rng::device::poisson<std::int32_t, poisson_method::devroye>>;
-    friend class distribution_base<
-        oneapi::mkl::rng::device::poisson<std::uint32_t, poisson_method::devroye>>;
-};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_EXPONENTIAL_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/gaussian_impl.hpp b/include/oneapi/mkl/rng/device/detail/gaussian_impl.hpp
deleted file mode 100644
index 4588aea97..000000000
--- a/include/oneapi/mkl/rng/device/detail/gaussian_impl.hpp
+++ /dev/null
@@ -1,270 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_GAUSSIAN_IMPL_HPP_
-#define _MKL_RNG_DEVICE_GAUSSIAN_IMPL_HPP_
-
-#include "vm_wrappers.hpp"
-
-namespace oneapi::mkl::rng::device::detail {
-
-// sqrt(2)
-template <typename RealType = float>
-constexpr inline RealType sqrt2() {
-    return 0x1.6A09E6P+0f; // 1.414213562
-}
-
-template <>
-constexpr inline double sqrt2<double>() {
-    return 0x1.6A09E667F3BCDP+0; // 1.414213562
-}
-
-template <typename RealType>
-class distribution_base<
-    oneapi::mkl::rng::device::gaussian<RealType, gaussian_method::box_muller2>> {
-public:
-    struct param_type {
-        param_type(RealType mean, RealType stddev) : mean_(mean), stddev_(stddev) {}
-        RealType mean_;
-        RealType stddev_;
-    };
-
-    distribution_base(RealType mean, RealType stddev) : mean_(mean), stddev_(stddev) {
-        flag_ = false;
-#ifndef __SYCL_DEVICE_ONLY__
-        if (stddev <= RealType(0)) {
-            throw oneapi::mkl::invalid_argument("rng", "gaussian", "stddev <= 0");
-        }
-#endif
-    }
-
-    RealType mean() const {
-        return mean_;
-    }
-
-    RealType stddev() const {
-        return stddev_;
-    }
-
-    param_type param() const {
-        return param_type(mean_, stddev_);
-    }
-
-    void param(const param_type& pt) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (pt.stddev_ <= RealType(0)) {
-            throw oneapi::mkl::invalid_argument("rng", "gaussian", "stddev <= 0");
-        }
-#endif
-        mean_ = pt.mean_;
-        stddev_ = pt.stddev_;
-    }
-
-protected:
-    template <typename EngineType>
-    __attribute__((always_inline)) inline auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, RealType,
-                                  sycl::vec<RealType, EngineType::vec_size>>::type {
-        RealType u1, u2, u1_transformed;
-
-        if constexpr (EngineType::vec_size == 1) {
-            RealType res;
-            if (!flag_) {
-                u1 = engine.generate(RealType(0), RealType(1));
-                u2 = engine.generate(RealType(0), RealType(1));
-                u1_transformed = ln_wrapper(u1);
-                u1_transformed = sqrt_wrapper(static_cast<RealType>(-2.0) * u1_transformed);
-                res = u1_transformed * sinpi_wrapper(RealType(2) * u2) * stddev_ + mean_;
-                u1_transformed_ = u1_transformed;
-                u2_ = u2;
-                flag_ = true;
-                return res;
-            }
-            res = u1_transformed_ * cospi_wrapper(RealType(2) * u2_) * stddev_ + mean_;
-            flag_ = false;
-            return res;
-        }
-        else {
-            RealType sin, cos;
-            sycl::vec<RealType, EngineType::vec_size> res;
-            if (!flag_) {
-                constexpr std::int32_t tail = EngineType::vec_size % 2;
-                auto uniform_res = engine.generate(RealType(0), RealType(1));
-#pragma unroll
-                for (std::int32_t i = 0; i < EngineType::vec_size - tail; i += 2) {
-                    u1 = uniform_res[i];
-                    u2 = uniform_res[i + 1];
-                    u1_transformed = ln_wrapper(u1);
-                    u1_transformed = sqrt_wrapper(static_cast<RealType>(-2.0) * u1_transformed);
-                    sin = sincospi_wrapper(RealType(2.0) * u2, cos);
-                    res[i] = (u1_transformed * sin) * stddev_ + mean_;
-                    res[i + 1] = (u1_transformed * cos) * stddev_ + mean_;
-                }
-                if constexpr (tail) {
-                    u1 = uniform_res[EngineType::vec_size - 1];
-                    u2 = engine.generate_single(RealType(0), RealType(1));
-                    u1_transformed = ln_wrapper(u1);
-                    u1_transformed = sqrt_wrapper(static_cast<RealType>(-2.0) * u1_transformed);
-                    res[EngineType::vec_size - 1] =
-                        u1_transformed * sinpi_wrapper(RealType(2) * u2) * stddev_ + mean_;
-                    u1_transformed_ = u1_transformed;
-                    u2_ = u2;
-                    flag_ = true;
-                }
-                return res;
-            }
-
-            res[0] = u1_transformed_ * cospi_wrapper(RealType(2) * u2_) * stddev_ + mean_;
-            flag_ = false;
-            constexpr std::int32_t tail = (EngineType::vec_size - 1) % 2;
-#pragma unroll
-            for (std::int32_t i = 1; i < EngineType::vec_size - tail; i += 2) {
-                u1 = engine.generate_single(RealType(0), RealType(1));
-                u2 = engine.generate_single(RealType(0), RealType(1));
-                u1_transformed = ln_wrapper(u1);
-                u1_transformed = sqrt_wrapper(static_cast<RealType>(-2.0) * u1_transformed);
-                sin = sincospi_wrapper(RealType(2.0) * u2, cos);
-                res[i] = (u1_transformed * sin) * stddev_ + mean_;
-                res[i + 1] = (u1_transformed * cos) * stddev_ + mean_;
-            }
-            if constexpr (tail) {
-                u1 = engine.generate_single(RealType(0), RealType(1));
-                u2 = engine.generate_single(RealType(0), RealType(1));
-                u1_transformed = ln_wrapper(u1);
-                u1_transformed = sqrt_wrapper(static_cast<RealType>(-2.0) * u1_transformed);
-                res[EngineType::vec_size - 1] =
-                    u1_transformed * sinpi_wrapper(RealType(2) * u2) * stddev_ + mean_;
-                u1_transformed_ = u1_transformed;
-                u2_ = u2;
-                flag_ = true;
-            }
-            return res;
-        }
-    }
-
-    template <typename EngineType>
-    __attribute__((always_inline)) inline RealType generate_single(EngineType& engine) {
-        RealType u1, u2, u1_transformed;
-        RealType res;
-        if (!flag_) {
-            u1 = engine.generate_single(RealType(0), RealType(1));
-            u2 = engine.generate_single(RealType(0), RealType(1));
-            u1_transformed = ln_wrapper(u1);
-            u1_transformed = sqrt_wrapper(static_cast<RealType>(-2.0) * u1_transformed);
-            res = u1_transformed * sinpi_wrapper(RealType(2) * u2) * stddev_ + mean_;
-            u1_transformed_ = u1_transformed;
-            u2_ = u2;
-            flag_ = true;
-            return res;
-        }
-        res = u1_transformed_ * cospi_wrapper(RealType(2) * u2_) * stddev_ + mean_;
-        flag_ = false;
-        return res;
-    }
-
-    RealType mean_;
-    RealType stddev_;
-    bool flag_ = false;
-    RealType u1_transformed_;
-    RealType u2_;
-
-    friend class distribution_base<
-        oneapi::mkl::rng::device::lognormal<RealType, lognormal_method::box_muller2>>;
-    friend class distribution_base<
-        oneapi::mkl::rng::device::poisson<std::int32_t, poisson_method::devroye>>;
-    friend class distribution_base<
-        oneapi::mkl::rng::device::poisson<std::uint32_t, poisson_method::devroye>>;
-};
-
-#if MKL_RNG_USE_BINARY_CODE
-
-template <typename RealType>
-class distribution_base<oneapi::mkl::rng::device::gaussian<RealType, gaussian_method::icdf>> {
-public:
-    struct param_type {
-        param_type(RealType mean, RealType stddev) : mean_(mean), stddev_(stddev) {}
-        RealType mean_;
-        RealType stddev_;
-    };
-
-    distribution_base(RealType mean, RealType stddev) : mean_(mean), stddev_(stddev) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (stddev <= RealType(0)) {
-            throw oneapi::mkl::invalid_argument("rng", "gaussian", "stddev <= 0");
-        }
-#endif
-    }
-
-    RealType mean() const {
-        return mean_;
-    }
-
-    RealType stddev() const {
-        return stddev_;
-    }
-
-    param_type param() const {
-        return param_type(mean_, stddev_);
-    }
-
-    void param(const param_type& pt) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (pt.stddev_ <= RealType(0)) {
-            throw oneapi::mkl::invalid_argument("rng", "gaussian", "stddev <= 0");
-        }
-#endif
-        mean_ = pt.mean_;
-        stddev_ = pt.stddev_;
-    }
-
-protected:
-    template <typename EngineType>
-    __attribute__((always_inline)) inline auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, RealType,
-                                  sycl::vec<RealType, EngineType::vec_size>>::type {
-        if constexpr (EngineType::vec_size == 1) {
-            return generate_single(engine);
-        }
-        else {
-            RealType stddev = stddev_ * sqrt2<RealType>();
-            sycl::vec<RealType, EngineType::vec_size> res;
-            sycl::vec<RealType, EngineType::vec_size> u =
-                engine.generate(RealType(-1), RealType(1));
-            for (std::int32_t i = 0; i < EngineType::vec_size; i++) {
-                res[i] = erf_inv_wrapper(u[i]);
-            }
-            return res * stddev + mean_;
-        }
-    }
-
-    template <typename EngineType>
-    __attribute__((always_inline)) inline RealType generate_single(EngineType& engine) {
-        RealType stddev = stddev_ * sqrt2<RealType>();
-        RealType u = engine.generate_single(RealType(-1), RealType(1));
-        return sycl::fma(erf_inv_wrapper(u), stddev, mean_);
-    }
-
-    RealType mean_;
-    RealType stddev_;
-};
-#endif
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_GAUSSIAN_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/lognormal_impl.hpp b/include/oneapi/mkl/rng/device/detail/lognormal_impl.hpp
deleted file mode 100644
index 85e8b6d57..000000000
--- a/include/oneapi/mkl/rng/device/detail/lognormal_impl.hpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_LOGNORMAL_IMPL_HPP_
-#define _MKL_RNG_DEVICE_LOGNORMAL_IMPL_HPP_
-
-namespace oneapi::mkl::rng::device::detail {
-
-template <typename RealType, typename Method>
-class distribution_base<oneapi::mkl::rng::device::lognormal<RealType, Method>> {
-public:
-    struct param_type {
-        param_type(RealType m, RealType s, RealType displ, RealType scale)
-                : m_(m),
-                  s_(s),
-                  displ_(displ),
-                  scale_(scale) {}
-        RealType m_;
-        RealType s_;
-        RealType displ_;
-        RealType scale_;
-    };
-
-    distribution_base(RealType m, RealType s, RealType displ, RealType scale)
-            : gaussian_(m, s),
-              displ_(displ),
-              scale_(scale) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (scale <= static_cast<RealType>(0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "lognormal", "scale <= 0");
-        }
-#endif
-    }
-
-    RealType m() const {
-        return gaussian_.mean();
-    }
-
-    RealType s() const {
-        return gaussian_.stddev();
-    }
-
-    RealType displ() const {
-        return displ_;
-    }
-
-    RealType scale() const {
-        return scale_;
-    }
-
-    param_type param() const {
-        return param_type(gaussian_.mean(), gaussian_.stddev(), displ_, scale_);
-    }
-
-    void param(const param_type& pt) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (pt.scale_ <= static_cast<RealType>(0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "lognormal", "scale <= 0");
-        }
-#endif
-        gaussian_.param({ pt.m_, pt.s_ });
-        displ_ = pt.displ_;
-        scale_ = pt.scale_;
-    }
-
-protected:
-    template <typename EngineType>
-    auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, RealType,
-                                  sycl::vec<RealType, EngineType::vec_size>>::type {
-        auto res = gaussian_.generate(engine);
-        return sycl::exp(res) * scale_ + displ_;
-    }
-
-    template <typename EngineType>
-    RealType generate_single(EngineType& engine) {
-        RealType res = gaussian_.generate_single(engine);
-        return sycl::exp(res) * scale_ + displ_;
-    }
-
-    distribution_base<oneapi::mkl::rng::device::gaussian<RealType, gaussian_method::box_muller2>>
-        gaussian_;
-    RealType displ_;
-    RealType scale_;
-};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_LOGNORMAL_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/mcg31m1_impl.hpp b/include/oneapi/mkl/rng/device/detail/mcg31m1_impl.hpp
deleted file mode 100644
index 8f1294ac2..000000000
--- a/include/oneapi/mkl/rng/device/detail/mcg31m1_impl.hpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_MCG31M1_IMPL_HPP_
-#define _MKL_RNG_DEVICE_MCG31M1_IMPL_HPP_
-
-namespace oneapi::mkl::rng::device {
-
-template <std::int32_t VecSize = 1>
-class mcg31m1;
-
-namespace detail {
-
-template <std::int32_t VecSize>
-constexpr sycl::vec<std::uint64_t, VecSize> select_vector_a_mcg31m1() {
-    if constexpr (VecSize == 1)
-        return sycl::vec<std::uint64_t, 1>(UINT64_C(1));
-    else if constexpr (VecSize == 2)
-        return sycl::vec<std::uint64_t, 2>({ UINT64_C(1), UINT64_C(1132489760) });
-    else if constexpr (VecSize == 3)
-        return sycl::vec<std::uint64_t, 3>(
-            { UINT64_C(1), UINT64_C(1132489760), UINT64_C(826537482) });
-    else if constexpr (VecSize == 4)
-        return sycl::vec<std::uint64_t, 4>(
-            { UINT64_C(1), UINT64_C(1132489760), UINT64_C(826537482), UINT64_C(289798557) });
-    else if constexpr (VecSize == 8)
-        return sycl::vec<std::uint64_t, 8>({ UINT64_C(1), UINT64_C(1132489760), UINT64_C(826537482),
-                                             UINT64_C(289798557), UINT64_C(480863449),
-                                             UINT64_C(1381340036), UINT64_C(1582925527),
-                                             UINT64_C(1918178478) });
-    else
-        return sycl::vec<std::uint64_t, 16>(
-            { UINT64_C(1), UINT64_C(1132489760), UINT64_C(826537482), UINT64_C(289798557),
-              UINT64_C(480863449), UINT64_C(1381340036), UINT64_C(1582925527), UINT64_C(1918178478),
-              UINT64_C(1286028348), UINT64_C(482167044), UINT64_C(262060616), UINT64_C(1856662125),
-              UINT64_C(839877947), UINT64_C(1997268203), UINT64_C(458714024),
-              UINT64_C(650347998) });
-}
-
-// hipSYCL (AdaptiveCpp) doesn't support constexpr sycl::vec constructor
-// that's why in case of hipSYCL backend sycl::vec is created as a local variable
-#ifndef __HIPSYCL__
-template <std::int32_t VecSize>
-struct mcg31m1_vector_a {
-    static constexpr sycl::vec<std::uint64_t, VecSize> vector_a =
-        select_vector_a_mcg31m1<VecSize>(); // powers of a
-};
-#endif
-
-struct mcg31m1_param {
-    static constexpr std::uint32_t a = 1132489760;
-    static constexpr std::uint64_t m_64 = 0x000000007FFFFFFF; // 2^31 - 1
-    static constexpr double m_fl = 2147483647.0; // 2^31 - 1
-    static constexpr std::uint64_t bits = 31;
-};
-
-template <std::int32_t VecSize>
-struct engine_state<oneapi::mkl::rng::device::mcg31m1<VecSize>> {
-    std::uint32_t s;
-};
-
-namespace mcg31m1_impl {
-
-// Improved modulus x % (2^31 - 1) operation (possible to do for divisor (2^N
-// -1), but MCG31M1 needs only 2^31 - 1) if we want to do x % (2^N -1) we can
-// find out that: x = A + B * 2^N, where A = x % 2^N = x & 00..01..11 (binary)
-// where quantity of 1 is N, B = x / 2^N = x >> N also x = A + B * (2^N - 1 + 1)
-// = (A + B) + B * (2^N - 1), but (A + B) may be greater than (2^N - 1), that's
-// why we put x1 = A + B = A' + B' * 2^N = ... until new (A + B) < (2^N - 1) for
-// MCG31m1 N = 31
-template <typename T>
-static inline T custom_mod(std::uint64_t x) {
-    std::uint64_t b = x >> mcg31m1_param::bits;
-    std::uint64_t a = x & mcg31m1_param::m_64;
-    x = a + b;
-    b = x >> mcg31m1_param::bits;
-    a = x & mcg31m1_param::m_64;
-    return static_cast<T>(a + b);
-}
-
-template <std::int32_t VecSize>
-static inline sycl::vec<std::uint32_t, VecSize> custom_mod(
-    const sycl::vec<std::uint64_t, VecSize>& x) {
-    sycl::vec<std::uint64_t, VecSize> b = x >> mcg31m1_param::bits;
-    sycl::vec<std::uint64_t, VecSize> a = x & mcg31m1_param::m_64;
-    sycl::vec<std::uint64_t, VecSize> res = a + b;
-    b = res >> mcg31m1_param::bits;
-    a = res & mcg31m1_param::m_64;
-    res = a + b;
-    return res.template convert<std::uint32_t>();
-}
-
-static inline std::uint64_t power(std::uint64_t a, std::uint64_t n) {
-    std::uint64_t a2;
-    // initialize result by 1 for recurrence
-    std::uint32_t result = 1;
-
-    if (n == 0) {
-        // return (a^0)%m = 1
-        return std::uint64_t{ 1 };
-    }
-
-    // Recurrence loop
-    do {
-        // For each odd n
-        if (n & 1) {
-            a2 = static_cast<std::uint64_t>(result) * a;
-            result = custom_mod<std::uint32_t>(a2);
-        }
-        // n /= 2
-        n >>= 1;
-
-        a2 = a * a;
-        a = custom_mod<std::uint64_t>(a2);
-    } while (n);
-
-    return static_cast<std::uint64_t>(result);
-}
-
-template <std::int32_t VecSize>
-static inline void skip_ahead(engine_state<oneapi::mkl::rng::device::mcg31m1<VecSize>>& state,
-                              std::uint64_t num_to_skip) {
-    std::uint64_t loc_A = power(static_cast<std::uint64_t>(mcg31m1_param::a), num_to_skip);
-    state.s = custom_mod<std::uint32_t>(loc_A * static_cast<std::uint64_t>(state.s));
-}
-
-template <std::int32_t VecSize>
-static inline void init(engine_state<oneapi::mkl::rng::device::mcg31m1<VecSize>>& state,
-                        std::uint32_t seed, std::uint64_t offset) {
-    state.s = custom_mod<std::uint32_t>(seed);
-    if (state.s == 0)
-        state.s = 1;
-    skip_ahead(state, offset);
-}
-
-template <std::int32_t VecSize>
-static inline sycl::vec<std::uint32_t, VecSize> generate(
-    engine_state<oneapi::mkl::rng::device::mcg31m1<VecSize>>& state) {
-    sycl::vec<std::uint64_t, VecSize> x(state.s);
-    sycl::vec<std::uint32_t, VecSize> res;
-#ifndef __HIPSYCL__
-    res = custom_mod(mcg31m1_vector_a<VecSize>::vector_a * x);
-#else
-    // a workaround for hipSYCL (AdaptiveCpp)
-    res = custom_mod(select_vector_a_mcg31m1<VecSize>() * x);
-#endif
-    state.s =
-        custom_mod<std::uint32_t>(mcg31m1_param::a * static_cast<std::uint64_t>(res[VecSize - 1]));
-    return res;
-}
-
-template <std::int32_t VecSize>
-static inline std::uint32_t generate_single(
-    engine_state<oneapi::mkl::rng::device::mcg31m1<VecSize>>& state) {
-    std::uint32_t x = state.s;
-    state.s = custom_mod<std::uint32_t>(mcg31m1_param::a * static_cast<std::uint64_t>(state.s));
-    return x;
-}
-
-} // namespace mcg31m1_impl
-
-template <std::int32_t VecSize>
-class engine_base<oneapi::mkl::rng::device::mcg31m1<VecSize>> {
-protected:
-    engine_base(std::uint32_t seed, std::uint64_t offset = 0) {
-        mcg31m1_impl::init(this->state_, seed, offset);
-    }
-
-    template <typename RealType>
-    auto generate(RealType a, RealType b) ->
-        typename std::conditional<VecSize == 1, RealType, sycl::vec<RealType, VecSize>>::type {
-        sycl::vec<RealType, VecSize> res;
-        sycl::vec<std::uint32_t, VecSize> res_uint;
-
-        RealType c = (b - a) / static_cast<RealType>(mcg31m1_param::m_fl);
-
-        res_uint = mcg31m1_impl::generate(this->state_);
-
-        res = res_uint.template convert<RealType>() * c + a;
-
-        return res;
-    }
-
-    auto generate() -> typename std::conditional<VecSize == 1, std::uint32_t,
-                                                 sycl::vec<std::uint32_t, VecSize>>::type {
-        return mcg31m1_impl::generate(this->state_);
-    }
-
-    template <typename RealType>
-    RealType generate_single(RealType a, RealType b) {
-        RealType res;
-        std::uint32_t res_uint;
-
-        RealType c = (b - a) / static_cast<RealType>(mcg31m1_param::m_fl);
-
-        res_uint = mcg31m1_impl::generate_single(this->state_);
-
-        res = static_cast<RealType>(res_uint) * c + a;
-        return res;
-    }
-
-    std::uint32_t generate_single() {
-        return mcg31m1_impl::generate_single(this->state_);
-    }
-
-    void skip_ahead(std::uint64_t num_to_skip) {
-        detail::mcg31m1_impl::skip_ahead(this->state_, num_to_skip);
-    }
-
-    engine_state<oneapi::mkl::rng::device::mcg31m1<VecSize>> state_;
-};
-
-} // namespace detail
-
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_MCG31M1_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/mcg59_impl.hpp b/include/oneapi/mkl/rng/device/detail/mcg59_impl.hpp
deleted file mode 100644
index bc21eb607..000000000
--- a/include/oneapi/mkl/rng/device/detail/mcg59_impl.hpp
+++ /dev/null
@@ -1,275 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_MCG59_IMPL_HPP_
-#define _MKL_RNG_DEVICE_MCG59_IMPL_HPP_
-
-namespace oneapi::mkl::rng::device {
-
-template <std::int32_t VecSize = 1>
-class mcg59;
-
-namespace detail {
-
-template <std::int32_t VecSize>
-constexpr sycl::vec<uint64_t, VecSize> select_vector_a_mcg59() {
-    if constexpr (VecSize == 1)
-        return sycl::vec<uint64_t, 1>(UINT64_C(1));
-    else if constexpr (VecSize == 2)
-        return sycl::vec<uint64_t, 2>({ UINT64_C(1), UINT64_C(0x113769B23C5FD) });
-    else if constexpr (VecSize == 3)
-        return sycl::vec<uint64_t, 3>(
-            { UINT64_C(1), UINT64_C(0x113769B23C5FD), UINT64_C(0x65C69FC1A4D5C09) });
-    else if constexpr (VecSize == 4)
-        return sycl::vec<uint64_t, 4>({ UINT64_C(1), UINT64_C(0x113769B23C5FD),
-                                        UINT64_C(0x65C69FC1A4D5C09), UINT64_C(0x1CE44D68E81E1E5) });
-    else if constexpr (VecSize == 8)
-        return sycl::vec<uint64_t, 8>({ UINT64_C(1), UINT64_C(0x113769B23C5FD),
-                                        UINT64_C(0x65C69FC1A4D5C09), UINT64_C(0x1CE44D68E81E1E5),
-                                        UINT64_C(0x2F861CA52807851), UINT64_C(0x1CCDF2FE3A03D0D),
-                                        UINT64_C(0x707AB5B7C1E56D9), UINT64_C(0x6139AE457BD175) });
-    else
-        return sycl::vec<uint64_t, 16>(
-            { UINT64_C(1), UINT64_C(0x113769B23C5FD), UINT64_C(0x65C69FC1A4D5C09),
-              UINT64_C(0x1CE44D68E81E1E5), UINT64_C(0x2F861CA52807851), UINT64_C(0x1CCDF2FE3A03D0D),
-              UINT64_C(0x707AB5B7C1E56D9), UINT64_C(0x6139AE457BD175), UINT64_C(0x171CF606D8C09A1),
-              UINT64_C(0x3764DC8D2D1691D), UINT64_C(0x50A1576CCF32A9), UINT64_C(0x499F3083ADC1E05),
-              UINT64_C(0x7A30C00B05283F1), UINT64_C(0x4FE299EB607DA2D), UINT64_C(0x51CCFD803CE3F79),
-              UINT64_C(0x58145D06A37D795) });
-}
-
-// hipSYCL (AdaptiveCpp) doesn't support constexpr sycl::vec constructor
-// that's why in case of hipSYCL backend sycl::vec is created as a local variable
-#ifndef __HIPSYCL__
-template <std::int32_t VecSize>
-struct mcg59_vector_a {
-    static constexpr sycl::vec<std::uint64_t, VecSize> vector_a =
-        select_vector_a_mcg59<VecSize>(); // powers of a
-};
-#endif
-
-struct mcg59_param {
-    static constexpr uint64_t a = 0x113769B23C5FD; // 13^13
-    static constexpr uint64_t m_64 = 0x7FFFFFFFFFFFFFF; // 2^59 - 1
-    static constexpr float m_fl = 576460752303423488.0f; // 2^59
-};
-
-template <std::int32_t VecSize>
-struct engine_state<oneapi::mkl::rng::device::mcg59<VecSize>> {
-    std::uint64_t s;
-};
-
-namespace mcg59_impl {
-
-template <typename T>
-static inline T custom_mod(T x) {
-    return (x & mcg59_param::m_64);
-}
-
-static inline std::uint64_t power(std::uint64_t a, std::uint64_t n) {
-    // initialize result by 1 for recurrency
-    std::uint64_t result = 1;
-    if (n == 0) {
-        // return (a^0)%m = 1
-        return 1;
-    }
-    do {
-        // For each odd n
-        if (n & 1) {
-            result = custom_mod(result * a);
-        }
-        // n := n/2
-        n >>= 1;
-        a = custom_mod(a * a);
-    } while (n);
-
-    return result;
-}
-
-template <std::int32_t VecSize>
-static inline void skip_ahead(engine_state<oneapi::mkl::rng::device::mcg59<VecSize>>& state,
-                              std::uint64_t num_to_skip) {
-    std::uint64_t loc_A = power(mcg59_param::a, num_to_skip);
-    state.s = custom_mod(loc_A * state.s);
-}
-
-template <std::int32_t VecSize>
-static inline void init(engine_state<oneapi::mkl::rng::device::mcg59<VecSize>>& state,
-                        std::uint64_t seed, std::uint64_t offset) {
-    state.s = seed & mcg59_param::m_64;
-    if (state.s == 0)
-        state.s = 1;
-
-    skip_ahead(state, offset);
-}
-
-template <std::int32_t VecSize>
-static inline sycl::vec<std::uint64_t, VecSize> generate(
-    engine_state<oneapi::mkl::rng::device::mcg59<VecSize>>& state) {
-    sycl::vec<std::uint64_t, VecSize> res(state.s);
-#ifndef __HIPSYCL__
-    res = custom_mod(mcg59_vector_a<VecSize>::vector_a * res);
-#else
-    // a workaround for hipSYCL (AdaptiveCpp)
-    res = custom_mod(select_vector_a_mcg59<VecSize>() * res);
-#endif
-    state.s = custom_mod(mcg59_param::a * res[VecSize - 1]);
-    return res;
-}
-
-template <std::int32_t VecSize>
-static inline std::uint64_t generate_single(
-    engine_state<oneapi::mkl::rng::device::mcg59<VecSize>>& state) {
-    std::uint64_t x = state.s;
-    state.s = custom_mod(mcg59_param::a * x);
-    return x;
-}
-
-} // namespace mcg59_impl
-
-template <std::int32_t VecSize>
-class engine_base<oneapi::mkl::rng::device::mcg59<VecSize>> {
-protected:
-    engine_base(std::uint64_t seed, std::uint64_t offset = 0) {
-        mcg59_impl::init(this->state_, seed, offset);
-    }
-
-    template <typename RealType>
-    auto generate(RealType a, RealType b) ->
-        typename std::conditional<VecSize == 1, RealType, sycl::vec<RealType, VecSize>>::type {
-        sycl::vec<RealType, VecSize> res;
-
-        RealType c = (b - a) / static_cast<RealType>(mcg59_param::m_fl);
-        sycl::vec<std::uint64_t, VecSize> res_uint = mcg59_impl::generate(this->state_);
-
-        res = res_uint.template convert<RealType>() * c + a;
-
-        return res;
-    }
-
-    auto generate() -> typename std::conditional<VecSize == 1, std::uint32_t,
-                                                 sycl::vec<std::uint32_t, VecSize>>::type {
-        return mcg59_impl::generate(this->state_);
-    }
-
-    auto generate_bits() -> typename std::conditional<VecSize == 1, std::uint64_t,
-                                                      sycl::vec<std::uint64_t, VecSize>>::type {
-        return mcg59_impl::generate(this->state_);
-    }
-
-    template <typename UIntType>
-    auto generate_uniform_bits() ->
-        typename std::conditional<VecSize == 1, UIntType, sycl::vec<UIntType, VecSize>>::type {
-        if constexpr (std::is_same<UIntType, std::uint32_t>::value) {
-            auto uni_res = mcg59_impl::generate(this->state_);
-
-            if constexpr (VecSize == 1) {
-                return static_cast<std::uint32_t>(uni_res[0] >> 27);
-            }
-            else {
-                sycl::vec<std::uint32_t, VecSize> vec_out;
-
-                for (std::int32_t i = 0; i < VecSize; i++) {
-                    vec_out[i] = static_cast<std::uint32_t>(uni_res[i] >> 27);
-                }
-
-                return vec_out;
-            }
-        }
-        else {
-            auto uni_res1 = mcg59_impl::generate(this->state_);
-            auto uni_res2 = mcg59_impl::generate(this->state_);
-
-            if constexpr (VecSize == 1) {
-                uni_res1 >>= UIntType(27);
-                uni_res2 >>= UIntType(27);
-
-                return (uni_res2 << UIntType(32)) + uni_res1;
-            }
-            else {
-                sycl::vec<std::uint64_t, VecSize> vec_out;
-
-                for (int i = 0; i < VecSize; i++) {
-                    uni_res1[i] >>= 27;
-                    uni_res2[i] >>= 27;
-                }
-
-                if constexpr (VecSize != 3) {
-                    for (int i = 0; i < VecSize / 2; i++) {
-                        vec_out[i] = (uni_res1[2 * i + 1] << 32) + uni_res1[2 * i];
-                        vec_out[i + VecSize / 2] = (uni_res2[2 * i + 1] << 32) + uni_res2[2 * i];
-                    }
-                }
-                else {
-                    vec_out[0] = (uni_res1[1] << 32) + uni_res1[0];
-                    vec_out[1] = (uni_res2[0] << 32) + uni_res1[2];
-                    vec_out[2] = (uni_res2[2] << 32) + uni_res2[1];
-                }
-
-                return vec_out;
-            }
-        }
-    }
-
-    template <typename RealType>
-    RealType generate_single(RealType a, RealType b) {
-        RealType res;
-        std::uint64_t res_uint;
-
-        RealType c = (b - a) / static_cast<RealType>(mcg59_param::m_fl);
-
-        res_uint = mcg59_impl::generate_single(this->state_);
-        res = static_cast<RealType>(res_uint) * c + a;
-
-        return res;
-    }
-
-    auto generate_single() {
-        return mcg59_impl::generate_single(this->state_);
-    }
-
-    template <typename UIntType>
-    auto generate_single_uniform_bits() {
-        if constexpr (std::is_same<UIntType, std::uint32_t>::value) {
-            auto uni_res = mcg59_impl::generate_single(this->state_) >> 27;
-
-            return static_cast<std::uint32_t>(uni_res);
-        }
-        else {
-            auto uni_res1 = mcg59_impl::generate_single(this->state_);
-            auto uni_res2 = mcg59_impl::generate_single(this->state_);
-
-            uni_res1 >>= 27;
-            uni_res2 >>= 27;
-
-            return (uni_res2 << 32) + uni_res1;
-        }
-    }
-
-    void skip_ahead(std::uint64_t num_to_skip) {
-        detail::mcg59_impl::skip_ahead(this->state_, num_to_skip);
-    }
-
-    engine_state<oneapi::mkl::rng::device::mcg59<VecSize>> state_;
-};
-
-} // namespace detail
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_MCG59_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/mrg32k3a_impl.hpp b/include/oneapi/mkl/rng/device/detail/mrg32k3a_impl.hpp
deleted file mode 100644
index 596e625ad..000000000
--- a/include/oneapi/mkl/rng/device/detail/mrg32k3a_impl.hpp
+++ /dev/null
@@ -1,384 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// References:
-// [1] Bradley, Thomas & du Toit, Jacques & Giles, Mike & Tong, Robert & Woodhams, Paul.
-// (2011). Parallelisation Techniques for Random Number Generators.
-// GPU Computing Gems Emerald Edition. 10.1016/B978-0-12-384988-5.00016-4
-
-#ifndef _MKL_RNG_DEVICE_MRG32K3A_IMPL_HPP_
-#define _MKL_RNG_DEVICE_MRG32K3A_IMPL_HPP_
-
-#include "oneapi/mkl/rng/device/detail/mrg32k3a_skip_ahead_matrix.hpp"
-
-namespace oneapi::mkl::rng::device {
-
-template <std::int32_t VecSize = 1>
-class mrg32k3a;
-
-namespace detail {
-
-template <std::int32_t VecSize>
-struct engine_state<oneapi::mkl::rng::device::mrg32k3a<VecSize>> {
-    std::uint32_t s[6];
-};
-
-namespace mrg32k3a_impl {
-
-struct mrg32k3a_params {
-    static constexpr std::uint32_t m1 = 4294967087;
-    static constexpr std::uint32_t m2 = 4294944443;
-    static constexpr std::uint32_t a12 = 1403580;
-    static constexpr std::uint32_t a13 = 4294156359;
-    static constexpr std::uint32_t a21 = 527612;
-    static constexpr std::uint32_t a23 = 4293573854;
-    static constexpr std::uint32_t a13n = 810728;
-    static constexpr std::uint32_t a23n = 1370589;
-};
-
-template <std::uint32_t M>
-struct two_pow_32_minus_m {};
-
-template <>
-struct two_pow_32_minus_m<mrg32k3a_params::m1> {
-    static constexpr std::int64_t val = 209;
-};
-
-template <>
-struct two_pow_32_minus_m<mrg32k3a_params::m2> {
-    static constexpr std::int64_t val = 22853;
-};
-
-template <std::int64_t M, typename T>
-static inline void bit_shift_and_mask(T& in) {
-    T mask;
-    if constexpr (std::is_same_v<T, std::uint64_t>) {
-        mask = 0x00000000ffffffffu;
-    }
-    else {
-        mask = 0x00000000ffffffff;
-    }
-    in = ((in >> 32) * two_pow_32_minus_m<M>::val + (in & mask));
-}
-
-template <std::uint32_t M>
-static inline void matr3x3_vec_mul_mod(std::uint32_t a[3][3], std::uint32_t x[3],
-                                       std::uint32_t y[3]) {
-    std::uint64_t temp[3] = { 0ull, 0ull, 0ull };
-    for (int i = 0; i < 3; ++i) {
-        for (int k = 0; k < 3; ++k) {
-            std::uint64_t tmp =
-                static_cast<std::uint64_t>(a[i][k]) * static_cast<std::uint64_t>(x[k]);
-            bit_shift_and_mask<M>(tmp);
-            bit_shift_and_mask<M>(tmp);
-            if (tmp >= M) {
-                tmp -= M;
-            }
-            temp[i] += tmp;
-        }
-        bit_shift_and_mask<M>(temp[i]);
-        if (temp[i] >= M) {
-            temp[i] -= M;
-        }
-    }
-
-    for (int k = 0; k < 3; k++) {
-        y[k] = static_cast<std::uint32_t>(temp[k]);
-    }
-
-    return;
-}
-
-template <std::uint32_t M>
-static inline void matr3x3_mul_mod(std::uint32_t B[3][3],
-                                   const std::uint32_t _skip_ahead_matrix[3][3]) {
-    std::uint64_t temp[3][3] = { { 0ull, 0ull, 0ull }, { 0ull, 0ull, 0ull }, { 0ull, 0ull, 0ull } };
-
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            for (int k = 0; k < 3; ++k) {
-                std::uint64_t tmp = static_cast<std::uint64_t>(B[i][k]) *
-                                    static_cast<std::uint64_t>(_skip_ahead_matrix[k][j]);
-                bit_shift_and_mask<M>(tmp);
-                if constexpr (mrg32k3a_params::m2 == M) {
-                    bit_shift_and_mask<M>(tmp);
-                }
-                if (tmp >= M) {
-                    tmp -= M;
-                }
-                temp[i][j] += tmp;
-            }
-            bit_shift_and_mask<M>(temp[i][j]);
-            if (temp[i][j] >= M) {
-                temp[i][j] -= M;
-            }
-        }
-    }
-
-    for (int i = 0; i < 3; ++i) {
-        for (int j = 0; j < 3; ++j) {
-            B[i][j] = static_cast<std::uint32_t>(temp[i][j]);
-        }
-    }
-}
-
-template <std::uint32_t M>
-static inline void vec3_pow_mod(
-    std::uint32_t x[3], std::uint64_t n, const std::uint64_t* skip_params,
-    const std::uint32_t _skip_ahead_matrix[quantity_of_3x3_matrices][3][3]) {
-    std::uint32_t B[3][3] = { { 1u, 0u, 0u }, { 0u, 1u, 0u }, { 0u, 0u, 1u } };
-
-    std::uint32_t off;
-    std::uint32_t mod;
-    std::uint64_t skip_param;
-    std::uint32_t bit_count = 0; // can be 0, 1, 2
-    std::uint32_t bit_count_tmp;
-
-    for (std::uint32_t j = 0; j < n; j++) {
-        skip_param = skip_params[j];
-        off = 0;
-        bit_count_tmp = bit_count;
-        while (skip_param) {
-            // we have to multiply skip_param[1] by 2 and skip_params[2] by 4 only for the 1st iteration
-            // of the loop to get the required power of a power-of-eight matrice from a power of two
-            mod = (skip_param << static_cast<std::uint64_t>(bit_count_tmp)) &
-                  7ull; // == (skip_param * _mult) % 8, _mult={1,2,4}
-            if (mod) {
-                // 7 - number of 3x3 matrices of some power of 8: 1*8^x, 2*8^x, ..., 7*8^x
-                // 7 * 21 - number of 3x3 matrices for each skip parameter
-                matr3x3_mul_mod<M>(B, _skip_ahead_matrix[7 * 21 * j + off * 7 + (mod - 1)]);
-            }
-            skip_param =
-                skip_param /
-                (8ull >> static_cast<std::uint64_t>(bit_count_tmp)); // == skip_param / (8 / _mult)
-            ++off;
-            bit_count_tmp = 0;
-        }
-        ++bit_count;
-    }
-    matr3x3_vec_mul_mod<M>(B, x, x);
-}
-
-template <std::int32_t VecSize>
-static inline void skip_ahead(engine_state<oneapi::mkl::rng::device::mrg32k3a<VecSize>>& state,
-                              std::uint64_t n, const std::uint64_t* num_to_skip_ptr) {
-    if (n > 3) {
-        n = 3;
-#ifndef __SYCL_DEVICE_ONLY__
-        throw oneapi::mkl::invalid_argument("rng", "mrg32k3a",
-                                            "period is 2 ^ 191, skip on more than 2^192");
-#endif
-    }
-    vec3_pow_mod<mrg32k3a_params::m1>(state.s, n, num_to_skip_ptr, skip_ahead_matrix[0]);
-    vec3_pow_mod<mrg32k3a_params::m2>(state.s + 3, n, num_to_skip_ptr, skip_ahead_matrix[1]);
-}
-
-template <std::int32_t VecSize>
-static inline void validate_seed(engine_state<oneapi::mkl::rng::device::mrg32k3a<VecSize>>& state) {
-    int i;
-    for (i = 0; i < 3; i++) {
-        if (state.s[i] >= mrg32k3a_params::m1) {
-            state.s[i] -= mrg32k3a_params::m1;
-        }
-    }
-    for (; i < 6; i++) {
-        if (state.s[i] >= mrg32k3a_params::m2) {
-            state.s[i] -= mrg32k3a_params::m2;
-        }
-    }
-
-    if ((state.s[0]) == 0 && (state.s[1]) == 0 && (state.s[2]) == 0) {
-        state.s[0] = 1;
-    }
-    if ((state.s[3]) == 0 && (state.s[4]) == 0 && (state.s[5]) == 0) {
-        state.s[3] = 1;
-    }
-}
-
-template <std::int32_t VecSize>
-static inline void init(engine_state<oneapi::mkl::rng::device::mrg32k3a<VecSize>>& state,
-                        std::uint64_t n, const std::uint32_t* seed_ptr, std::uint64_t n_offset,
-                        const std::uint64_t* offset_ptr) {
-    std::uint64_t i;
-    if (n > 6) {
-        n = 6;
-    }
-    for (i = 0; i < n; i++) {
-        state.s[i] = seed_ptr[i];
-    }
-    for (; i < 6; i++) {
-        state.s[i] = 1;
-    }
-    validate_seed(state);
-    mrg32k3a_impl::skip_ahead(state, n_offset, offset_ptr);
-}
-
-template <std::int32_t VecSize>
-static inline sycl::vec<std::uint32_t, VecSize> generate(
-    engine_state<oneapi::mkl::rng::device::mrg32k3a<VecSize>>& state) {
-    const std::int32_t num_elements = VecSize;
-    sycl::vec<std::uint32_t, VecSize> res;
-    std::int64_t x, y;
-    std::int32_t i = 0;
-    for (i = 0; i < num_elements; i++) {
-        x = mrg32k3a_params::a12 * static_cast<std::int64_t>(state.s[1]) -
-            mrg32k3a_params::a13n * static_cast<std::int64_t>(state.s[0]);
-        // perform modulus
-        bit_shift_and_mask<mrg32k3a_params::m1>(x);
-        if (x >= mrg32k3a_params::m1)
-            x -= mrg32k3a_params::m1;
-        x += ((x & 0x8000000000000000) >> 63) * mrg32k3a_params::m1;
-        y = mrg32k3a_params::a21 * static_cast<std::int64_t>(state.s[5]) -
-            mrg32k3a_params::a23n * static_cast<std::int64_t>(state.s[3]);
-        // perform modulus
-        bit_shift_and_mask<mrg32k3a_params::m2>(y);
-        bit_shift_and_mask<mrg32k3a_params::m2>(y);
-        if (y >= mrg32k3a_params::m2)
-            y -= mrg32k3a_params::m2;
-        y += ((y & 0x8000000000000000) >> 63) * mrg32k3a_params::m2;
-        state.s[0] = state.s[1];
-        state.s[1] = state.s[2];
-        state.s[2] = x;
-        state.s[3] = state.s[4];
-        state.s[4] = state.s[5];
-        state.s[5] = y;
-        if (x <= y) {
-            res[i] = x + (mrg32k3a_params::m1 - y);
-        }
-        else {
-            res[i] = x - y;
-        }
-    }
-    return res;
-}
-
-template <std::int32_t VecSize>
-static inline std::uint32_t generate_single(
-    engine_state<oneapi::mkl::rng::device::mrg32k3a<VecSize>>& state) {
-    std::uint32_t res;
-    std::int64_t x, y;
-    x = mrg32k3a_params::a12 * static_cast<std::int64_t>(state.s[1]) -
-        mrg32k3a_params::a13n * static_cast<std::int64_t>(state.s[0]);
-    // perform modulus
-    bit_shift_and_mask<mrg32k3a_params::m1>(x);
-    if (x >= mrg32k3a_params::m1)
-        x -= mrg32k3a_params::m1;
-    x += ((x & 0x8000000000000000) >> 63) * mrg32k3a_params::m1;
-    y = mrg32k3a_params::a21 * static_cast<std::int64_t>(state.s[5]) -
-        mrg32k3a_params::a23n * static_cast<std::int64_t>(state.s[3]);
-    // perform modulus
-    bit_shift_and_mask<mrg32k3a_params::m2>(y);
-    bit_shift_and_mask<mrg32k3a_params::m2>(y);
-    if (y >= mrg32k3a_params::m2)
-        y -= mrg32k3a_params::m2;
-    y += ((y & 0x8000000000000000) >> 63) * mrg32k3a_params::m2;
-    state.s[0] = state.s[1];
-    state.s[1] = state.s[2];
-    state.s[2] = x;
-    state.s[3] = state.s[4];
-    state.s[4] = state.s[5];
-    state.s[5] = y;
-    if (x <= y) {
-        res = x + (mrg32k3a_params::m1 - y);
-    }
-    else {
-        res = x - y;
-    }
-
-    return res;
-}
-
-} // namespace mrg32k3a_impl
-
-template <std::int32_t VecSize>
-class engine_base<oneapi::mkl::rng::device::mrg32k3a<VecSize>> {
-protected:
-    engine_base(std::uint32_t seed, std::uint64_t offset = 0) {
-        mrg32k3a_impl::init(this->state_, 1, &seed, 1, &offset);
-    }
-
-    engine_base(std::uint64_t n, const std::uint32_t* seed, std::uint64_t offset = 0) {
-        mrg32k3a_impl::init(this->state_, n, seed, 1, &offset);
-    }
-
-    engine_base(std::uint32_t seed, std::uint64_t n_offset, const std::uint64_t* offset_ptr) {
-        mrg32k3a_impl::init(this->state_, 1, &seed, n_offset, offset_ptr);
-    }
-
-    engine_base(std::uint64_t n, const std::uint32_t* seed, std::uint64_t n_offset,
-                const std::uint64_t* offset_ptr) {
-        mrg32k3a_impl::init(this->state_, n, seed, n_offset, offset_ptr);
-    }
-
-    template <typename RealType>
-    auto generate(RealType a, RealType b) ->
-        typename std::conditional<VecSize == 1, RealType, sycl::vec<RealType, VecSize>>::type {
-        sycl::vec<RealType, VecSize> res;
-        sycl::vec<std::uint32_t, VecSize> res_uint;
-        RealType c;
-
-        c = (b - a) / (static_cast<RealType>(mrg32k3a_impl::mrg32k3a_params::m1));
-
-        res_uint = mrg32k3a_impl::generate(this->state_);
-
-        for (int i = 0; i < VecSize; i++) {
-            res[i] = (RealType)(res_uint[i]) * c + a;
-        }
-        return res;
-    }
-
-    auto generate() -> typename std::conditional<VecSize == 1, std::uint32_t,
-                                                 sycl::vec<std::uint32_t, VecSize>>::type {
-        return mrg32k3a_impl::generate(this->state_);
-    }
-
-    template <typename RealType>
-    RealType generate_single(RealType a, RealType b) {
-        RealType res;
-        std::uint32_t res_uint;
-        RealType c;
-
-        c = (b - a) / (static_cast<RealType>(mrg32k3a_impl::mrg32k3a_params::m1));
-
-        res_uint = mrg32k3a_impl::generate_single(this->state_);
-
-        res = (RealType)(res_uint)*c + a;
-
-        return res;
-    }
-
-    std::uint32_t generate_single() {
-        return mrg32k3a_impl::generate_single(this->state_);
-    }
-
-    void skip_ahead(std::uint64_t num_to_skip) {
-        detail::mrg32k3a_impl::skip_ahead(this->state_, 1, &num_to_skip);
-    }
-
-    void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) {
-        detail::mrg32k3a_impl::skip_ahead(this->state_, num_to_skip.size(), num_to_skip.begin());
-    }
-
-    engine_state<oneapi::mkl::rng::device::mrg32k3a<VecSize>> state_;
-};
-
-} // namespace detail
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_MRG32K3A_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/mrg32k3a_skip_ahead_matrix.hpp b/include/oneapi/mkl/rng/device/detail/mrg32k3a_skip_ahead_matrix.hpp
deleted file mode 100644
index d1ea8c263..000000000
--- a/include/oneapi/mkl/rng/device/detail/mrg32k3a_skip_ahead_matrix.hpp
+++ /dev/null
@@ -1,3668 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_MRG32K3A_SKIP_AHEAD_MATRIX_HPP_
-#define _MKL_RNG_DEVICE_MRG32K3A_SKIP_AHEAD_MATRIX_HPP_
-
-namespace oneapi::mkl::rng::device::detail {
-namespace mrg32k3a_impl {
-
-constexpr std::size_t quantity_of_3x3_matrices = 455; // number of 3x3 matrices for skipping
-
-// There are 2 subsequences of numbers, each containing 455 3x3 matrices
-static const std::uint32_t skip_ahead_matrix[2][quantity_of_3x3_matrices][3][3] = {
-    // Matrices for the first part of SkipAhead procedure
-    // Matrix for nskip = 1 * 8 ^ 0:
-    { { { 0, 1, 0 }, { 0, 0, 1 }, { 4294156359, 1403580, 0 } },
-      // Matrix for nskip = 2 * 8 ^ 0:
-      { { 0, 0, 1 }, { 4294156359, 1403580, 0 }, { 0, 4294156359, 1403580 } },
-      // Matrix for nskip = 3 * 8 ^ 0:
-      { { 4294156359, 1403580, 0 },
-        { 0, 4294156359, 1403580 },
-        { 244671815, 2941890554, 4294156359 } },
-      // Matrix for nskip = 4 * 8 ^ 0:
-      { { 0, 4294156359, 1403580 },
-        { 244671815, 2941890554, 4294156359 },
-        { 149925673, 489343630, 2941890554 } },
-      // Matrix for nskip = 5 * 8 ^ 0:
-      { { 244671815, 2941890554, 4294156359 },
-        { 149925673, 489343630, 2941890554 },
-        { 3782722441, 1831234280, 489343630 } },
-      // Matrix for nskip = 6 * 8 ^ 0:
-      { { 149925673, 489343630, 2941890554 },
-        { 3782722441, 1831234280, 489343630 },
-        { 1527363550, 2758233149, 1831234280 } },
-      // Matrix for nskip = 7 * 8 ^ 0:
-      { { 3782722441, 1831234280, 489343630 },
-        { 1527363550, 2758233149, 1831234280 },
-        { 4072640363, 939574583, 2758233149 } },
-      // Matrix for nskip = 1 * 8 ^ 1:
-      { { 1527363550, 2758233149, 1831234280 },
-        { 4072640363, 939574583, 2758233149 },
-        { 2064391165, 3228066636, 939574583 } },
-      // Matrix for nskip = 2 * 8 ^ 1:
-      { { 736416029, 2961816100, 342112271 },
-        { 387300998, 1062452522, 2961816100 },
-        { 2955879160, 340793741, 1062452522 } },
-      // Matrix for nskip = 3 * 8 ^ 1:
-      { { 3830731060, 3351104823, 355092062 },
-        { 4271633387, 3081436279, 3351104823 },
-        { 2754512837, 673113417, 3081436279 } },
-      // Matrix for nskip = 4 * 8 ^ 1:
-      { { 1243502014, 2218748291, 1709215645 },
-        { 2019641772, 3847560959, 2218748291 },
-        { 3866010231, 2305448679, 3847560959 } },
-      // Matrix for nskip = 5 * 8 ^ 1:
-      { { 753665800, 3956261650, 1880714717 },
-        { 3889504807, 299844503, 3956261650 },
-        { 3555787878, 734199116, 299844503 } },
-      // Matrix for nskip = 6 * 8 ^ 1:
-      { { 1402917279, 671479916, 279477115 },
-        { 1066184965, 1957999095, 671479916 },
-        { 3803905489, 2154014226, 1957999095 } },
-      // Matrix for nskip = 7 * 8 ^ 1:
-      { { 1519817277, 3513041072, 37163717 },
-        { 3823126416, 1394452522, 3513041072 },
-        { 762181894, 1046733826, 1394452522 } },
-      // Matrix for nskip = 1 * 8 ^ 2:
-      { { 3241775219, 3453352062, 3721871040 },
-        { 4062454730, 3015754, 3453352062 },
-        { 919711945, 613405362, 3015754 } },
-      // Matrix for nskip = 2 * 8 ^ 2:
-      { { 1955221006, 1414472808, 1746037714 },
-        { 3653507277, 1644962013, 1414472808 },
-        { 3501544776, 2336229602, 1644962013 } },
-      // Matrix for nskip = 3 * 8 ^ 2:
-      { { 2883496440, 2415235089, 3754924652 },
-        { 2873360987, 3093961248, 2415235089 },
-        { 2551531030, 3967481377, 3093961248 } },
-      // Matrix for nskip = 4 * 8 ^ 2:
-      { { 1170096663, 49135452, 3441537107 },
-        { 1857945175, 1649398389, 49135452 },
-        { 333002869, 3109147376, 1649398389 } },
-      // Matrix for nskip = 5 * 8 ^ 2:
-      { { 3782304170, 536558728, 1207462427 },
-        { 2479820532, 1357898065, 536558728 },
-        { 3967038637, 280429670, 1357898065 } },
-      // Matrix for nskip = 6 * 8 ^ 2:
-      { { 1850220783, 2237648487, 4288110946 },
-        { 778070070, 3729077970, 2237648487 },
-        { 1095506872, 3284249345, 3729077970 } },
-      // Matrix for nskip = 7 * 8 ^ 2:
-      { { 3963964167, 1824244353, 1280698295 },
-        { 1736039316, 2491872331, 1824244353 },
-        { 1645622379, 4226305484, 2491872331 } },
-      // Matrix for nskip = 1 * 8 ^ 3:
-      { { 2299034194, 2297111910, 862649200 },
-        { 1399961132, 996706937, 2297111910 },
-        { 3439056503, 1481993076, 996706937 } },
-      // Matrix for nskip = 2 * 8 ^ 3:
-      { { 4146310528, 458782589, 1007330283 },
-        { 4241015765, 3979619964, 458782589 },
-        { 553886495, 2186897562, 3979619964 } },
-      // Matrix for nskip = 3 * 8 ^ 3:
-      { { 1146235803, 3119708691, 3977084597 },
-        { 1030264372, 1706820424, 3119708691 },
-        { 2210423860, 4154877869, 1706820424 } },
-      // Matrix for nskip = 4 * 8 ^ 3:
-      { { 3630027893, 2130448350, 292773857 },
-        { 1392525159, 1299285967, 2130448350 },
-        { 2589171163, 1217405758, 1299285967 } },
-      // Matrix for nskip = 5 * 8 ^ 3:
-      { { 3841954865, 948545149, 4067146304 },
-        { 4218117763, 3741945962, 948545149 },
-        { 1745368878, 730788749, 3741945962 } },
-      // Matrix for nskip = 6 * 8 ^ 3:
-      { { 2341737887, 1393299668, 3386176735 },
-        { 1655556841, 359678770, 1393299668 },
-        { 2175543957, 3314680006, 359678770 } },
-      // Matrix for nskip = 7 * 8 ^ 3:
-      { { 3121396438, 3210334684, 1062918236 },
-        { 325732785, 2721675172, 3210334684 },
-        { 3182328265, 241385543, 2721675172 } },
-      // Matrix for nskip = 1 * 8 ^ 4:
-      { { 892409263, 1999175811, 2979225418 },
-        { 1996163538, 2148702503, 1999175811 },
-        { 3922720782, 103819730, 2148702503 } },
-      // Matrix for nskip = 2 * 8 ^ 4:
-      { { 1586003016, 2114210471, 3240775579 },
-        { 2777288607, 1400478398, 2114210471 },
-        { 3018215420, 535326008, 1400478398 } },
-      // Matrix for nskip = 3 * 8 ^ 4:
-      { { 377225862, 1098715579, 1378248654 },
-        { 2452527982, 3677219860, 1098715579 },
-        { 3805011027, 3962510930, 3677219860 } },
-      // Matrix for nskip = 4 * 8 ^ 4:
-      { { 2188531273, 1783231160, 3576659343 },
-        { 1908318389, 379210133, 1783231160 },
-        { 554369329, 250053591, 379210133 } },
-      // Matrix for nskip = 5 * 8 ^ 4:
-      { { 2249717607, 2266741858, 2040546316 },
-        { 3093925525, 3510732546, 2266741858 },
-        { 2244264588, 3926709784, 3510732546 } },
-      // Matrix for nskip = 6 * 8 ^ 4:
-      { { 2349663769, 2339070143, 3651849809 },
-        { 1360064932, 443349145, 2339070143 },
-        { 2864061919, 590074072, 443349145 } },
-      // Matrix for nskip = 7 * 8 ^ 4:
-      { { 299115015, 4017647307, 737449908 },
-        { 1014398637, 352887003, 4017647307 },
-        { 2268496651, 499779786, 352887003 } },
-      // Matrix for nskip = 1 * 8 ^ 5:
-      { { 4022841636, 3951951872, 2143424240 },
-        { 1046219306, 1591992468, 3951951872 },
-        { 1510277444, 381333958, 1591992468 } },
-      // Matrix for nskip = 2 * 8 ^ 5:
-      { { 2256493727, 3715182130, 642697923 },
-        { 3615342722, 3975008370, 3715182130 },
-        { 2405650329, 754337639, 3975008370 } },
-      // Matrix for nskip = 3 * 8 ^ 5:
-      { { 3246129870, 3068844475, 3738266208 },
-        { 668859604, 3798586786, 3068844475 },
-        { 3275530821, 2740099935, 3798586786 } },
-      // Matrix for nskip = 4 * 8 ^ 5:
-      { { 1286664224, 627406673, 963516608 },
-        { 1541344588, 460768826, 627406673 },
-        { 1089892553, 2717717970, 460768826 } },
-      // Matrix for nskip = 5 * 8 ^ 5:
-      { { 2092934033, 2692683366, 2826944083 },
-        { 1909409603, 3350132528, 2692683366 },
-        { 3481095738, 3485350450, 3350132528 } },
-      // Matrix for nskip = 6 * 8 ^ 5:
-      { { 1918719231, 2970279915, 803149880 },
-        { 2389311995, 4195833089, 2970279915 },
-        { 166509779, 2105299796, 4195833089 } },
-      // Matrix for nskip = 7 * 8 ^ 5:
-      { { 3252663202, 2481165293, 694007918 },
-        { 1921953957, 350878042, 2481165293 },
-        { 1954500233, 1970948165, 350878042 } },
-      // Matrix for nskip = 1 * 8 ^ 6:
-      { { 2956342842, 3471097641, 2353092905 },
-        { 2996150472, 420480221, 3471097641 },
-        { 2221681883, 372736411, 420480221 } },
-      // Matrix for nskip = 2 * 8 ^ 6:
-      { { 420492906, 153526651, 3499730988 },
-        { 2662640502, 3278195133, 153526651 },
-        { 4086436419, 2510762118, 3278195133 } },
-      // Matrix for nskip = 3 * 8 ^ 6:
-      { { 600928360, 715341436, 3127996992 },
-        { 4276221887, 1953220754, 715341436 },
-        { 2074032202, 163100603, 1953220754 } },
-      // Matrix for nskip = 4 * 8 ^ 6:
-      { { 3310184147, 2228376089, 823220763 },
-        { 3992771814, 1693168425, 2228376089 },
-        { 2295790366, 1401872772, 1693168425 } },
-      // Matrix for nskip = 5 * 8 ^ 6:
-      { { 1282168185, 2751813658, 602760489 },
-        { 2254465781, 1232521545, 2751813658 },
-        { 1025381169, 1981662800, 1232521545 } },
-      // Matrix for nskip = 6 * 8 ^ 6:
-      { { 460755919, 4283511820, 3208183750 },
-        { 3248110895, 730327118, 4283511820 },
-        { 1386862282, 926261676, 730327118 } },
-      // Matrix for nskip = 7 * 8 ^ 6:
-      { { 2392208153, 3129124418, 684400653 },
-        { 4025364146, 1122067473, 3129124418 },
-        { 773418203, 2967386517, 1122067473 } },
-      // Matrix for nskip = 1 * 8 ^ 7:
-      { { 2529428830, 1497104068, 4253248635 },
-        { 3746310018, 630867741, 1497104068 },
-        { 627043435, 721725795, 630867741 } },
-      // Matrix for nskip = 2 * 8 ^ 7:
-      { { 2571072593, 3039669025, 1591031831 },
-        { 526054481, 661344445, 3039669025 },
-        { 4246010312, 735391270, 661344445 } },
-      // Matrix for nskip = 3 * 8 ^ 7:
-      { { 3781620139, 2917363935, 2936154555 },
-        { 2668364492, 3297773364, 2917363935 },
-        { 2501878263, 3438979384, 3297773364 } },
-      // Matrix for nskip = 4 * 8 ^ 7:
-      { { 1847312821, 4042890210, 4241772463 },
-        { 606605705, 2644799309, 4042890210 },
-        { 2658402822, 1342278931, 2644799309 } },
-      // Matrix for nskip = 5 * 8 ^ 7:
-      { { 3502592220, 3704088248, 4011400538 },
-        { 2932838910, 1175764916, 3704088248 },
-        { 2865336247, 2471593729, 1175764916 } },
-      // Matrix for nskip = 6 * 8 ^ 7:
-      { { 3250474907, 3775615386, 3733878711 },
-        { 1502779384, 287728234, 3775615386 },
-        { 162441370, 246229618, 287728234 } },
-      // Matrix for nskip = 7 * 8 ^ 7:
-      { { 749636765, 3227070913, 3120894575 },
-        { 2853687796, 1910085226, 3227070913 },
-        { 2453891386, 4230641571, 1910085226 } },
-      // Matrix for nskip = 1 * 8 ^ 8:
-      { { 2409846784, 1096138313, 1416249993 },
-        { 1501878241, 138013862, 1096138313 },
-        { 1617749306, 1975136163, 138013862 } },
-      // Matrix for nskip = 2 * 8 ^ 8:
-      { { 599453422, 73950522, 2965395603 },
-        { 55354701, 3855242202, 73950522 },
-        { 3981734504, 3354399019, 3855242202 } },
-      // Matrix for nskip = 3 * 8 ^ 8:
-      { { 3515748818, 1941532786, 3590950415 },
-        { 3557298699, 2872969148, 1941532786 },
-        { 3200219335, 3657910297, 2872969148 } },
-      // Matrix for nskip = 4 * 8 ^ 8:
-      { { 4271076381, 813410089, 3461955319 },
-        { 1044920137, 3029005516, 813410089 },
-        { 3501837362, 3321539504, 3029005516 } },
-      // Matrix for nskip = 5 * 8 ^ 8:
-      { { 1749168476, 312277958, 960113158 },
-        { 3444686334, 4207289909, 312277958 },
-        { 2940543965, 559813450, 4207289909 } },
-      // Matrix for nskip = 6 * 8 ^ 8:
-      { { 316005085, 3130396563, 3837877063 },
-        { 1625744025, 2903706877, 3130396563 },
-        { 201947523, 3713704391, 2903706877 } },
-      // Matrix for nskip = 7 * 8 ^ 8:
-      { { 2725645318, 3806079268, 2159958180 },
-        { 1110389513, 1295130289, 3806079268 },
-        { 2596032611, 1951986222, 1295130289 } },
-      // Matrix for nskip = 1 * 8 ^ 9:
-      { { 3058183515, 941408572, 1783998098 },
-        { 1546486080, 4116985007, 941408572 },
-        { 2247500745, 1460625377, 4116985007 } },
-      // Matrix for nskip = 2 * 8 ^ 9:
-      { { 4216782514, 3352801941, 2315095646 },
-        { 639029973, 94451952, 3352801941 },
-        { 1242898773, 3964593332, 94451952 } },
-      // Matrix for nskip = 3 * 8 ^ 9:
-      { { 3704530610, 1763750345, 4252200234 },
-        { 3310872720, 3465004782, 1763750345 },
-        { 1602573750, 530766064, 3465004782 } },
-      // Matrix for nskip = 4 * 8 ^ 9:
-      { { 2264905138, 1926285644, 1108147171 },
-        { 2390706911, 385258225, 1926285644 },
-        { 3569882325, 3728744670, 385258225 } },
-      // Matrix for nskip = 5 * 8 ^ 9:
-      { { 1104250853, 2649508927, 1011964068 },
-        { 1303004323, 2245340871, 2649508927 },
-        { 2225918280, 1790484033, 2245340871 } },
-      // Matrix for nskip = 6 * 8 ^ 9:
-      { { 704130800, 2663175885, 3195438389 },
-        { 2578332381, 377826974, 2663175885 },
-        { 3055477316, 116744102, 377826974 } },
-      // Matrix for nskip = 7 * 8 ^ 9:
-      { { 1534677729, 1538922981, 1955454860 },
-        { 3358514099, 279668397, 1538922981 },
-        { 1333529549, 1503627474, 279668397 } },
-      // Matrix for nskip = 1 * 8 ^ 10:
-      { { 270679073, 1065683096, 2992662885 },
-        { 4196917281, 2886425156, 1065683096 },
-        { 749134119, 1849148167, 2886425156 } },
-      // Matrix for nskip = 2 * 8 ^ 10:
-      { { 35689930, 1378151623, 951629713 },
-        { 673810920, 948843427, 1378151623 },
-        { 3808868984, 927013635, 948843427 } },
-      // Matrix for nskip = 3 * 8 ^ 10:
-      { { 1708907294, 3971013929, 120796985 },
-        { 341462694, 1820387182, 3971013929 },
-        { 658508974, 1448556483, 1820387182 } },
-      // Matrix for nskip = 4 * 8 ^ 10:
-      { { 1891490872, 1130489594, 3734864133 },
-        { 1457450350, 3362920032, 1130489594 },
-        { 638998846, 1401175590, 3362920032 } },
-      // Matrix for nskip = 5 * 8 ^ 10:
-      { { 2493538871, 1119726169, 3415942617 },
-        { 3041636598, 2163282065, 1119726169 },
-        { 3770868549, 1056545317, 2163282065 } },
-      // Matrix for nskip = 6 * 8 ^ 10:
-      { { 3254893662, 3244521128, 1199630310 },
-        { 4235017122, 2943451417, 3244521128 },
-        { 2697569444, 4187443436, 2943451417 } },
-      // Matrix for nskip = 7 * 8 ^ 10:
-      { { 4046281084, 3800263816, 3215056790 },
-        { 1654449614, 386290994, 3800263816 },
-        { 1471940141, 481393463, 386290994 } },
-      // Matrix for nskip = 1 * 8 ^ 11:
-      { { 2254459023, 2384691454, 1730098031 },
-        { 2844861718, 1807491073, 2384691454 },
-        { 351423668, 1570264155, 1807491073 } },
-      // Matrix for nskip = 2 * 8 ^ 11:
-      { { 3047429268, 4245359555, 2449575498 },
-        { 1797081212, 1237196477, 4245359555 },
-        { 143400628, 3663731096, 1237196477 } },
-      // Matrix for nskip = 3 * 8 ^ 11:
-      { { 2147359263, 1349445168, 2733446300 },
-        { 1305907164, 210670816, 1349445168 },
-        { 2509073771, 839244126, 210670816 } },
-      // Matrix for nskip = 4 * 8 ^ 11:
-      { { 3313321106, 4263819658, 1047529624 },
-        { 3719941673, 3155049403, 4263819658 },
-        { 1981313839, 4281524426, 3155049403 } },
-      // Matrix for nskip = 5 * 8 ^ 11:
-      { { 1429567203, 899246895, 3248764453 },
-        { 2783815531, 108747348, 899246895 },
-        { 256526168, 1467875854, 108747348 } },
-      // Matrix for nskip = 6 * 8 ^ 11:
-      { { 2740000743, 1423127512, 1283194774 },
-        { 700110581, 582760735, 1423127512 },
-        { 571933335, 785351190, 582760735 } },
-      // Matrix for nskip = 7 * 8 ^ 11:
-      { { 448747464, 852164586, 412380392 },
-        { 497540878, 2374838356, 852164586 },
-        { 1830234951, 2052902650, 2374838356 } },
-      // Matrix for nskip = 1 * 8 ^ 12:
-      { { 2005252417, 3263186729, 1535805957 },
-        { 2951515865, 1729281525, 3263186729 },
-        { 1141249417, 2268963059, 1729281525 } },
-      // Matrix for nskip = 2 * 8 ^ 12:
-      { { 2367065164, 83908466, 4294308508 },
-        { 1352516724, 1416676049, 83908466 },
-        { 1040867745, 1304732377, 1416676049 } },
-      // Matrix for nskip = 3 * 8 ^ 12:
-      { { 2985917792, 4096493219, 1529477403 },
-        { 1201774212, 2070059496, 4096493219 },
-        { 1675108536, 3110356679, 2070059496 } },
-      // Matrix for nskip = 4 * 8 ^ 12:
-      { { 3214147257, 1434230503, 2944821434 },
-        { 2753040912, 4041536918, 1434230503 },
-        { 1317260239, 338830578, 4041536918 } },
-      // Matrix for nskip = 5 * 8 ^ 12:
-      { { 3409339184, 2193226133, 1795377731 },
-        { 1348686132, 3710830263, 2193226133 },
-        { 2242696089, 3564440066, 3710830263 } },
-      // Matrix for nskip = 6 * 8 ^ 12:
-      { { 3189933295, 1475654090, 2785534584 },
-        { 4286962883, 2397146654, 1475654090 },
-        { 403072156, 2221537290, 2397146654 } },
-      // Matrix for nskip = 7 * 8 ^ 12:
-      { { 741855424, 1898764790, 1822660758 },
-        { 1315270526, 1027835471, 1898764790 },
-        { 3142787072, 3867031443, 1027835471 } },
-      // Matrix for nskip = 1 * 8 ^ 13:
-      { { 300628476, 2054743463, 1499597869 },
-        { 1762244284, 1422043015, 2054743463 },
-        { 3581125669, 1207561803, 1422043015 } },
-      // Matrix for nskip = 2 * 8 ^ 13:
-      { { 4171745404, 4064983592, 1934508265 },
-        { 3049723261, 1744636487, 4064983592 },
-        { 947753516, 3952135907, 1744636487 } },
-      // Matrix for nskip = 3 * 8 ^ 13:
-      { { 392234088, 1933162500, 3586081024 },
-        { 4234172394, 2757237142, 1933162500 },
-        { 3177450083, 2703743057, 2757237142 } },
-      // Matrix for nskip = 4 * 8 ^ 13:
-      { { 1625369148, 3577024659, 2778677259 },
-        { 1729967818, 1049600974, 3577024659 },
-        { 2089137344, 1569794605, 1049600974 } },
-      // Matrix for nskip = 5 * 8 ^ 13:
-      { { 24259337, 1099944220, 56936276 },
-        { 2473082148, 2484906695, 1099944220 },
-        { 4143714563, 1902230902, 2484906695 } },
-      // Matrix for nskip = 6 * 8 ^ 13:
-      { { 53562000, 2164320300, 319591773 },
-        { 480516705, 2016775973, 2164320300 },
-        { 3670445841, 1306292301, 2016775973 } },
-      // Matrix for nskip = 7 * 8 ^ 13:
-      { { 1588148001, 2552094779, 2777917575 },
-        { 3446764329, 4181915770, 2552094779 },
-        { 2748502268, 1366641757, 4181915770 } },
-      // Matrix for nskip = 1 * 8 ^ 14:
-      { { 1373068765, 3958611830, 569117280 },
-        { 410042396, 3551255470, 3958611830 },
-        { 869476379, 1680625376, 3551255470 } },
-      // Matrix for nskip = 2 * 8 ^ 14:
-      { { 2108618602, 2543645250, 913717833 },
-        { 2111984988, 1012482542, 2543645250 },
-        { 2545745615, 3141042890, 1012482542 } },
-      // Matrix for nskip = 3 * 8 ^ 14:
-      { { 1200101967, 3500039413, 1380082835 },
-        { 1489246316, 1939611745, 3500039413 },
-        { 1721948148, 3454434256, 1939611745 } },
-      // Matrix for nskip = 4 * 8 ^ 14:
-      { { 1157293598, 584852249, 2272893205 },
-        { 1631801979, 3013855247, 584852249 },
-        { 3977310441, 82049263, 3013855247 } },
-      // Matrix for nskip = 5 * 8 ^ 14:
-      { { 3527704969, 2070084361, 2336461093 },
-        { 675176428, 59273233, 2070084361 },
-        { 215288790, 1628101656, 59273233 } },
-      // Matrix for nskip = 6 * 8 ^ 14:
-      { { 3037143591, 2883460010, 26163475 },
-        { 1380682893, 3598790241, 2883460010 },
-        { 1573828863, 3515570245, 3598790241 } },
-      // Matrix for nskip = 7 * 8 ^ 14:
-      { { 2503812675, 2054481550, 2095990336 },
-        { 4200011507, 3373769861, 2054481550 },
-        { 1172973983, 1101682881, 3373769861 } },
-      // Matrix for nskip = 1 * 8 ^ 15:
-      { { 3580234334, 3137526662, 2403875621 },
-        { 3580869206, 3670086228, 3137526662 },
-        { 656744553, 1764904195, 3670086228 } },
-      // Matrix for nskip = 2 * 8 ^ 15:
-      { { 2792496861, 3634185196, 3887031679 },
-        { 3601823850, 3464838365, 3634185196 },
-        { 3136165138, 2842987937, 3464838365 } },
-      // Matrix for nskip = 3 * 8 ^ 15:
-      { { 860869470, 981305692, 955067142 },
-        { 1287512071, 3232580086, 981305692 },
-        { 1932329582, 2220460662, 3232580086 } },
-      // Matrix for nskip = 4 * 8 ^ 15:
-      { { 1362557480, 3230022138, 4278720212 },
-        { 3427386258, 3848976950, 3230022138 },
-        { 2109817045, 2441486578, 3848976950 } },
-      // Matrix for nskip = 5 * 8 ^ 15:
-      { { 2708545360, 267497185, 2662390285 },
-        { 13298153, 1401050440, 267497185 },
-        { 2610290298, 574376174, 1401050440 } },
-      // Matrix for nskip = 6 * 8 ^ 15:
-      { { 4064509494, 1054794505, 2873059524 },
-        { 2518650890, 2583418592, 1054794505 },
-        { 2277374582, 2950188629, 2583418592 } },
-      // Matrix for nskip = 7 * 8 ^ 15:
-      { { 43539574, 3585947086, 1551803386 },
-        { 4188500293, 3646000753, 3585947086 },
-        { 1152314996, 3244390048, 3646000753 } },
-      // Matrix for nskip = 1 * 8 ^ 16:
-      { { 1198519135, 2007945401, 3868481 },
-        { 3335076429, 2082683147, 2007945401 },
-        { 2341088247, 888193479, 2082683147 } },
-      // Matrix for nskip = 2 * 8 ^ 16:
-      { { 3473925387, 3193380570, 565138859 },
-        { 307060547, 782210925, 3193380570 },
-        { 167617770, 2180014252, 782210925 } },
-      // Matrix for nskip = 3 * 8 ^ 16:
-      { { 3946174395, 938410993, 2583257939 },
-        { 898527522, 1909350615, 938410993 },
-        { 1517357015, 2538479259, 1909350615 } },
-      // Matrix for nskip = 4 * 8 ^ 16:
-      { { 3811588895, 3303532086, 2766583698 },
-        { 908630605, 2665400165, 3303532086 },
-        { 2499994113, 3316180851, 2665400165 } },
-      // Matrix for nskip = 5 * 8 ^ 16:
-      { { 2828295511, 296464469, 3400652741 },
-        { 3697213244, 3884416364, 296464469 },
-        { 2902099262, 1705355356, 3884416364 } },
-      // Matrix for nskip = 6 * 8 ^ 16:
-      { { 3952581582, 91397022, 1472690314 },
-        { 2332659451, 3813545212, 91397022 },
-        { 2942299995, 3287843695, 3813545212 } },
-      // Matrix for nskip = 7 * 8 ^ 16:
-      { { 1334460780, 861234488, 2817452481 },
-        { 435895955, 3356827989, 861234488 },
-        { 1590379239, 2041861019, 3356827989 } },
-      // Matrix for nskip = 1 * 8 ^ 17:
-      { { 4288926968, 3033075037, 1505732852 },
-        { 1531633406, 645804125, 3033075037 },
-        { 2942690261, 2205365640, 645804125 } },
-      // Matrix for nskip = 2 * 8 ^ 17:
-      { { 3976196483, 3651411522, 1652430357 },
-        { 1690405883, 1294990760, 3651411522 },
-        { 209339647, 3088484327, 1294990760 } },
-      // Matrix for nskip = 3 * 8 ^ 17:
-      { { 3313281387, 404839765, 4119379625 },
-        { 1282760808, 1769786574, 404839765 },
-        { 2156822533, 2134509408, 1769786574 } },
-      // Matrix for nskip = 4 * 8 ^ 17:
-      { { 3171589548, 2291131070, 2093793287 },
-        { 2997812074, 4093879780, 2291131070 },
-        { 3255666800, 858124816, 4093879780 } },
-      // Matrix for nskip = 5 * 8 ^ 17:
-      { { 2671377286, 4060168649, 2412035287 },
-        { 2560486338, 828012431, 4060168649 },
-        { 431779937, 1288430895, 828012431 } },
-      // Matrix for nskip = 6 * 8 ^ 17:
-      { { 3419357098, 2547678446, 3186955890 },
-        { 3335475366, 2875872016, 2547678446 },
-        { 1190772134, 216187195, 2875872016 } },
-      // Matrix for nskip = 7 * 8 ^ 17:
-      { { 2462780486, 3788991986, 2965830319 },
-        { 4101189674, 1696959105, 3788991986 },
-        { 170171245, 376763544, 1696959105 } },
-      // Matrix for nskip = 1 * 8 ^ 18:
-      { { 4113016361, 2999667479, 3995043314 },
-        { 1333973326, 4007774239, 2999667479 },
-        { 3322921863, 4278103786, 4007774239 } },
-      // Matrix for nskip = 2 * 8 ^ 18:
-      { { 925786347, 2109676036, 1879981040 },
-        { 1701566570, 1489702270, 2109676036 },
-        { 2719807628, 158549605, 1489702270 } },
-      // Matrix for nskip = 3 * 8 ^ 18:
-      { { 988998360, 4224987734, 2705609303 },
-        { 3781735882, 3210618179, 4224987734 },
-        { 2000646801, 3763764745, 3210618179 } },
-      // Matrix for nskip = 4 * 8 ^ 18:
-      { { 2255405265, 3460246357, 218033453 },
-        { 2135115875, 359516994, 3460246357 },
-        { 3568862459, 3114762683, 359516994 } },
-      // Matrix for nskip = 5 * 8 ^ 18:
-      { { 3151385849, 2749420870, 1663192542 },
-        { 3858805987, 658557447, 2749420870 },
-        { 3895454596, 3780884000, 658557447 } },
-      // Matrix for nskip = 6 * 8 ^ 18:
-      { { 1720065491, 953484022, 1382647120 },
-        { 1315666944, 2456296663, 953484022 },
-        { 572064418, 2149791939, 2456296663 } },
-      // Matrix for nskip = 7 * 8 ^ 18:
-      { { 2767100879, 4015038188, 1215355080 },
-        { 3185998778, 1592475141, 4015038188 },
-        { 135551392, 4171059118, 1592475141 } },
-      // Matrix for nskip = 1 * 8 ^ 19:
-      { { 773148471, 4117539411, 3073622315 },
-        { 3807175775, 186466108, 4117539411 },
-        { 2842197411, 651334129, 186466108 } },
-      // Matrix for nskip = 2 * 8 ^ 19:
-      { { 615242951, 1475251263, 3586439101 },
-        { 1693917167, 3058812486, 1475251263 },
-        { 568701600, 1164226398, 3058812486 } },
-      // Matrix for nskip = 3 * 8 ^ 19:
-      { { 3729302216, 1041711449, 2647679194 },
-        { 3878048889, 135488725, 1041711449 },
-        { 508494460, 2178143073, 135488725 } },
-      // Matrix for nskip = 4 * 8 ^ 19:
-      { { 1632636204, 15370275, 2061555515 },
-        { 4187505695, 1741164221, 15370275 },
-        { 2882176274, 3978412194, 1741164221 } },
-      // Matrix for nskip = 5 * 8 ^ 19:
-      { { 4199667935, 4240821442, 3087593298 },
-        { 2968278570, 2185585470, 4240821442 },
-        { 2826850420, 371506848, 2185585470 } },
-      // Matrix for nskip = 6 * 8 ^ 19:
-      { { 4002434761, 1455254388, 1267013695 },
-        { 2324442395, 2192287989, 1455254388 },
-        { 3389390262, 2190852671, 2192287989 } },
-      // Matrix for nskip = 7 * 8 ^ 19:
-      { { 3722528722, 3193070982, 1527096340 },
-        { 3155996013, 2278658572, 3193070982 },
-        { 2051186788, 4289100465, 2278658572 } },
-      // Matrix for nskip = 1 * 8 ^ 20:
-      { { 3446066703, 344820524, 74213775 },
-        { 1008543583, 2579620192, 344820524 },
-        { 3753911358, 1538453821, 2579620192 } },
-      // Matrix for nskip = 2 * 8 ^ 20:
-      { { 3600859892, 1269921024, 4069458760 },
-        { 2050939727, 2222725697, 1269921024 },
-        { 3208347646, 690898125, 2222725697 } },
-      // Matrix for nskip = 3 * 8 ^ 20:
-      { { 2580978896, 2572090525, 3334144098 },
-        { 804558063, 250626667, 2572090525 },
-        { 843125518, 1038659713, 250626667 } },
-      // Matrix for nskip = 4 * 8 ^ 20:
-      { { 599407451, 2806239788, 1742216102 },
-        { 975123999, 764869161, 2806239788 },
-        { 2729710367, 1845257036, 764869161 } },
-      // Matrix for nskip = 5 * 8 ^ 20:
-      { { 1900612628, 1237821080, 3847187360 },
-        { 4059416755, 2650131939, 1237821080 },
-        { 31199658, 2064718263, 2650131939 } },
-      // Matrix for nskip = 6 * 8 ^ 20:
-      { { 1347324880, 3034196764, 3435152676 },
-        { 2459581108, 68307108, 3034196764 },
-        { 4060225449, 1313975073, 68307108 } },
-      // Matrix for nskip = 7 * 8 ^ 20:
-      { { 832405527, 4273872816, 2483412578 },
-        { 1083671641, 2619838177, 4273872816 },
-        { 3452165941, 3089879239, 2619838177 } },
-      // Matrix for nskip = 1 * 8 ^ 21:
-      { { 967330218, 3464884028, 3444447102 },
-        { 580449578, 1343714307, 3464884028 },
-        { 1775329096, 4027221761, 1343714307 } },
-      // Matrix for nskip = 2 * 8 ^ 21:
-      { { 3426136514, 4123590610, 2477690850 },
-        { 1284315665, 1604068527, 4123590610 },
-        { 1818147893, 320435440, 1604068527 } },
-      // Matrix for nskip = 3 * 8 ^ 21:
-      { { 2183845304, 1753369147, 3320030113 },
-        { 1615069375, 2429599106, 1753369147 },
-        { 4089942461, 816400070, 2429599106 } },
-      // Matrix for nskip = 4 * 8 ^ 21:
-      { { 2678132557, 89090276, 2719996384 },
-        { 607972119, 3383659282, 89090276 },
-        { 480221151, 2265789281, 3383659282 } },
-      // Matrix for nskip = 5 * 8 ^ 21:
-      { { 1549131095, 4063932361, 140002783 },
-        { 3213919212, 3321129811, 4063932361 },
-        { 2806676458, 1803235719, 3321129811 } },
-      // Matrix for nskip = 6 * 8 ^ 21:
-      { { 2289583273, 1236554533, 358687301 },
-        { 1498394381, 1159516887, 1236554533 },
-        { 359182081, 4214998734, 1159516887 } },
-      // Matrix for nskip = 7 * 8 ^ 21:
-      { { 1434974522, 4046133592, 349947526 },
-        { 383007031, 4052481195, 4046133592 },
-        { 1677657970, 799675597, 4052481195 } },
-      // Matrix for nskip = 1 * 8 ^ 22:
-      { { 1827237091, 2290099491, 614471834 },
-        { 3711385978, 2748163602, 2290099491 },
-        { 2067064347, 1071954219, 2748163602 } },
-      // Matrix for nskip = 2 * 8 ^ 22:
-      { { 3894793123, 921712152, 596236860 },
-        { 4038673596, 4279784147, 921712152 },
-        { 1999065039, 859801225, 4279784147 } },
-      // Matrix for nskip = 3 * 8 ^ 22:
-      { { 3518731582, 2398700699, 3703766159 },
-        { 1998914732, 1951351916, 2398700699 },
-        { 2852188423, 1461089983, 1951351916 } },
-      // Matrix for nskip = 4 * 8 ^ 22:
-      { { 7276915, 3205297712, 1204204130 },
-        { 2667672243, 2737282292, 3205297712 },
-        { 2282864144, 2305990443, 2737282292 } },
-      // Matrix for nskip = 5 * 8 ^ 22:
-      { { 2376625824, 3090473348, 776691260 },
-        { 4067754877, 2149314284, 3090473348 },
-        { 198230411, 2870222545, 2149314284 } },
-      // Matrix for nskip = 6 * 8 ^ 22:
-      { { 1638154181, 688311656, 278971912 },
-        { 2626529484, 1769978612, 688311656 },
-        { 2951434168, 1794042358, 1769978612 } },
-      // Matrix for nskip = 7 * 8 ^ 22:
-      { { 3742216352, 1164158193, 1223269258 },
-        { 3621125172, 3964660872, 1164158193 },
-        { 3373873746, 2614176571, 3964660872 } },
-      // Matrix for nskip = 1 * 8 ^ 23:
-      { { 935922304, 2428000499, 510672020 },
-        { 1541887892, 92472822, 2428000499 },
-        { 4146892220, 1307489118, 92472822 } },
-      // Matrix for nskip = 2 * 8 ^ 23:
-      { { 690398653, 3787391292, 1705516721 },
-        { 2953871718, 4173917861, 3787391292 },
-        { 817556203, 3090114656, 4173917861 } },
-      // Matrix for nskip = 3 * 8 ^ 23:
-      { { 2596837368, 523638114, 796925063 },
-        { 2436421546, 3808361324, 523638114 },
-        { 3645860436, 2767640965, 3808361324 } },
-      // Matrix for nskip = 4 * 8 ^ 23:
-      { { 476867729, 1917800003, 1740083735 },
-        { 3167988201, 1286715218, 1917800003 },
-        { 2579365599, 4173763431, 1286715218 } },
-      // Matrix for nskip = 5 * 8 ^ 23:
-      { { 875985265, 2902381003, 3585549348 },
-        { 1487116735, 20494290, 2902381003 },
-        { 3417450723, 672893019, 20494290 } },
-      // Matrix for nskip = 6 * 8 ^ 23:
-      { { 680890926, 3782598365, 3927087723 },
-        { 3291528625, 2096301120, 3782598365 },
-        { 3927430411, 2089751145, 2096301120 } },
-      // Matrix for nskip = 7 * 8 ^ 23:
-      { { 2506371881, 3282095953, 1709670308 },
-        { 2778786590, 3316228403, 3282095953 },
-        { 3936394935, 4103225131, 3316228403 } },
-      // Matrix for nskip = 1 * 8 ^ 24:
-      { { 4092801160, 3749431174, 542781592 },
-        { 1208313783, 217808460, 3749431174 },
-        { 2708923752, 348848516, 217808460 } },
-      // Matrix for nskip = 2 * 8 ^ 24:
-      { { 381829350, 1732869179, 3638540651 },
-        { 2509789412, 1114357536, 1732869179 },
-        { 2465372475, 350550480, 1114357536 } },
-      // Matrix for nskip = 3 * 8 ^ 24:
-      { { 4088394360, 3507668274, 103212933 },
-        { 1229010797, 2457049990, 3507668274 },
-        { 1543332620, 3557973226, 2457049990 } },
-      // Matrix for nskip = 4 * 8 ^ 24:
-      { { 3712059912, 1698887908, 3706277064 },
-        { 2152325130, 232741719, 1698887908 },
-        { 4114351745, 170237153, 232741719 } },
-      // Matrix for nskip = 5 * 8 ^ 24:
-      { { 2230538189, 2798697140, 2813869207 },
-        { 2098708615, 4247643355, 2798697140 },
-        { 1732506223, 3352831267, 4247643355 } },
-      // Matrix for nskip = 6 * 8 ^ 24:
-      { { 141104167, 950363290, 3526146168 },
-        { 1842485244, 366288723, 950363290 },
-        { 901263071, 3346018419, 366288723 } },
-      // Matrix for nskip = 7 * 8 ^ 24:
-      { { 1273880950, 1252923554, 845609283 },
-        { 3523638916, 1756558336, 1252923554 },
-        { 983823623, 3396822999, 1756558336 } },
-      // Matrix for nskip = 1 * 8 ^ 25:
-      { { 993804379, 905755330, 1717718779 },
-        { 1712994855, 2713148271, 905755330 },
-        { 2200585411, 111258429, 2713148271 } },
-      // Matrix for nskip = 2 * 8 ^ 25:
-      { { 82758667, 1871391091, 4127413238 },
-        { 3672831523, 69195019, 1871391091 },
-        { 3672091415, 3528743235, 69195019 } },
-      // Matrix for nskip = 3 * 8 ^ 25:
-      { { 1954591259, 636118602, 2621269238 },
-        { 462961075, 4030630272, 636118602 },
-        { 3305976356, 1757343588, 4030630272 } },
-      // Matrix for nskip = 4 * 8 ^ 25:
-      { { 3361372532, 2329303404, 99651939 },
-        { 2008671965, 2931758910, 2329303404 },
-        { 1113529483, 2374097189, 2931758910 } },
-      // Matrix for nskip = 5 * 8 ^ 25:
-      { { 1475330900, 1973232757, 3087886870 },
-        { 1184427939, 3491162930, 1973232757 },
-        { 4229179055, 3166281484, 3491162930 } },
-      // Matrix for nskip = 6 * 8 ^ 25:
-      { { 2138712950, 3210181465, 230171794 },
-        { 1011789944, 3536018417, 3210181465 },
-        { 2847216174, 620673032, 3536018417 } },
-      // Matrix for nskip = 7 * 8 ^ 25:
-      { { 1691375920, 1708800738, 1210582211 },
-        { 2919192023, 1561934882, 1708800738 },
-        { 3388931282, 2988640653, 1561934882 } },
-      // Matrix for nskip = 1 * 8 ^ 26:
-      { { 1831590873, 1588259595, 1314332382 },
-        { 2385989343, 2508077280, 1588259595 },
-        { 1787615788, 661437137, 2508077280 } },
-      // Matrix for nskip = 2 * 8 ^ 26:
-      { { 2326052247, 4183591379, 4049009082 },
-        { 2604529491, 1453913233, 4183591379 },
-        { 2311925423, 1805360390, 1453913233 } },
-      // Matrix for nskip = 3 * 8 ^ 26:
-      { { 664423898, 2590401961, 4225456867 },
-        { 3913458720, 1982184590, 2590401961 },
-        { 2950459869, 334885555, 1982184590 } },
-      // Matrix for nskip = 4 * 8 ^ 26:
-      { { 3956367490, 604461629, 1257432102 },
-        { 794711716, 1155867175, 604461629 },
-        { 1777070788, 429445904, 1155867175 } },
-      // Matrix for nskip = 5 * 8 ^ 26:
-      { { 2357556007, 3027793563, 3037152168 },
-        { 328118796, 419690250, 3027793563 },
-        { 2699357594, 1143766272, 419690250 } },
-      // Matrix for nskip = 6 * 8 ^ 26:
-      { { 3183717084, 2634631308, 2109777894 },
-        { 1745049657, 2872637888, 2634631308 },
-        { 3660634616, 2434030341, 2872637888 } },
-      // Matrix for nskip = 7 * 8 ^ 26:
-      { { 961674331, 524745427, 3832393053 },
-        { 2375268260, 2883640227, 524745427 },
-        { 3564327755, 2130782725, 2883640227 } },
-      // Matrix for nskip = 1 * 8 ^ 27:
-      { { 1686241617, 1257046062, 1427609439 },
-        { 490376081, 387798431, 1257046062 },
-        { 235551485, 1312672615, 387798431 } },
-      // Matrix for nskip = 2 * 8 ^ 27:
-      { { 2362447880, 3445363024, 3160262066 },
-        { 2426867845, 4194339866, 3445363024 },
-        { 1046144413, 4177893681, 4194339866 } },
-      // Matrix for nskip = 3 * 8 ^ 27:
-      { { 2399569099, 1723951785, 2356709199 },
-        { 332901774, 3265509251, 1723951785 },
-        { 3616767886, 1726850927, 3265509251 } },
-      // Matrix for nskip = 4 * 8 ^ 27:
-      { { 4251175413, 3559576374, 3107663662 },
-        { 697539134, 1909472435, 3559576374 },
-        { 280754246, 375835695, 1909472435 } },
-      // Matrix for nskip = 5 * 8 ^ 27:
-      { { 1441163739, 911930333, 4028966669 },
-        { 3689446034, 1473406035, 911930333 },
-        { 3884376669, 1954838782, 1473406035 } },
-      // Matrix for nskip = 6 * 8 ^ 27:
-      { { 751906018, 4203984455, 2167450892 },
-        { 3937403282, 1862670973, 4203984455 },
-        { 402523958, 496211406, 1862670973 } },
-      // Matrix for nskip = 7 * 8 ^ 27:
-      { { 726664456, 2233062609, 98234458 },
-        { 149028817, 3613797222, 2233062609 },
-        { 3848675801, 4164228265, 3613797222 } },
-      // Matrix for nskip = 1 * 8 ^ 28:
-      { { 1099512970, 712404985, 1571467521 },
-        { 546519870, 1135109300, 712404985 },
-        { 3325312332, 2352874613, 1135109300 } },
-      // Matrix for nskip = 2 * 8 ^ 28:
-      { { 1945425936, 1653045514, 381988982 },
-        { 3733376326, 414410025, 1653045514 },
-        { 1181583679, 1185848176, 414410025 } },
-      // Matrix for nskip = 3 * 8 ^ 28:
-      { { 80175856, 1301935019, 1963289366 },
-        { 3961455404, 65355284, 1301935019 },
-        { 3052316027, 2858851708, 65355284 } },
-      // Matrix for nskip = 4 * 8 ^ 28:
-      { { 2526336124, 3019211015, 4215964965 },
-        { 2683163472, 4188191530, 3019211015 },
-        { 2964651598, 293801056, 4188191530 } },
-      // Matrix for nskip = 5 * 8 ^ 28:
-      { { 1749670132, 1387140872, 762351827 },
-        { 2971687592, 1196758134, 1387140872 },
-        { 237185264, 1741700121, 1196758134 } },
-      // Matrix for nskip = 6 * 8 ^ 28:
-      { { 4238062407, 481737140, 1487069976 },
-        { 878719633, 759707097, 481737140 },
-        { 749051338, 825174423, 759707097 } },
-      // Matrix for nskip = 7 * 8 ^ 28:
-      { { 1955913150, 1130524081, 2151646894 },
-        { 499306218, 101202533, 1130524081 },
-        { 2744191919, 1603656961, 101202533 } },
-      // Matrix for nskip = 1 * 8 ^ 29:
-      { { 1444052678, 2253324417, 39719589 },
-        { 1880267534, 2391992038, 2253324417 },
-        { 987740265, 3691889508, 2391992038 } },
-      // Matrix for nskip = 2 * 8 ^ 29:
-      { { 166599066, 2335494420, 1232261118 },
-        { 2227597731, 2570600780, 2335494420 },
-        { 2700034538, 3460843234, 2570600780 } },
-      // Matrix for nskip = 3 * 8 ^ 29:
-      { { 391577970, 1926759295, 2700541692 },
-        { 1952364431, 2281246481, 1926759295 },
-        { 1819825140, 2377574285, 2281246481 } },
-      // Matrix for nskip = 4 * 8 ^ 29:
-      { { 2511338360, 1188954576, 1251401239 },
-        { 2511664974, 292276982, 1188954576 },
-        { 697844082, 3093661552, 292276982 } },
-      // Matrix for nskip = 5 * 8 ^ 29:
-      { { 3984792772, 688024800, 2775323178 },
-        { 2263182715, 2971941970, 688024800 },
-        { 3585402638, 2532257287, 2971941970 } },
-      // Matrix for nskip = 6 * 8 ^ 29:
-      { { 1029044592, 4040666706, 3671213347 },
-        { 902332253, 3140636559, 4040666706 },
-        { 1429177194, 3213333408, 3140636559 } },
-      // Matrix for nskip = 7 * 8 ^ 29:
-      { { 3635935824, 599310841, 2541542820 },
-        { 1942681116, 83716008, 599310841 },
-        { 2957115888, 464001685, 83716008 } },
-      // Matrix for nskip = 1 * 8 ^ 30:
-      { { 3624650744, 51993077, 3540268009 },
-        { 3252828938, 3710319575, 51993077 },
-        { 2858628849, 3910069381, 3710319575 } },
-      // Matrix for nskip = 2 * 8 ^ 30:
-      { { 655966702, 754002362, 1646581402 },
-        { 1958331075, 475572423, 754002362 },
-        { 3248619000, 3228514800, 475572423 } },
-      // Matrix for nskip = 3 * 8 ^ 30:
-      { { 135820422, 1138672588, 1020827900 },
-        { 626151178, 4149545048, 1138672588 },
-        { 2180788629, 1314604300, 4149545048 } },
-      // Matrix for nskip = 4 * 8 ^ 30:
-      { { 2760311307, 4166372813, 741596417 },
-        { 2282679206, 3090782630, 4166372813 },
-        { 3242468721, 1628442374, 3090782630 } },
-      // Matrix for nskip = 5 * 8 ^ 30:
-      { { 88347075, 1420161828, 3113798953 },
-        { 217224032, 2004343529, 1420161828 },
-        { 4048389654, 3845790311, 2004343529 } },
-      // Matrix for nskip = 6 * 8 ^ 30:
-      { { 4237022985, 912148655, 165387559 },
-        { 252556101, 230998942, 912148655 },
-        { 2978268820, 7678432, 230998942 } },
-      // Matrix for nskip = 7 * 8 ^ 30:
-      { { 1702648282, 936444437, 2113813328 },
-        { 2870633999, 384435053, 936444437 },
-        { 2426580506, 1660785110, 384435053 } },
-      // Matrix for nskip = 1 * 8 ^ 31:
-      { { 4265279407, 3532111852, 1754687396 },
-        { 500404765, 2603727025, 3532111852 },
-        { 1428367254, 3149485478, 2603727025 } },
-      // Matrix for nskip = 2 * 8 ^ 31:
-      { { 2873769531, 2081104178, 596284397 },
-        { 4153800443, 1261269623, 2081104178 },
-        { 3967600061, 1830023157, 1261269623 } },
-      // Matrix for nskip = 3 * 8 ^ 31:
-      { { 1219416476, 2833805942, 877956083 },
-        { 4136201738, 926561185, 2833805942 },
-        { 790563916, 2950279312, 926561185 } },
-      // Matrix for nskip = 4 * 8 ^ 31:
-      { { 278611533, 2229285304, 3443204327 },
-        { 3110641420, 77498444, 2229285304 },
-        { 3904070810, 1070507239, 77498444 } },
-      // Matrix for nskip = 5 * 8 ^ 31:
-      { { 1569490059, 1438273012, 1676406913 },
-        { 2246148877, 835628171, 1438273012 },
-        { 1001911068, 165198836, 835628171 } },
-      // Matrix for nskip = 6 * 8 ^ 31:
-      { { 219341062, 236464123, 3922106376 },
-        { 244990374, 2122146632, 236464123 },
-        { 2065383788, 2977102789, 2122146632 } },
-      // Matrix for nskip = 7 * 8 ^ 31:
-      { { 2250560481, 1729521343, 424414765 },
-        { 2059608998, 3276353542, 1729521343 },
-        { 2230558099, 3933677451, 3276353542 } },
-      // Matrix for nskip = 1 * 8 ^ 32:
-      { { 544639534, 568528663, 2177189807 },
-        { 2475829068, 121482268, 568528663 },
-        { 876978915, 3116647617, 121482268 } },
-      // Matrix for nskip = 2 * 8 ^ 32:
-      { { 1547862823, 2404658587, 4191448009 },
-        { 2158188804, 2976916793, 2404658587 },
-        { 168571747, 1691884706, 2976916793 } },
-      // Matrix for nskip = 3 * 8 ^ 32:
-      { { 2707010111, 2933510859, 4240166566 },
-        { 1177241360, 62338927, 2933510859 },
-        { 2798158767, 906126073, 62338927 } },
-      // Matrix for nskip = 4 * 8 ^ 32:
-      { { 3208213311, 4212638780, 3235157352 },
-        { 671148556, 2951207765, 4212638780 },
-        { 2075145516, 2395485231, 2951207765 } },
-      // Matrix for nskip = 5 * 8 ^ 32:
-      { { 3757387996, 3349220842, 3722506196 },
-        { 224784515, 2952700002, 3349220842 },
-        { 1142378033, 2302905244, 2952700002 } },
-      // Matrix for nskip = 6 * 8 ^ 32:
-      { { 1941283113, 145407649, 659394903 },
-        { 347432419, 1571592397, 145407649 },
-        { 2204145504, 3369375773, 1571592397 } },
-      // Matrix for nskip = 7 * 8 ^ 32:
-      { { 1094854803, 386906095, 3767619826 },
-        { 1281474767, 179198568, 386906095 },
-        { 3021644798, 3594781674, 179198568 } },
-      // Matrix for nskip = 1 * 8 ^ 33:
-      { { 4080517315, 2133433101, 4043998180 },
-        { 2044221845, 867670560, 2133433101 },
-        { 834432416, 3613001199, 867670560 } },
-      // Matrix for nskip = 2 * 8 ^ 33:
-      { { 4102885735, 1319434267, 2678775073 },
-        { 740092580, 607380970, 1319434267 },
-        { 2198271844, 2610193258, 607380970 } },
-      // Matrix for nskip = 3 * 8 ^ 33:
-      { { 2725610481, 764583647, 1059048169 },
-        { 2571438051, 3510614410, 764583647 },
-        { 1753866259, 3525435230, 3510614410 } },
-      // Matrix for nskip = 4 * 8 ^ 33:
-      { { 1165218048, 1317690360, 1189150958 },
-        { 399240205, 2507168618, 1317690360 },
-        { 2988334517, 2687593413, 2507168618 } },
-      // Matrix for nskip = 5 * 8 ^ 33:
-      { { 1160307294, 3843003921, 120011318 },
-        { 1648569394, 2331840681, 3843003921 },
-        { 2666551617, 1826785014, 2331840681 } },
-      // Matrix for nskip = 6 * 8 ^ 33:
-      { { 2745374441, 3528536028, 2077936780 },
-        { 3475527779, 16047360, 3528536028 },
-        { 1346223401, 3691116188, 16047360 } },
-      // Matrix for nskip = 7 * 8 ^ 33:
-      { { 3985894561, 4225395152, 3428831071 },
-        { 3666024757, 3230532631, 4225395152 },
-        { 2407932196, 4261187489, 3230532631 } },
-      // Matrix for nskip = 1 * 8 ^ 34:
-      { { 1028861702, 4082006648, 338232527 },
-        { 1888486946, 1842080991, 4082006648 },
-        { 3903826366, 3109935091, 1842080991 } },
-      // Matrix for nskip = 2 * 8 ^ 34:
-      { { 614134826, 2261996505, 2888080641 },
-        { 710199359, 2773979788, 2261996505 },
-        { 1144301620, 2554371815, 2773979788 } },
-      // Matrix for nskip = 3 * 8 ^ 34:
-      { { 3872045348, 2988495416, 3084935324 },
-        { 1788745968, 3505214566, 2988495416 },
-        { 2741627244, 478558438, 3505214566 } },
-      // Matrix for nskip = 4 * 8 ^ 34:
-      { { 4056173823, 1285620078, 357420018 },
-        { 2423072612, 2309408315, 1285620078 },
-        { 1533175115, 2760088020, 2309408315 } },
-      // Matrix for nskip = 5 * 8 ^ 34:
-      { { 3469546091, 369086126, 3478496559 },
-        { 3780710118, 589042104, 369086126 },
-        { 1900191562, 3935275606, 589042104 } },
-      // Matrix for nskip = 6 * 8 ^ 34:
-      { { 1682769046, 1059146837, 2627186100 },
-        { 975501718, 2081627761, 1059146837 },
-        { 4182902400, 2809990303, 2081627761 } },
-      // Matrix for nskip = 7 * 8 ^ 34:
-      { { 3037332387, 2654288975, 181147870 },
-        { 454223518, 808123674, 2654288975 },
-        { 967475810, 1382885174, 808123674 } },
-      // Matrix for nskip = 1 * 8 ^ 35:
-      { { 4264130267, 815015434, 3142242173 },
-        { 180649975, 2500813569, 815015434 },
-        { 3378723563, 829683767, 2500813569 } },
-      // Matrix for nskip = 2 * 8 ^ 35:
-      { { 4174387531, 1030729435, 2812778314 },
-        { 1752988797, 4044178729, 1030729435 },
-        { 467969301, 554748104, 4044178729 } },
-      // Matrix for nskip = 3 * 8 ^ 35:
-      { { 1224655671, 538480994, 911775489 },
-        { 571730491, 1197428336, 538480994 },
-        { 310254483, 3482088360, 1197428336 } },
-      // Matrix for nskip = 4 * 8 ^ 35:
-      { { 1348429235, 2928743274, 3776082629 },
-        { 3607529209, 3069812185, 2928743274 },
-        { 2542432347, 3208181168, 3069812185 } },
-      // Matrix for nskip = 5 * 8 ^ 35:
-      { { 2414375640, 2994139106, 1829200407 },
-        { 3723068499, 3276234188, 2994139106 },
-        { 1384068579, 3863982741, 3276234188 } },
-      // Matrix for nskip = 6 * 8 ^ 35:
-      { { 798763723, 2897556757, 3145856482 },
-        { 3421663444, 3946110585, 2897556757 },
-        { 1853745554, 260368160, 3946110585 } },
-      // Matrix for nskip = 7 * 8 ^ 35:
-      { { 95178102, 3740645591, 3060595950 },
-        { 3321952562, 3932965485, 3740645591 },
-        { 76660843, 2044406932, 3932965485 } },
-      // Matrix for nskip = 1 * 8 ^ 36:
-      { { 4064845753, 668285756, 3816217625 },
-        { 3713143233, 1380634204, 668285756 },
-        { 3533700508, 1192551435, 1380634204 } },
-      // Matrix for nskip = 2 * 8 ^ 36:
-      { { 1515684518, 1706771705, 728123349 },
-        { 3174850469, 2057456462, 1706771705 },
-        { 3410402985, 2897339640, 2057456462 } },
-      // Matrix for nskip = 3 * 8 ^ 36:
-      { { 493252920, 4038063126, 2168451262 },
-        { 363246278, 1249105026, 4038063126 },
-        { 3395543717, 3358422070, 1249105026 } },
-      // Matrix for nskip = 4 * 8 ^ 36:
-      { { 3082272717, 531091457, 1390161328 },
-        { 3895139973, 2171402857, 531091457 },
-        { 4030688141, 3049703400, 2171402857 } },
-      // Matrix for nskip = 5 * 8 ^ 36:
-      { { 3935740675, 2355871533, 3949682718 },
-        { 2931048320, 902295474, 2355871533 },
-        { 847382876, 591758943, 902295474 } },
-      // Matrix for nskip = 6 * 8 ^ 36:
-      { { 1096633558, 956915353, 71119600 },
-        { 1282074175, 3814732591, 956915353 },
-        { 1834617826, 3605659623, 3814732591 } },
-      // Matrix for nskip = 7 * 8 ^ 36:
-      { { 1213485394, 883705085, 1819500595 },
-        { 3547515338, 2658882772, 883705085 },
-        { 3298597677, 2195730734, 2658882772 } },
-      // Matrix for nskip = 1 * 8 ^ 37:
-      { { 1241147206, 3193892819, 1244284192 },
-        { 65180262, 4065669017, 3193892819 },
-        { 1484817937, 3661081858, 4065669017 } },
-      // Matrix for nskip = 2 * 8 ^ 37:
-      { { 1438760812, 3491341751, 3414470157 },
-        { 2805337292, 272266053, 3491341751 },
-        { 824109230, 3202556526, 272266053 } },
-      // Matrix for nskip = 3 * 8 ^ 37:
-      { { 3548908153, 1458259435, 2902555273 },
-        { 3865796034, 2523447078, 1458259435 },
-        { 2359984375, 3898395136, 2523447078 } },
-      // Matrix for nskip = 4 * 8 ^ 37:
-      { { 135412706, 3627115412, 2345042216 },
-        { 1565169824, 2166856449, 3627115412 },
-        { 1026946745, 3467845248, 2166856449 } },
-      // Matrix for nskip = 5 * 8 ^ 37:
-      { { 4146693931, 4048659004, 2768049120 },
-        { 2555866488, 2548281288, 4048659004 },
-        { 2954738533, 4242463239, 2548281288 } },
-      // Matrix for nskip = 6 * 8 ^ 37:
-      { { 1796100563, 2291501743, 3432007410 },
-        { 1204345078, 1110795947, 2291501743 },
-        { 3388382946, 3937816720, 1110795947 } },
-      // Matrix for nskip = 7 * 8 ^ 37:
-      { { 3208221515, 607811602, 223757102 },
-        { 377063363, 3323143974, 607811602 },
-        { 279359428, 3272907713, 3323143974 } },
-      // Matrix for nskip = 1 * 8 ^ 38:
-      { { 1889419951, 3256876154, 1240505488 },
-        { 1254783743, 989966800, 3256876154 },
-        { 1995297400, 3692472918, 989966800 } },
-      // Matrix for nskip = 2 * 8 ^ 38:
-      { { 3206226875, 285700890, 496017472 },
-        { 2515316194, 2129675196, 285700890 },
-        { 1863853990, 2673457552, 2129675196 } },
-      // Matrix for nskip = 3 * 8 ^ 38:
-      { { 2643396669, 1141176790, 2183048631 },
-        { 2796763418, 686457718, 1141176790 },
-        { 3473541724, 755015447, 686457718 } },
-      // Matrix for nskip = 4 * 8 ^ 38:
-      { { 4163770641, 255160418, 772100749 },
-        { 1987092456, 3237660221, 255160418 },
-        { 1394381051, 4216039401, 3237660221 } },
-      // Matrix for nskip = 5 * 8 ^ 38:
-      { { 2744038617, 4151599085, 1086739611 },
-        { 2137012024, 1231067556, 4151599085 },
-        { 2054217062, 1474724988, 1231067556 } },
-      // Matrix for nskip = 6 * 8 ^ 38:
-      { { 1966926556, 2167105562, 3642406633 },
-        { 3575908026, 76072334, 2167105562 },
-        { 438275780, 1024705325, 76072334 } },
-      // Matrix for nskip = 7 * 8 ^ 38:
-      { { 3144149631, 1078973412, 1395133864 },
-        { 1200101371, 2263842276, 1078973412 },
-        { 1990245354, 4126971783, 2263842276 } },
-      // Matrix for nskip = 1 * 8 ^ 39:
-      { { 2133915627, 2713747584, 627765421 },
-        { 2300605925, 35690583, 2713747584 },
-        { 2918902946, 2638220304, 35690583 } },
-      // Matrix for nskip = 2 * 8 ^ 39:
-      { { 2587549655, 998684270, 4292130625 },
-        { 1791772791, 2820705344, 998684270 },
-        { 124590158, 3831143549, 2820705344 } },
-      // Matrix for nskip = 3 * 8 ^ 39:
-      { { 3910080826, 1802646553, 3446926966 },
-        { 129865302, 1755670478, 1802646553 },
-        { 1006007080, 2257707516, 1755670478 } },
-      // Matrix for nskip = 4 * 8 ^ 39:
-      { { 978482299, 3200877282, 497605289 },
-        { 3717741518, 3737164414, 3200877282 },
-        { 4046686626, 861393946, 3737164414 } },
-      // Matrix for nskip = 5 * 8 ^ 39:
-      { { 3183253558, 201453184, 3145469059 },
-        { 3983740037, 3717279042, 201453184 },
-        { 976459397, 485566112, 3717279042 } },
-      // Matrix for nskip = 6 * 8 ^ 39:
-      { { 1649247358, 1293997566, 1141681757 },
-        { 2104529013, 3994478979, 1293997566 },
-        { 12048398, 1296267255, 3994478979 } },
-      // Matrix for nskip = 7 * 8 ^ 39:
-      { { 1277127010, 3409985649, 2357026796 },
-        { 546146378, 1239287374, 3409985649 },
-        { 684416427, 1435662521, 1239287374 } },
-      // Matrix for nskip = 1 * 8 ^ 40:
-      { { 2665561897, 300934584, 3179822945 },
-        { 893043137, 2031413512, 300934584 },
-        { 3806926970, 2413249929, 2031413512 } },
-      // Matrix for nskip = 2 * 8 ^ 40:
-      { { 1417581911, 3071835354, 2575196237 },
-        { 4101127251, 1375339216, 3071835354 },
-        { 847617977, 3632503316, 1375339216 } },
-      // Matrix for nskip = 3 * 8 ^ 40:
-      { { 608673033, 22126256, 3556899267 },
-        { 1727979207, 849327659, 22126256 },
-        { 1702248031, 791369590, 849327659 } },
-      // Matrix for nskip = 4 * 8 ^ 40:
-      { { 2747488994, 3296604805, 898095468 },
-        { 1742777145, 219265369, 3296604805 },
-        { 823714885, 667779292, 219265369 } },
-      // Matrix for nskip = 5 * 8 ^ 40:
-      { { 2021014596, 471433423, 2651735970 },
-        { 585977516, 1605468910, 471433423 },
-        { 549943099, 3890474462, 1605468910 } },
-      // Matrix for nskip = 6 * 8 ^ 40:
-      { { 3574350911, 1933183379, 2250823873 },
-        { 1024311233, 365568357, 1933183379 },
-        { 3430128519, 3029426194, 365568357 } },
-      // Matrix for nskip = 7 * 8 ^ 40:
-      { { 1074178830, 2265105869, 2758013402 },
-        { 4125786414, 1034741107, 2265105869 },
-        { 1441524697, 2229554511, 1034741107 } },
-      // Matrix for nskip = 1 * 8 ^ 41:
-      { { 2640209692, 3040506537, 3626115220 },
-        { 161827078, 852668118, 3040506537 },
-        { 3856381322, 3360242076, 852668118 } },
-      // Matrix for nskip = 2 * 8 ^ 41:
-      { { 3734246393, 4151553160, 4177051283 },
-        { 266522866, 1731798531, 4151553160 },
-        { 632196679, 3864297722, 1731798531 } },
-      // Matrix for nskip = 3 * 8 ^ 41:
-      { { 688933188, 355423319, 287306155 },
-        { 1805598431, 3402169658, 355423319 },
-        { 2000267685, 2145558314, 3402169658 } },
-      // Matrix for nskip = 4 * 8 ^ 41:
-      { { 1694175127, 1087914338, 2384195794 },
-        { 2764925057, 505782858, 1087914338 },
-        { 3235634082, 807915248, 505782858 } },
-      // Matrix for nskip = 5 * 8 ^ 41:
-      { { 993693315, 3946332366, 3916271739 },
-        { 1789813323, 4018933334, 3946332366 },
-        { 441058505, 3553235314, 4018933334 } },
-      // Matrix for nskip = 6 * 8 ^ 41:
-      { { 1144818794, 3134263190, 1846865568 },
-        { 1502689349, 1628360471, 3134263190 },
-        { 745146577, 1872576407, 1628360471 } },
-      // Matrix for nskip = 7 * 8 ^ 41:
-      { { 3398717147, 3990568019, 892329010 },
-        { 3847547913, 3198332877, 3990568019 },
-        { 333749571, 1549630885, 3198332877 } },
-      // Matrix for nskip = 1 * 8 ^ 42:
-      { { 2402749950, 2353776151, 75909174 },
-        { 890570951, 1752665661, 2353776151 },
-        { 3120241607, 3862435696, 1752665661 } },
-      // Matrix for nskip = 2 * 8 ^ 42:
-      { { 2427906178, 3580155704, 949770784 },
-        { 226153695, 1230515664, 3580155704 },
-        { 1988835001, 986791581, 1230515664 } },
-      // Matrix for nskip = 3 * 8 ^ 42:
-      { { 2162922488, 4037183513, 346268022 },
-        { 2752767565, 2852643415, 4037183513 },
-        { 3557895539, 3796282786, 2852643415 } },
-      // Matrix for nskip = 4 * 8 ^ 42:
-      { { 1774047142, 3199155377, 3106427820 },
-        { 1901920839, 4290900039, 3199155377 },
-        { 4178980191, 280623348, 4290900039 } },
-      // Matrix for nskip = 5 * 8 ^ 42:
-      { { 564504637, 3960126556, 13271050 },
-        { 3975695622, 272607318, 3960126556 },
-        { 1199282733, 981722530, 272607318 } },
-      // Matrix for nskip = 6 * 8 ^ 42:
-      { { 3723690896, 3153461912, 693938118 },
-        { 2676196226, 1636264737, 3153461912 },
-        { 764380249, 3364804206, 1636264737 } },
-      // Matrix for nskip = 7 * 8 ^ 42:
-      { { 2002746065, 838117661, 347920205 },
-        { 3311479485, 2381255152, 838117661 },
-        { 4107898714, 2782779087, 2381255152 } },
-      // Matrix for nskip = 1 * 8 ^ 43:
-      { { 3567524348, 1934119675, 3188270128 },
-        { 2997767678, 826363896, 1934119675 },
-        { 262952343, 614326610, 826363896 } },
-      // Matrix for nskip = 2 * 8 ^ 43:
-      { { 1625613062, 4288164505, 2481284279 },
-        { 4273461426, 1177260757, 4288164505 },
-        { 305959988, 4017252267, 1177260757 } },
-      // Matrix for nskip = 3 * 8 ^ 43:
-      { { 3536417809, 429648601, 2955466274 },
-        { 1272075175, 3057838997, 429648601 },
-        { 2269698346, 4011682346, 3057838997 } },
-      // Matrix for nskip = 4 * 8 ^ 43:
-      { { 337929267, 333342539, 418300166 },
-        { 2944208672, 379097734, 333342539 },
-        { 2084056909, 3625475947, 379097734 } },
-      // Matrix for nskip = 5 * 8 ^ 43:
-      { { 68058625, 1918117806, 635887182 },
-        { 1946098288, 2963456150, 1918117806 },
-        { 2625600235, 2337231210, 2963456150 } },
-      // Matrix for nskip = 6 * 8 ^ 43:
-      { { 1700493457, 3627573759, 545164662 },
-        { 1921927973, 1170497671, 3627573759 },
-        { 3094336698, 2906222607, 1170497671 } },
-      // Matrix for nskip = 7 * 8 ^ 43:
-      { { 575329368, 1216196496, 4089812320 },
-        { 2113496301, 1220844336, 1216196496 },
-        { 3926254763, 817590918, 1220844336 } },
-      // Matrix for nskip = 1 * 8 ^ 44:
-      { { 1189899255, 1307754719, 1214919992 },
-        { 3736721708, 3514751918, 1307754719 },
-        { 732435953, 2021244538, 3514751918 } },
-      // Matrix for nskip = 2 * 8 ^ 44:
-      { { 4089172695, 1533534334, 525643282 },
-        { 1497577018, 1335684482, 1533534334 },
-        { 2079007086, 3977541427, 1335684482 } },
-      // Matrix for nskip = 3 * 8 ^ 44:
-      { { 851614119, 2992100005, 2852461785 },
-        { 2850360626, 2514447281, 2992100005 },
-        { 978015612, 1397973230, 2514447281 } },
-      // Matrix for nskip = 4 * 8 ^ 44:
-      { { 3075256652, 2762754934, 3846844247 },
-        { 3057872364, 3274545167, 2762754934 },
-        { 4028573983, 938934351, 3274545167 } },
-      // Matrix for nskip = 5 * 8 ^ 44:
-      { { 1356476668, 2626409409, 1479462144 },
-        { 1188404397, 1260428167, 2626409409 },
-        { 3595448064, 2360949430, 1260428167 } },
-      // Matrix for nskip = 6 * 8 ^ 44:
-      { { 1027674032, 887967109, 3655047107 },
-        { 3381172536, 2839247420, 887967109 },
-        { 1109942153, 1231881661, 2839247420 } },
-      // Matrix for nskip = 7 * 8 ^ 44:
-      { { 3084422684, 3716427472, 3899800153 },
-        { 2713114448, 2433847057, 3716427472 },
-        { 2089286798, 4032596403, 2433847057 } },
-      // Matrix for nskip = 1 * 8 ^ 45:
-      { { 2597859300, 2880151048, 2523330453 },
-        { 1121709186, 175667448, 2880151048 },
-        { 4182510911, 1723133625, 175667448 } },
-      // Matrix for nskip = 2 * 8 ^ 45:
-      { { 484148868, 1404283933, 2982534313 },
-        { 3736767353, 3179865161, 1404283933 },
-        { 391120388, 3758716888, 3179865161 } },
-      // Matrix for nskip = 3 * 8 ^ 45:
-      { { 3773686289, 1118146915, 4257811308 },
-        { 2626215981, 2155767823, 1118146915 },
-        { 4216113535, 234812272, 2155767823 } },
-      // Matrix for nskip = 4 * 8 ^ 45:
-      { { 2138867468, 1128973399, 2133702321 },
-        { 1613561693, 3622350766, 1128973399 },
-        { 1500151924, 3759983985, 3622350766 } },
-      // Matrix for nskip = 5 * 8 ^ 45:
-      { { 2098219600, 3500149955, 509598935 },
-        { 3938592198, 2627573355, 3500149955 },
-        { 2296762399, 2144538279, 2627573355 } },
-      // Matrix for nskip = 6 * 8 ^ 45:
-      { { 1272813809, 709982328, 2430723917 },
-        { 3808746634, 1052744045, 709982328 },
-        { 346250782, 2541155134, 1052744045 } },
-      // Matrix for nskip = 7 * 8 ^ 45:
-      { { 959495863, 240812937, 1778012651 },
-        { 803153186, 1920219267, 240812937 },
-        { 2528085623, 422007, 1920219267 } },
-      // Matrix for nskip = 1 * 8 ^ 46:
-      { { 3027706760, 3786576552, 2698781808 },
-        { 2810527099, 90498489, 3786576552 },
-        { 4220122612, 1855245979, 90498489 } },
-      // Matrix for nskip = 2 * 8 ^ 46:
-      { { 3739389517, 1110440720, 917457922 },
-        { 2163873618, 3707591763, 1110440720 },
-        { 2667061910, 2533383962, 3707591763 } },
-      // Matrix for nskip = 3 * 8 ^ 46:
-      { { 3440567542, 213023128, 821316937 },
-        { 1289665822, 1120982854, 213023128 },
-        { 1107018173, 2157902557, 1120982854 } },
-      // Matrix for nskip = 4 * 8 ^ 46:
-      { { 1545226000, 1812182123, 3693349190 },
-        { 3422065122, 3291428549, 1812182123 },
-        { 1193168720, 2072837757, 3291428549 } },
-      // Matrix for nskip = 5 * 8 ^ 46:
-      { { 1411838727, 1497286518, 2743320941 },
-        { 1476608684, 3759942398, 1497286518 },
-        { 3033567880, 1132137328, 3759942398 } },
-      // Matrix for nskip = 6 * 8 ^ 46:
-      { { 4164586694, 3847046376, 939466538 },
-        { 455920568, 1287777429, 3847046376 },
-        { 2394981758, 891603161, 1287777429 } },
-      // Matrix for nskip = 7 * 8 ^ 46:
-      { { 3992667160, 390631011, 4070853162 },
-        { 1146538952, 1264300453, 390631011 },
-        { 2489808111, 407533173, 1264300453 } },
-      // Matrix for nskip = 1 * 8 ^ 47:
-      { { 3230096243, 2131723358, 3262178024 },
-        { 2882890127, 4088518247, 2131723358 },
-        { 3991553306, 1282224087, 4088518247 } },
-      // Matrix for nskip = 2 * 8 ^ 47:
-      { { 301207261, 1722796810, 3697719854 },
-        { 3350228505, 3410986694, 1722796810 },
-        { 3684514720, 2846958957, 3410986694 } },
-      // Matrix for nskip = 3 * 8 ^ 47:
-      { { 3625524738, 3319692776, 3795749903 },
-        { 1715640681, 1890913372, 3319692776 },
-        { 225727143, 928307593, 1890913372 } },
-      // Matrix for nskip = 4 * 8 ^ 47:
-      { { 1532963114, 4236235786, 3871128158 },
-        { 3540401964, 1285250577, 4236235786 },
-        { 1105070646, 2764245175, 1285250577 } },
-      // Matrix for nskip = 5 * 8 ^ 47:
-      { { 3740934706, 2563937648, 2746910512 },
-        { 3298575982, 2047742419, 2563937648 },
-        { 654443081, 2109897740, 2047742419 } },
-      // Matrix for nskip = 6 * 8 ^ 47:
-      { { 1240524792, 1728254085, 119873755 },
-        { 1505600996, 2604901554, 1728254085 },
-        { 3134968130, 2798059827, 2604901554 } },
-      // Matrix for nskip = 7 * 8 ^ 47:
-      { { 1468859634, 1067606885, 482418964 },
-        { 2025997689, 632183943, 1067606885 },
-        { 152578308, 2630662559, 632183943 } },
-      // Matrix for nskip = 1 * 8 ^ 48:
-      { { 210906218, 3068599594, 3034582784 },
-        { 340633153, 4004365908, 3068599594 },
-        { 4238928187, 2299166464, 4004365908 } },
-      // Matrix for nskip = 2 * 8 ^ 48:
-      { { 2274701639, 3955606166, 3081246407 },
-        { 3199954992, 3948054919, 3955606166 },
-        { 2399101442, 3438340286, 3948054919 } },
-      // Matrix for nskip = 3 * 8 ^ 48:
-      { { 1699759143, 4037535932, 1219209632 },
-        { 633837171, 3333667032, 4037535932 },
-        { 1309772249, 2404397407, 3333667032 } },
-      // Matrix for nskip = 4 * 8 ^ 48:
-      { { 504137100, 1182303684, 201533985 },
-        { 4188299661, 3042453580, 1182303684 },
-        { 2578519273, 2674782930, 3042453580 } },
-      // Matrix for nskip = 5 * 8 ^ 48:
-      { { 592752793, 2717374630, 1743344011 },
-        { 1375705778, 3320840707, 2717374630 },
-        { 128640966, 3026546742, 3320840707 } },
-      // Matrix for nskip = 6 * 8 ^ 48:
-      { { 1370637124, 3074764013, 228550476 },
-        { 1199760826, 3450980261, 3074764013 },
-        { 1618563336, 1054833852, 3450980261 } },
-      // Matrix for nskip = 7 * 8 ^ 48:
-      { { 1611431067, 3710031515, 2854732050 },
-        { 528870942, 2907234375, 3710031515 },
-        { 3445439485, 1092238667, 2907234375 } },
-      // Matrix for nskip = 1 * 8 ^ 49:
-      { { 1382964588, 2578452047, 3140440866 },
-        { 261861891, 1076783073, 2578452047 },
-        { 1634588989, 164438428, 1076783073 } },
-      // Matrix for nskip = 2 * 8 ^ 49:
-      { { 2529186343, 526867394, 3102803247 },
-        { 2687252475, 2908898908, 526867394 },
-        { 1213100579, 86050422, 2908898908 } },
-      // Matrix for nskip = 3 * 8 ^ 49:
-      { { 1961703304, 2865880716, 3245956893 },
-        { 2618763101, 2785604515, 2865880716 },
-        { 2898900229, 1099125661, 2785604515 } },
-      // Matrix for nskip = 4 * 8 ^ 49:
-      { { 2690118316, 538108523, 790337895 },
-        { 4193870709, 1053552056, 538108523 },
-        { 1635227281, 4002399925, 1053552056 } },
-      // Matrix for nskip = 5 * 8 ^ 49:
-      { { 746488794, 2143647216, 1919679021 },
-        { 3920176380, 1994557046, 2143647216 },
-        { 661950432, 921383941, 1994557046 } },
-      // Matrix for nskip = 6 * 8 ^ 49:
-      { { 1934635577, 2678342194, 4048456688 },
-        { 3769235275, 3122368790, 2678342194 },
-        { 3794884445, 2578750044, 3122368790 } },
-      // Matrix for nskip = 7 * 8 ^ 49:
-      { { 2345462407, 3273239577, 504673677 },
-        { 2663769112, 483235505, 3273239577 },
-        { 2863427199, 2990731351, 483235505 } },
-      // Matrix for nskip = 1 * 8 ^ 50:
-      { { 2123712957, 4205383007, 1812304090 },
-        { 1095349745, 166243972, 4205383007 },
-        { 428569070, 2128782357, 166243972 } },
-      // Matrix for nskip = 2 * 8 ^ 50:
-      { { 1330151766, 3569679412, 4107175982 },
-        { 3808641551, 3621125056, 3569679412 },
-        { 4262164578, 1927692878, 3621125056 } },
-      // Matrix for nskip = 3 * 8 ^ 50:
-      { { 4091558631, 3732834681, 466628750 },
-        { 297727134, 2456485740, 3732834681 },
-        { 1818617085, 834096815, 2456485740 } },
-      // Matrix for nskip = 4 * 8 ^ 50:
-      { { 3606295184, 2442739556, 3894922338 },
-        { 1629626641, 2729678535, 2442739556 },
-        { 3379124758, 4279360935, 2729678535 } },
-      // Matrix for nskip = 5 * 8 ^ 50:
-      { { 3518339108, 1807718360, 3760359041 },
-        { 3698267057, 3466970024, 1807718360 },
-        { 3728530930, 3457548085, 3466970024 } },
-      // Matrix for nskip = 6 * 8 ^ 50:
-      { { 2193444679, 408556626, 3012130337 },
-        { 1097569863, 59894341, 408556626 },
-        { 3860432799, 476070138, 59894341 } },
-      // Matrix for nskip = 7 * 8 ^ 50:
-      { { 1063004122, 547821813, 3531749039 },
-        { 3513263202, 1281130561, 547821813 },
-        { 3768689719, 180869393, 1281130561 } },
-      // Matrix for nskip = 1 * 8 ^ 51:
-      { { 1052092278, 4249024666, 919210106 },
-        { 3253349463, 3629539480, 4249024666 },
-        { 852514024, 4025926501, 3629539480 } },
-      // Matrix for nskip = 2 * 8 ^ 51:
-      { { 12394571, 1252747620, 2133571953 },
-        { 4227339509, 3197545170, 1252747620 },
-        { 1884529704, 1976203831, 3197545170 } },
-      // Matrix for nskip = 3 * 8 ^ 51:
-      { { 2331594780, 452832640, 1101195955 },
-        { 2939334015, 2029416251, 452832640 },
-        { 1096100666, 3366782607, 2029416251 } },
-      // Matrix for nskip = 4 * 8 ^ 51:
-      { { 2986331025, 2671019282, 2847338542 },
-        { 3173738401, 3542657885, 2671019282 },
-        { 745203060, 1546667401, 3542657885 } },
-      // Matrix for nskip = 5 * 8 ^ 51:
-      { { 3475245690, 1308019352, 1824121179 },
-        { 2721990050, 584665331, 1308019352 },
-        { 935407479, 3072929538, 584665331 } },
-      // Matrix for nskip = 6 * 8 ^ 51:
-      { { 1254243785, 987948282, 836901607 },
-        { 2154496016, 3293370693, 987948282 },
-        { 2487351160, 2120370930, 3293370693 } },
-      // Matrix for nskip = 7 * 8 ^ 51:
-      { { 614238014, 976296831, 2444588607 },
-        { 3245218993, 99887253, 976296831 },
-        { 4012293175, 407199536, 99887253 } },
-      // Matrix for nskip = 1 * 8 ^ 52:
-      { { 2613012997, 2311336951, 2911336433 },
-        { 1493974713, 92565032, 2311336951 },
-        { 2786645250, 257065974, 92565032 } },
-      // Matrix for nskip = 2 * 8 ^ 52:
-      { { 3424925004, 2776053372, 2204068573 },
-        { 3770626858, 2509257810, 2776053372 },
-        { 2979919489, 1146336783, 2509257810 } },
-      // Matrix for nskip = 3 * 8 ^ 52:
-      { { 2499905758, 2215361770, 3750482090 },
-        { 1105380130, 3511408930, 2215361770 },
-        { 634471839, 2666607166, 3511408930 } },
-      // Matrix for nskip = 4 * 8 ^ 52:
-      { { 1474384834, 827894421, 515339473 },
-        { 1373055755, 1949809417, 827894421 },
-        { 3088339524, 1194193824, 1949809417 } },
-      // Matrix for nskip = 5 * 8 ^ 52:
-      { { 811682426, 1464831324, 673124742 },
-        { 1737209131, 4147063048, 1464831324 },
-        { 104747063, 352467977, 4147063048 } },
-      // Matrix for nskip = 6 * 8 ^ 52:
-      { { 1759193844, 2367252271, 658497461 },
-        { 2079352492, 183217259, 2367252271 },
-        { 4048695575, 533708602, 183217259 } },
-      // Matrix for nskip = 7 * 8 ^ 52:
-      { { 2604083920, 2202319015, 2821035593 },
-        { 3199388318, 109366125, 2202319015 },
-        { 552179285, 3360277248, 109366125 } },
-      // Matrix for nskip = 1 * 8 ^ 53:
-      { { 1825805135, 1289872272, 3700877161 },
-        { 3433422861, 4062509844, 1289872272 },
-        { 3019008744, 2060641859, 4062509844 } },
-      // Matrix for nskip = 2 * 8 ^ 53:
-      { { 3842597153, 4253338264, 3424495942 },
-        { 698444416, 60268595, 4253338264 },
-        { 4096010585, 47309624, 60268595 } },
-      // Matrix for nskip = 3 * 8 ^ 53:
-      { { 496690861, 2839992631, 523849894 },
-        { 3748568076, 1725353677, 2839992631 },
-        { 1590121940, 1652142356, 1725353677 } },
-      // Matrix for nskip = 4 * 8 ^ 53:
-      { { 2662288323, 2043518992, 1593435980 },
-        { 1330201507, 3618850300, 2043518992 },
-        { 2538793204, 271787962, 3618850300 } },
-      // Matrix for nskip = 5 * 8 ^ 53:
-      { { 3290637626, 1877437091, 683414954 },
-        { 297749, 1492496540, 1877437091 },
-        { 2568049682, 3340892636, 1492496540 } },
-      // Matrix for nskip = 6 * 8 ^ 53:
-      { { 1177494705, 170978053, 1258089776 },
-        { 175903832, 2352110692, 170978053 },
-        { 3367780341, 265547447, 2352110692 } },
-      // Matrix for nskip = 7 * 8 ^ 53:
-      { { 4000259518, 1585853138, 1894954679 },
-        { 4025122327, 1695479283, 1585853138 },
-        { 2854628986, 489784443, 1695479283 } },
-      // Matrix for nskip = 1 * 8 ^ 54:
-      { { 741020448, 997594656, 2398808739 },
-        { 1160477043, 1522130854, 997594656 },
-        { 3036916315, 2847712653, 1522130854 } },
-      // Matrix for nskip = 2 * 8 ^ 54:
-      { { 2654964886, 1889728930, 53329096 },
-        { 2042322941, 1621136330, 1889728930 },
-        { 1553642730, 784545882, 1621136330 } },
-      // Matrix for nskip = 3 * 8 ^ 54:
-      { { 900526416, 798626824, 3879214027 },
-        { 2219774094, 2513781045, 798626824 },
-        { 1455564465, 3987302058, 2513781045 } },
-      // Matrix for nskip = 4 * 8 ^ 54:
-      { { 1715219514, 2831829177, 929124824 },
-        { 997274536, 404228189, 2831829177 },
-        { 1386575385, 4107238699, 404228189 } },
-      // Matrix for nskip = 5 * 8 ^ 54:
-      { { 3216180354, 346253769, 2204236686 },
-        { 620690291, 2037367915, 346253769 },
-        { 1423172488, 2780020913, 2037367915 } },
-      // Matrix for nskip = 6 * 8 ^ 54:
-      { { 1361559514, 2840866920, 2161766692 },
-        { 3777816531, 4291736115, 2840866920 },
-        { 1449118903, 455358549, 4291736115 } },
-      // Matrix for nskip = 7 * 8 ^ 54:
-      { { 3361155093, 1442101330, 2915072798 },
-        { 270047328, 973080601, 1442101330 },
-        { 2538519465, 2830816977, 973080601 } },
-      // Matrix for nskip = 1 * 8 ^ 55:
-      { { 3928131551, 2912523524, 1840499723 },
-        { 4216003022, 2970489088, 2912523524 },
-        { 1158689953, 1425511081, 2970489088 } },
-      // Matrix for nskip = 2 * 8 ^ 55:
-      { { 2807004452, 2510299562, 271603006 },
-        { 2505735035, 2370490899, 2510299562 },
-        { 10873814, 2450376936, 2370490899 } },
-      // Matrix for nskip = 3 * 8 ^ 55:
-      { { 895842640, 1513759891, 652184790 },
-        { 337719276, 3793171443, 1513759891 },
-        { 661495819, 1882293939, 3793171443 } },
-      // Matrix for nskip = 4 * 8 ^ 55:
-      { { 2000734342, 1113679064, 2502160539 },
-        { 1475266926, 2787925323, 1113679064 },
-        { 1475797635, 3044470744, 2787925323 } },
-      // Matrix for nskip = 5 * 8 ^ 55:
-      { { 1766616799, 722317846, 1586650055 },
-        { 1016766460, 76599155, 722317846 },
-        { 2574759301, 623201703, 76599155 } },
-      // Matrix for nskip = 6 * 8 ^ 55:
-      { { 3664739404, 4014926443, 1080154168 },
-        { 2495955387, 1724853627, 4014926443 },
-        { 536042925, 1256783759, 1724853627 } },
-      // Matrix for nskip = 7 * 8 ^ 55:
-      { { 4046813655, 3373283605, 3767126799 },
-        { 1560329332, 2618021767, 3373283605 },
-        { 527165723, 2030169433, 2618021767 } },
-      // Matrix for nskip = 1 * 8 ^ 56:
-      { { 1457157056, 1252556678, 3073232607 },
-        { 1926798761, 3639907189, 1252556678 },
-        { 2067740348, 2256217204, 3639907189 } },
-      // Matrix for nskip = 2 * 8 ^ 56:
-      { { 3740999688, 1035400458, 3162437311 },
-        { 4126312242, 686702830, 1035400458 },
-        { 1699805291, 667792040, 686702830 } },
-      // Matrix for nskip = 3 * 8 ^ 56:
-      { { 1345468819, 1338322079, 817781640 },
-        { 2710885009, 1935673443, 1338322079 },
-        { 877889863, 2304324596, 1935673443 } },
-      // Matrix for nskip = 4 * 8 ^ 56:
-      { { 2422495016, 3203768688, 1858240466 },
-        { 848719394, 4092709154, 3203768688 },
-        { 659945473, 1863075174, 4092709154 } },
-      // Matrix for nskip = 5 * 8 ^ 56:
-      { { 21345609, 2944772441, 1446242483 },
-        { 3854092115, 3931174287, 2944772441 },
-        { 3818334033, 340393141, 3931174287 } },
-      // Matrix for nskip = 6 * 8 ^ 56:
-      { { 2472609977, 1572317229, 2146084483 },
-        { 386210076, 1579232146, 1572317229 },
-        { 3154153453, 3349077947, 1579232146 } },
-      // Matrix for nskip = 7 * 8 ^ 56:
-      { { 3934658083, 1547798902, 578076866 },
-        { 3707114992, 1649964845, 1547798902 },
-        { 3740686873, 217906160, 1649964845 } },
-      // Matrix for nskip = 1 * 8 ^ 57:
-      { { 246817944, 871751352, 2834051003 },
-        { 3976202597, 3721214025, 871751352 },
-        { 783929942, 745295675, 3721214025 } },
-      // Matrix for nskip = 2 * 8 ^ 57:
-      { { 3811740424, 3603608092, 2365398362 },
-        { 3826150877, 2906557036, 3603608092 },
-        { 2300510686, 966815948, 2906557036 } },
-      // Matrix for nskip = 3 * 8 ^ 57:
-      { { 2004086842, 752045049, 1443259442 },
-        { 4222485982, 2275171478, 752045049 },
-        { 959250674, 2731257760, 2275171478 } },
-      // Matrix for nskip = 4 * 8 ^ 57:
-      { { 2816329160, 18201123, 3367710570 },
-        { 437309679, 2220769388, 18201123 },
-        { 1346863388, 705296543, 2220769388 } },
-      // Matrix for nskip = 5 * 8 ^ 57:
-      { { 3868848671, 3006483395, 3903615747 },
-        { 1680524656, 2885742075, 3006483395 },
-        { 796648897, 2121364560, 2885742075 } },
-      // Matrix for nskip = 6 * 8 ^ 57:
-      { { 2743985808, 1183199523, 686976485 },
-        { 3080242732, 497836434, 1183199523 },
-        { 2146196184, 523073130, 497836434 } },
-      // Matrix for nskip = 7 * 8 ^ 57:
-      { { 281969912, 3168583843, 3387530534 },
-        { 3604375441, 89658761, 3168583843 },
-        { 3122537866, 3405552447, 89658761 } },
-      // Matrix for nskip = 1 * 8 ^ 58:
-      { { 3310028953, 1662315499, 132645114 },
-        { 2572908401, 3105849797, 1662315499 },
-        { 1937586849, 1735620028, 3105849797 } },
-      // Matrix for nskip = 2 * 8 ^ 58:
-      { { 461386353, 1359675853, 3599822966 },
-        { 106675209, 2044154050, 1359675853 },
-        { 1787730088, 1149892630, 2044154050 } },
-      // Matrix for nskip = 3 * 8 ^ 58:
-      { { 1678397435, 2034254929, 404593054 },
-        { 308885052, 4143854702, 2034254929 },
-        { 1276625905, 1557265403, 4143854702 } },
-      // Matrix for nskip = 4 * 8 ^ 58:
-      { { 3303902397, 345146034, 1417149696 },
-        { 2231869247, 1116882637, 345146034 },
-        { 1846832385, 79626976, 1116882637 } },
-      // Matrix for nskip = 5 * 8 ^ 58:
-      { { 3163825854, 3437355918, 3790302358 },
-        { 2966738005, 405418248, 3437355918 },
-        { 2935909124, 1737823953, 405418248 } },
-      // Matrix for nskip = 6 * 8 ^ 58:
-      { { 4188280456, 4245794318, 2115856958 },
-        { 3899866941, 2230248511, 4245794318 },
-        { 4151131385, 1810874924, 2230248511 } },
-      // Matrix for nskip = 7 * 8 ^ 58:
-      { { 3183442289, 2647800101, 3155584995 },
-        { 1803347712, 3081729031, 2647800101 },
-        { 344634507, 408464888, 3081729031 } },
-      // Matrix for nskip = 1 * 8 ^ 59:
-      { { 2765049417, 3117782790, 1805260159 },
-        { 3796182890, 1101141726, 3117782790 },
-        { 224270120, 1004001443, 1101141726 } },
-      // Matrix for nskip = 2 * 8 ^ 59:
-      { { 89118668, 2494198515, 1356989069 },
-        { 2490435731, 997151755, 2494198515 },
-        { 1175528637, 3444341166, 997151755 } },
-      // Matrix for nskip = 3 * 8 ^ 59:
-      { { 2610383359, 3160454394, 1595264559 },
-        { 613651010, 1733540130, 3160454394 },
-        { 1119988193, 1810350755, 1733540130 } },
-      // Matrix for nskip = 4 * 8 ^ 59:
-      { { 2340639019, 510225634, 286119182 },
-        { 2045217287, 1194574818, 510225634 },
-        { 2662281592, 1728500627, 1194574818 } },
-      // Matrix for nskip = 5 * 8 ^ 59:
-      { { 1447842232, 184782823, 1797257364 },
-        { 2190899193, 2854828033, 184782823 },
-        { 4138436503, 3783089951, 2854828033 } },
-      // Matrix for nskip = 6 * 8 ^ 59:
-      { { 3892495210, 2262141136, 1078367555 },
-        { 3549231332, 2559113701, 2262141136 },
-        { 4146978688, 2236162592, 2559113701 } },
-      // Matrix for nskip = 7 * 8 ^ 59:
-      { { 1510077366, 825286037, 2959985729 },
-        { 830287146, 781759955, 825286037 },
-        { 359509185, 3182735706, 781759955 } },
-      // Matrix for nskip = 1 * 8 ^ 60:
-      { { 210787847, 1189120688, 2848040407 },
-        { 1087786165, 2343328484, 1189120688 },
-        { 3465141330, 2893041005, 2343328484 } },
-      // Matrix for nskip = 2 * 8 ^ 60:
-      { { 3438170226, 3236285682, 962036916 },
-        { 2873263091, 215280489, 3236285682 },
-        { 730413847, 1474823842, 215280489 } },
-      // Matrix for nskip = 3 * 8 ^ 60:
-      { { 1877599976, 489218847, 1841260926 },
-        { 1267710679, 4177426677, 489218847 },
-        { 3908192573, 1193948814, 4177426677 } },
-      // Matrix for nskip = 4 * 8 ^ 60:
-      { { 1566461658, 133010024, 2886695328 },
-        { 2835827516, 653809404, 133010024 },
-        { 3082882924, 3710942807, 653809404 } },
-      // Matrix for nskip = 5 * 8 ^ 60:
-      { { 1018639212, 4003411060, 3748771156 },
-        { 933110981, 1484000297, 4003411060 },
-        { 3415991698, 3188783681, 1484000297 } },
-      // Matrix for nskip = 6 * 8 ^ 60:
-      { { 2630823869, 3185784250, 1624263326 },
-        { 1151112872, 440283001, 3185784250 },
-        { 4029103059, 1089550911, 440283001 } },
-      // Matrix for nskip = 7 * 8 ^ 60:
-      { { 2558003006, 4161490031, 868072046 },
-        { 2993166332, 1972186265, 4161490031 },
-        { 1890899803, 3731240792, 1972186265 } },
-      // Matrix for nskip = 1 * 8 ^ 61:
-      { { 4201558916, 1263786956, 326001602 },
-        { 762846463, 621546357, 1263786956 },
-        { 2697142404, 1156650856, 621546357 } },
-      // Matrix for nskip = 2 * 8 ^ 61:
-      { { 2655768102, 2339029465, 2430211448 },
-        { 2669906627, 403962847, 2339029465 },
-        { 1483118807, 639660658, 403962847 } },
-      // Matrix for nskip = 3 * 8 ^ 61:
-      { { 343789192, 2523152864, 3692813188 },
-        { 4182218791, 1387544806, 2523152864 },
-        { 3364170107, 1607749365, 1387544806 } },
-      // Matrix for nskip = 4 * 8 ^ 61:
-      { { 3508595200, 4228486662, 754946994 },
-        { 1913148390, 3500531602, 4228486662 },
-        { 24637, 3773159052, 3500531602 } },
-      // Matrix for nskip = 5 * 8 ^ 61:
-      { { 1767736432, 2782451483, 925961005 },
-        { 1898573829, 779641045, 2782451483 },
-        { 4172425777, 3053709304, 779641045 } },
-      // Matrix for nskip = 6 * 8 ^ 61:
-      { { 917982480, 676540794, 3402535509 },
-        { 1997794025, 3184854268, 676540794 },
-        { 2501974390, 2557204628, 3184854268 } },
-      // Matrix for nskip = 7 * 8 ^ 61:
-      { { 2265059434, 3533015776, 2085907395 },
-        { 1105268907, 2837239505, 3533015776 },
-        { 3031242459, 1173739788, 2837239505 } },
-      // Matrix for nskip = 1 * 8 ^ 62:
-      { { 4024866227, 1143874914, 3205058469 },
-        { 2970344133, 2873927273, 1143874914 },
-        { 2167114735, 4095476435, 2873927273 } },
-      // Matrix for nskip = 2 * 8 ^ 62:
-      { { 1479401095, 2958366486, 3027708794 },
-        { 2704486034, 3574053987, 2958366486 },
-        { 3630964515, 1276667706, 3574053987 } },
-      // Matrix for nskip = 3 * 8 ^ 62:
-      { { 3471121, 4212261536, 2870367456 },
-        { 3210276198, 3855580426, 4212261536 },
-        { 2974755971, 3723431054, 3855580426 } },
-      // Matrix for nskip = 4 * 8 ^ 62:
-      { { 2035927380, 1363628533, 818363998 },
-        { 3023327955, 3968427114, 1363628533 },
-        { 1284825950, 2871663372, 3968427114 } },
-      // Matrix for nskip = 5 * 8 ^ 62:
-      { { 4289867114, 1817891047, 2823353497 },
-        { 910331225, 3868760780, 1817891047 },
-        { 2783151834, 2379034525, 3868760780 } },
-      // Matrix for nskip = 6 * 8 ^ 62:
-      { { 2979837612, 1089982006, 1663630835 },
-        { 709699817, 1486004709, 1089982006 },
-        { 1956455708, 1787357723, 1486004709 } },
-      // Matrix for nskip = 7 * 8 ^ 62:
-      { { 2852981955, 2215534550, 574323950 },
-        { 1169533157, 2975065186, 2215534550 },
-        { 2290801870, 428188634, 2975065186 } },
-      // Matrix for nskip = 1 * 8 ^ 63:
-      { { 3827747418, 3897287251, 4106993377 },
-        { 1527779946, 3221052941, 3897287251 },
-        { 4178727866, 4281160673, 3221052941 } },
-      // Matrix for nskip = 2 * 8 ^ 63:
-      { { 1174358892, 2835476193, 959978619 },
-        { 850076464, 3774782533, 2835476193 },
-        { 3880910680, 3237990203, 3774782533 } },
-      // Matrix for nskip = 3 * 8 ^ 63:
-      { { 1400690756, 823435890, 1896847210 },
-        { 3000499818, 1124911735, 823435890 },
-        { 1381972838, 2683742666, 1124911735 } },
-      // Matrix for nskip = 4 * 8 ^ 63:
-      { { 3128011728, 1998893251, 1400155768 },
-        { 1430713735, 2850730926, 1998893251 },
-        { 1073801764, 2374744218, 2850730926 } },
-      // Matrix for nskip = 5 * 8 ^ 63:
-      { { 1152423219, 3000721466, 850809698 },
-        { 764299143, 3684505492, 3000721466 },
-        { 3524599640, 1299858048, 3684505492 } },
-      // Matrix for nskip = 6 * 8 ^ 63:
-      { { 2188428625, 3090564778, 4205068615 },
-        { 1911908313, 34180751, 3090564778 },
-        { 3382776937, 194682771, 34180751 } },
-      // Matrix for nskip = 7 * 8 ^ 63:
-      { { 3050396426, 627769205, 3010308075 },
-        { 987718671, 3026731980, 627769205 },
-        { 3527778260, 4200640347, 3026731980 } },
-      // Matrix for nskip = 1 * 8 ^ 64:
-      { { 364496809, 3951443831, 2338985995 },
-        { 2365728271, 1745134545, 3951443831 },
-        { 1076500940, 1589192585, 1745134545 } },
-      // Matrix for nskip = 2 * 8 ^ 64:
-      { { 3304498837, 1325046906, 3381970501 },
-        { 1563368115, 3116266625, 1325046906 },
-        { 244825785, 4251678855, 3116266625 } },
-      // Matrix for nskip = 3 * 8 ^ 64:
-      { { 4133678667, 2048440215, 4035662430 },
-        { 4086919994, 519191900, 2048440215 },
-        { 2789936683, 4051608893, 519191900 } },
-      // Matrix for nskip = 4 * 8 ^ 64:
-      { { 3603289991, 1324164821, 1776019579 },
-        { 1734804890, 3151589272, 1324164821 },
-        { 2411297223, 3296772386, 3151589272 } },
-      // Matrix for nskip = 5 * 8 ^ 64:
-      { { 2599419541, 2726072264, 662164094 },
-        { 1554537872, 2065618870, 2726072264 },
-        { 1049180268, 439080215, 2065618870 } },
-      // Matrix for nskip = 6 * 8 ^ 64:
-      { { 1694996757, 2289284793, 2258832764 },
-        { 1982364129, 3971544391, 2289284793 },
-        { 1140613093, 2605325759, 3971544391 } },
-      // Matrix for nskip = 7 * 8 ^ 64:
-      { { 3344108032, 1353133572, 2611828466 },
-        { 729814057, 219879593, 1353133572 },
-        { 1513768211, 1797897504, 219879593 } } },
-    // Matrix for nskip = 1 * 8 ^ 0:
-    { { { 0, 1, 0 }, { 0, 0, 1 }, { 4293573854, 0, 527612 } },
-      // Matrix for nskip = 2 * 8 ^ 0:
-      { { 0, 0, 1 }, { 4293573854, 0, 527612 }, { 2706407399, 4293573854, 3497978192 } },
-      // Matrix for nskip = 3 * 8 ^ 0:
-      { { 4293573854, 0, 527612 },
-        { 2706407399, 4293573854, 3497978192 },
-        { 1431525864, 2706407399, 3281754271 } },
-      // Matrix for nskip = 4 * 8 ^ 0:
-      { { 2706407399, 4293573854, 3497978192 },
-        { 1431525864, 2706407399, 3281754271 },
-        { 97673890, 1431525864, 1673476130 } },
-      // Matrix for nskip = 5 * 8 ^ 0:
-      { { 1431525864, 2706407399, 3281754271 },
-        { 97673890, 1431525864, 1673476130 },
-        { 2680076935, 97673890, 1430724370 } },
-      // Matrix for nskip = 6 * 8 ^ 0:
-      { { 97673890, 1431525864, 1673476130 },
-        { 2680076935, 97673890, 1430724370 },
-        { 3405842137, 2680076935, 893509979 } },
-      // Matrix for nskip = 7 * 8 ^ 0:
-      { { 2680076935, 97673890, 1430724370 },
-        { 3405842137, 2680076935, 893509979 },
-        { 4035147174, 3405842137, 3280220074 } },
-      // Matrix for nskip = 1 * 8 ^ 1:
-      { { 3405842137, 2680076935, 893509979 },
-        { 4035147174, 3405842137, 3280220074 },
-        { 2623373296, 4035147174, 361718588 } },
-      // Matrix for nskip = 2 * 8 ^ 1:
-      { { 818368950, 3790774567, 3542344109 },
-        { 1817134745, 818368950, 3321940838 },
-        { 3493477402, 1817134745, 2854655037 } },
-      // Matrix for nskip = 3 * 8 ^ 1:
-      { { 508190223, 940389731, 295549677 },
-        { 548891792, 508190223, 4243623497 },
-        { 1618914183, 548891792, 2585942386 } },
-      // Matrix for nskip = 4 * 8 ^ 1:
-      { { 498682467, 2928649385, 811441367 },
-        { 1777037472, 498682467, 479207863 },
-        { 3058260025, 1777037472, 1528225099 } },
-      // Matrix for nskip = 5 * 8 ^ 1:
-      { { 1605006689, 1112484358, 2137070446 },
-        { 3785946674, 1605006689, 1949907406 },
-        { 3243030173, 3785946674, 2339202713 } },
-      // Matrix for nskip = 6 * 8 ^ 1:
-      { { 1603012465, 493710616, 1996495269 },
-        { 3369502947, 1603012465, 1576432507 },
-        { 3762770058, 3369502947, 254897698 } },
-      // Matrix for nskip = 7 * 8 ^ 1:
-      { { 1138020476, 4025114134, 3077305804 },
-        { 4152260747, 1138020476, 1057298006 },
-        { 1828211552, 4152260747, 3984471979 } },
-      // Matrix for nskip = 1 * 8 ^ 2:
-      { { 3893311647, 3140922085, 64039185 },
-        { 82107183, 3893311647, 2655465224 },
-        { 1674879036, 82107183, 1089381262 } },
-      // Matrix for nskip = 2 * 8 ^ 2:
-      { { 28639152, 3496041927, 2231910770 },
-        { 3174683233, 28639152, 2828785870 },
-        { 3681140872, 3174683233, 3910194649 } },
-      // Matrix for nskip = 3 * 8 ^ 2:
-      { { 3488684910, 1250231333, 763303055 },
-        { 681409874, 3488684910, 751154769 },
-        { 3783909260, 681409874, 1465244270 } },
-      // Matrix for nskip = 4 * 8 ^ 2:
-      { { 1463826069, 300842059, 3313769518 },
-        { 1799677538, 1463826069, 3174861078 },
-        { 1882279394, 1799677538, 3509975160 } },
-      // Matrix for nskip = 5 * 8 ^ 2:
-      { { 2793448161, 3690337147, 4181759810 },
-        { 514622120, 2793448161, 3027286223 },
-        { 241620347, 514622120, 1328063696 } },
-      // Matrix for nskip = 6 * 8 ^ 2:
-      { { 3250099852, 3207068910, 3709263791 },
-        { 2342747328, 3250099852, 3729690850 },
-        { 3983203494, 2342747328, 1023622970 } },
-      // Matrix for nskip = 7 * 8 ^ 2:
-      { { 3136295372, 3178055245, 2818424094 },
-        { 2036073935, 3136295372, 3231583326 },
-        { 1782478065, 2036073935, 1053332972 } },
-      // Matrix for nskip = 1 * 8 ^ 3:
-      { { 2092194020, 184076987, 2202401252 },
-        { 3103629604, 2092194020, 3409560232 },
-        { 4257445059, 3103629604, 2390202783 } },
-      // Matrix for nskip = 2 * 8 ^ 3:
-      { { 812917091, 2574011276, 4168802395 },
-        { 209817750, 812917091, 2974870628 },
-        { 3238802184, 209817750, 3692836406 } },
-      // Matrix for nskip = 3 * 8 ^ 3:
-      { { 1621943577, 2244624888, 38864005 },
-        { 3618177584, 1621943577, 3295260066 },
-        { 414159965, 3618177584, 1095692911 } },
-      // Matrix for nskip = 4 * 8 ^ 3:
-      { { 477309738, 3314523413, 3442242150 },
-        { 2755731404, 477309738, 2782713347 },
-        { 1606221490, 2755731404, 1033463096 } },
-      // Matrix for nskip = 5 * 8 ^ 3:
-      { { 3233499061, 2494617440, 1002517819 },
-        { 3026123612, 3233499061, 3338202446 },
-        { 1979145017, 3026123612, 3790308130 } },
-      // Matrix for nskip = 6 * 8 ^ 3:
-      { { 2567113113, 781663248, 3993869449 },
-        { 402756912, 2567113113, 2817097718 },
-        { 3190930010, 402756912, 2884691291 } },
-      // Matrix for nskip = 7 * 8 ^ 3:
-      { { 2223683788, 4195752245, 2738363134 },
-        { 1171605168, 2223683788, 3904649711 },
-        { 2631005941, 1171605168, 3445807882 } },
-      // Matrix for nskip = 1 * 8 ^ 4:
-      { { 2155469603, 3326516116, 3843369786 },
-        { 288604458, 2155469603, 571673571 },
-        { 1501677614, 288604458, 2928213494 } },
-      // Matrix for nskip = 2 * 8 ^ 4:
-      { { 2082469029, 749754403, 3963963316 },
-        { 2764859700, 2082469029, 3576428059 },
-        { 2840894706, 2764859700, 1782279859 } },
-      // Matrix for nskip = 3 * 8 ^ 4:
-      { { 1583407457, 2056027805, 55614242 },
-        { 2405645826, 1583407457, 1737043333 },
-        { 1118910623, 2405645826, 1180559812 } },
-      // Matrix for nskip = 4 * 8 ^ 4:
-      { { 3760163766, 1041986082, 1799196192 },
-        { 1022129134, 3760163766, 1332558840 },
-        { 276873446, 1022129134, 3979423632 } },
-      // Matrix for nskip = 5 * 8 ^ 4:
-      { { 1438626566, 3619082489, 1569836243 },
-        { 3671597039, 1438626566, 907924984 },
-        { 3732297029, 3671597039, 1221779212 } },
-      // Matrix for nskip = 6 * 8 ^ 4:
-      { { 483787924, 3115606677, 2374703971 },
-        { 117552025, 483787924, 4234241969 },
-        { 774331833, 117552025, 530787287 } },
-      // Matrix for nskip = 7 * 8 ^ 4:
-      { { 955925224, 1961750426, 3644821859 },
-        { 213414981, 955925224, 927956770 },
-        { 1671634731, 213414981, 4186423122 } },
-      // Matrix for nskip = 1 * 8 ^ 5:
-      { { 1021313167, 1312544548, 1716381787 },
-        { 3037868518, 1021313167, 199085085 },
-        { 2582787611, 3037868518, 3539882179 } },
-      // Matrix for nskip = 2 * 8 ^ 5:
-      { { 2569413030, 1631336015, 2594942403 },
-        { 1030618503, 2569413030, 3467650326 },
-        { 1998739584, 1030618503, 3174552073 } },
-      // Matrix for nskip = 3 * 8 ^ 5:
-      { { 2179955734, 1825159949, 1082151624 },
-        { 937147983, 2179955734, 978382746 },
-        { 2629591623, 937147983, 3579678559 } },
-      // Matrix for nskip = 4 * 8 ^ 5:
-      { { 2334639309, 3114094203, 601680947 },
-        { 2110199318, 2334639309, 678342865 },
-        { 1649523168, 2110199318, 2154948056 } },
-      // Matrix for nskip = 5 * 8 ^ 5:
-      { { 2715012491, 247412130, 1566452082 },
-        { 3425439428, 2715012491, 3004133824 },
-        { 1615468474, 3425439428, 588082730 } },
-      // Matrix for nskip = 6 * 8 ^ 5:
-      { { 2654502125, 654123598, 3954383978 },
-        { 2454987531, 2654502125, 161781366 },
-        { 3631058630, 2454987531, 2718719935 } },
-      // Matrix for nskip = 7 * 8 ^ 5:
-      { { 2620087047, 1022484731, 3275546712 },
-        { 4119759001, 2620087047, 1849544363 },
-        { 1245152096, 4119759001, 2978477502 } },
-      // Matrix for nskip = 1 * 8 ^ 6:
-      { { 563657176, 191330473, 1641595774 },
-        { 780563537, 563657176, 3029522338 },
-        { 2037330914, 780563537, 2084602709 } },
-      // Matrix for nskip = 2 * 8 ^ 6:
-      { { 3414769923, 1968799026, 2238126504 },
-        { 832866376, 3414769923, 3754780168 },
-        { 2165145850, 832866376, 1594768331 } },
-      // Matrix for nskip = 3 * 8 ^ 6:
-      { { 1457310151, 2262086849, 2480319255 },
-        { 1778576621, 1457310151, 367796024 },
-        { 444536774, 1778576621, 873301213 } },
-      // Matrix for nskip = 4 * 8 ^ 6:
-      { { 1646861218, 2317984620, 2301581548 },
-        { 2672536210, 1646861218, 359763062 },
-        { 2391283983, 2672536210, 1885870777 } },
-      // Matrix for nskip = 5 * 8 ^ 6:
-      { { 2962497351, 1089931025, 970191811 },
-        { 2050228336, 2962497351, 1568166288 },
-        { 3288162415, 2050228336, 3921597644 } },
-      // Matrix for nskip = 6 * 8 ^ 6:
-      { { 2468196470, 3544275509, 3557597196 },
-        { 3893425026, 2468196470, 2061293842 },
-        { 2019325804, 3893425026, 2905314 } },
-      // Matrix for nskip = 7 * 8 ^ 6:
-      { { 3407411651, 4206194937, 989129012 },
-        { 1280115996, 3407411651, 1843205351 },
-        { 752661975, 1280115996, 693779416 } },
-      // Matrix for nskip = 1 * 8 ^ 7:
-      { { 841254072, 3765813448, 1635365181 },
-        { 2013240130, 841254072, 605925849 },
-        { 3743932305, 2013240130, 400681955 } },
-      // Matrix for nskip = 2 * 8 ^ 7:
-      { { 1930213004, 2072952279, 3077694794 },
-        { 3579956569, 1930213004, 2478539210 },
-        { 1960229502, 3579956569, 1455652656 } },
-      // Matrix for nskip = 3 * 8 ^ 7:
-      { { 490241598, 1155806426, 2341304300 },
-        { 1821354750, 490241598, 2364275695 },
-        { 3717764728, 1821354750, 1349151461 } },
-      // Matrix for nskip = 4 * 8 ^ 7:
-      { { 1097613522, 1784540933, 1194440107 },
-        { 321747515, 1097613522, 1225209584 },
-        { 74521379, 321747515, 4288531000 } },
-      // Matrix for nskip = 5 * 8 ^ 7:
-      { { 3795899570, 3294470896, 2568537852 },
-        { 1615892324, 3795899570, 2277651644 },
-        { 245018475, 1615892324, 3269831184 } },
-      // Matrix for nskip = 6 * 8 ^ 7:
-      { { 2284610128, 1711688841, 2988405862 },
-        { 1861018675, 2284610128, 3450880655 },
-        { 4077631310, 1861018675, 2595646099 } },
-      // Matrix for nskip = 7 * 8 ^ 7:
-      { { 1338063869, 4236188627, 4005334159 },
-        { 2199059659, 1338063869, 3613475430 },
-        { 954928333, 2199059659, 1383222658 } },
-      // Matrix for nskip = 1 * 8 ^ 8:
-      { { 143812745, 3254530816, 3514348856 },
-        { 769295000, 143812745, 2468210728 },
-        { 1927161272, 769295000, 522705580 } },
-      // Matrix for nskip = 2 * 8 ^ 8:
-      { { 2692035063, 2596905012, 1643240704 },
-        { 1103432342, 2692035063, 1446182108 },
-        { 4161111774, 1103432342, 3076435551 } },
-      // Matrix for nskip = 3 * 8 ^ 8:
-      { { 1809137988, 2412502608, 3993875038 },
-        { 1332423877, 1809137988, 3101816103 },
-        { 1366553339, 1332423877, 2986424418 } },
-      // Matrix for nskip = 4 * 8 ^ 8:
-      { { 2375319030, 1391532370, 3742334018 },
-        { 1202100604, 2375319030, 4098434768 },
-        { 2327872488, 1202100604, 1471526950 } },
-      // Matrix for nskip = 5 * 8 ^ 8:
-      { { 953526753, 3517620599, 1558514368 },
-        { 3674658855, 953526753, 1517070807 },
-        { 828283166, 3674658855, 2689974385 } },
-      // Matrix for nskip = 6 * 8 ^ 8:
-      { { 3063334100, 3228801559, 269715831 },
-        { 612058994, 3063334100, 4143597212 },
-        { 1918225488, 612058994, 2055175984 } },
-      // Matrix for nskip = 7 * 8 ^ 8:
-      { { 2623568215, 482061697, 191091208 },
-        { 2499397071, 2623568215, 2970642011 },
-        { 759749547, 2499397071, 3510580843 } },
-      // Matrix for nskip = 1 * 8 ^ 9:
-      { { 4269164791, 2795313144, 2507855960 },
-        { 4245372460, 4269164791, 4094914553 },
-        { 3873219634, 4245372460, 1473695507 } },
-      // Matrix for nskip = 2 * 8 ^ 9:
-      { { 513890845, 1208902926, 2870530442 },
-        { 1984873167, 513890845, 1257532340 },
-        { 1212627640, 1984873167, 2354363842 } },
-      // Matrix for nskip = 3 * 8 ^ 9:
-      { { 3386048256, 4196280201, 3121820178 },
-        { 2926727276, 3386048256, 2790144637 },
-        { 3970110476, 2926727276, 3495704635 } },
-      // Matrix for nskip = 4 * 8 ^ 9:
-      { { 1848364568, 1552116673, 3496528455 },
-        { 4160778291, 1848364568, 141769900 },
-        { 3611019106, 4160778291, 596424080 } },
-      // Matrix for nskip = 5 * 8 ^ 9:
-      { { 4194097650, 3986230829, 3091752508 },
-        { 3352554321, 4194097650, 4041363667 },
-        { 3822925061, 3352554321, 3748054631 } },
-      // Matrix for nskip = 6 * 8 ^ 9:
-      { { 1292986218, 172755364, 997232463 },
-        { 1505642955, 1292986218, 4112978448 },
-        { 1757204931, 1505642955, 3038511100 } },
-      // Matrix for nskip = 7 * 8 ^ 9:
-      { { 3805104355, 3540279669, 2118304338 },
-        { 1984875159, 3805104355, 3000869736 },
-        { 6466700, 1984875159, 1778898381 } },
-      // Matrix for nskip = 1 * 8 ^ 10:
-      { { 364070020, 3520039729, 837362349 },
-        { 2544671570, 364070020, 2188646679 },
-        { 163978331, 2544671570, 672947816 } },
-      // Matrix for nskip = 2 * 8 ^ 10:
-      { { 1192700714, 3968150021, 298357363 },
-        { 635565666, 1192700714, 2589432341 },
-        { 2548654227, 635565666, 3531570992 } },
-      // Matrix for nskip = 3 * 8 ^ 10:
-      { { 3438963520, 1845346034, 2575726025 },
-        { 2187600669, 3438963520, 958916489 },
-        { 2672427080, 2187600669, 3420061274 } },
-      // Matrix for nskip = 4 * 8 ^ 10:
-      { { 2709640529, 676525399, 875361870 },
-        { 1315499519, 2709640529, 3842690720 },
-        { 3300994644, 1315499519, 2446760804 } },
-      // Matrix for nskip = 5 * 8 ^ 10:
-      { { 1292317767, 393678487, 143711415 },
-        { 1162526988, 1292317767, 1311572745 },
-        { 344898630, 1162526988, 1362796547 } },
-      // Matrix for nskip = 6 * 8 ^ 10:
-      { { 2857812374, 598000082, 2114605560 },
-        { 3454872661, 2857812374, 2738653578 },
-        { 2522086851, 3454872661, 1190449620 } },
-      // Matrix for nskip = 7 * 8 ^ 10:
-      { { 2614530149, 753841941, 146778273 },
-        { 2511297323, 2614530149, 588764284 },
-        { 1785429779, 2511297323, 1269211096 } },
-      // Matrix for nskip = 1 * 8 ^ 11:
-      { { 2742149264, 1410604392, 3032350755 },
-        { 3774935330, 2742149264, 597633965 },
-        { 4085935803, 3774935330, 3952463556 } },
-      // Matrix for nskip = 2 * 8 ^ 11:
-      { { 3878579563, 845297523, 1721916511 },
-        { 2077922420, 3878579563, 3651360351 },
-        { 2177255734, 2077922420, 3791239282 } },
-      // Matrix for nskip = 3 * 8 ^ 11:
-      { { 2642777370, 1064863813, 4046131253 },
-        { 2032494710, 2642777370, 3511906271 },
-        { 2787706468, 2032494710, 1602633162 } },
-      // Matrix for nskip = 4 * 8 ^ 11:
-      { { 1570315355, 4252790045, 3522351060 },
-        { 2324624266, 1570315355, 3594939336 },
-        { 1725087354, 2324624266, 1338343327 } },
-      // Matrix for nskip = 5 * 8 ^ 11:
-      { { 3128806513, 3431512800, 3791370211 },
-        { 26016991, 3128806513, 1182007239 },
-        { 2629261386, 26016991, 1219288409 } },
-      // Matrix for nskip = 6 * 8 ^ 11:
-      { { 2323129699, 2040722667, 4032945011 },
-        { 1824515104, 2323129699, 783304238 },
-        { 1910382756, 1824515104, 2009721680 } },
-      // Matrix for nskip = 7 * 8 ^ 11:
-      { { 495056704, 1303223717, 299029371 },
-        { 3001848199, 495056704, 2298546607 },
-        { 528121192, 3001848199, 3574765936 } },
-      // Matrix for nskip = 1 * 8 ^ 12:
-      { { 2305761589, 381933244, 3663579047 },
-        { 1355307047, 2305761589, 313617972 },
-        { 992174375, 1355307047, 3881593435 } },
-      // Matrix for nskip = 2 * 8 ^ 12:
-      { { 1667857811, 1564715297, 2263851601 },
-        { 3791771273, 1667857811, 4196134923 },
-        { 3347975047, 3791771273, 615040705 } },
-      // Matrix for nskip = 3 * 8 ^ 12:
-      { { 2699274746, 2208033721, 3314336764 },
-        { 1723493827, 2699274746, 3721738282 },
-        { 3116429712, 1723493827, 763211059 } },
-      // Matrix for nskip = 4 * 8 ^ 12:
-      { { 4093947334, 3454015638, 2815567716 },
-        { 4261953004, 4093947334, 3973733876 },
-        { 2979573134, 4261953004, 3757047667 } },
-      // Matrix for nskip = 5 * 8 ^ 12:
-      { { 1497333242, 3837209858, 4043986454 },
-        { 3928412309, 1497333242, 4232950837 },
-        { 868538065, 3928412309, 3223762258 } },
-      // Matrix for nskip = 6 * 8 ^ 12:
-      { { 4178728130, 2981026540, 3927272953 },
-        { 668310420, 4178728130, 551557198 },
-        { 3532851694, 668310420, 4119399398 } },
-      // Matrix for nskip = 7 * 8 ^ 12:
-      { { 4121879899, 2179415297, 3607008098 },
-        { 243696529, 4121879899, 168490644 },
-        { 3444486351, 243696529, 752516370 } },
-      // Matrix for nskip = 1 * 8 ^ 13:
-      { { 250120061, 570149551, 1513430926 },
-        { 3178644752, 250120061, 1701869032 },
-        { 4172515680, 3178644752, 4213855850 } },
-      // Matrix for nskip = 2 * 8 ^ 13:
-      { { 4158106802, 3062358456, 1815738463 },
-        { 1379176112, 4158106802, 3926509890 },
-        { 2842564878, 1379176112, 2852219546 } },
-      // Matrix for nskip = 3 * 8 ^ 13:
-      { { 4056930326, 2130453857, 3298513997 },
-        { 3059400883, 4056930326, 439468763 },
-        { 546163799, 3059400883, 1884270041 } },
-      // Matrix for nskip = 4 * 8 ^ 13:
-      { { 931848746, 256263523, 2633569246 },
-        { 3284646837, 931848746, 2567084715 },
-        { 415258465, 3284646837, 2017565947 } },
-      // Matrix for nskip = 5 * 8 ^ 13:
-      { { 239941751, 4065438988, 4260302551 },
-        { 3480241466, 239941751, 1576122049 },
-        { 4073589963, 3480241466, 2593293965 } },
-      // Matrix for nskip = 6 * 8 ^ 13:
-      { { 507915211, 625612469, 3733827320 },
-        { 3909587424, 507915211, 3313512626 },
-        { 1707582600, 3909587424, 985910059 } },
-      // Matrix for nskip = 7 * 8 ^ 13:
-      { { 3287778427, 3984689764, 3572719740 },
-        { 207904085, 3287778427, 1330617931 },
-        { 1894788630, 207904085, 1656936419 } },
-      // Matrix for nskip = 1 * 8 ^ 14:
-      { { 1648005210, 1032291296, 3987397422 },
-        { 1831496020, 1648005210, 2829448427 },
-        { 1821082272, 1831496020, 2917140265 } },
-      // Matrix for nskip = 2 * 8 ^ 14:
-      { { 4161327077, 489964129, 3870847744 },
-        { 1669447863, 4161327077, 4292947198 },
-        { 1522417114, 1669447863, 2652286672 } },
-      // Matrix for nskip = 3 * 8 ^ 14:
-      { { 655280634, 3675619486, 3487203083 },
-        { 3658400031, 655280634, 4093432727 },
-        { 3338913609, 3658400031, 2005464907 } },
-      // Matrix for nskip = 4 * 8 ^ 14:
-      { { 1270934555, 3136631324, 505612043 },
-        { 2981474723, 1270934555, 2528619024 },
-        { 625182639, 2981474723, 1008985039 } },
-      // Matrix for nskip = 5 * 8 ^ 14:
-      { { 2670739471, 1317142118, 928068368 },
-        { 3334643457, 2670739471, 3298861790 },
-        { 3116973979, 3334643457, 4091848087 } },
-      // Matrix for nskip = 6 * 8 ^ 14:
-      { { 87174298, 3714928458, 3674535785 },
-        { 3591445536, 87174298, 3557842564 },
-        { 2600409828, 3591445536, 3509905000 } },
-      // Matrix for nskip = 7 * 8 ^ 14:
-      { { 1374849292, 3669747751, 313867341 },
-        { 2805321474, 1374849292, 3672378692 },
-        { 862662086, 2805321474, 1269888877 } },
-      // Matrix for nskip = 1 * 8 ^ 15:
-      { { 280996820, 143706137, 3013099060 },
-        { 1797675893, 280996820, 3743985508 },
-        { 1123794455, 1797675893, 2460119169 } },
-      // Matrix for nskip = 2 * 8 ^ 15:
-      { { 919218027, 4154920441, 1125672685 },
-        { 3933041881, 919218027, 474242849 },
-        { 564891116, 3933041881, 2263904321 } },
-      // Matrix for nskip = 3 * 8 ^ 15:
-      { { 4046953169, 707039159, 59087677 },
-        { 552285455, 4046953169, 3367709189 },
-        { 1558638678, 552285455, 3541844079 } },
-      // Matrix for nskip = 4 * 8 ^ 15:
-      { { 2920112852, 1965329198, 1177141043 },
-        { 2135250851, 2920112852, 969184056 },
-        { 296035385, 2135250851, 4267827987 } },
-      // Matrix for nskip = 5 * 8 ^ 15:
-      { { 3182682829, 216191227, 2317042610 },
-        { 3166912454, 3182682829, 3895260799 },
-        { 3316963881, 3166912454, 2773111558 } },
-      // Matrix for nskip = 6 * 8 ^ 15:
-      { { 4005961945, 962333604, 1596766252 },
-        { 155090437, 4005961945, 3465811606 },
-        { 995757623, 155090437, 842864023 } },
-      // Matrix for nskip = 7 * 8 ^ 15:
-      { { 3616509225, 3195052585, 2901642782 },
-        { 4257279454, 3616509225, 3209952933 },
-        { 159699513, 4257279454, 746020360 } },
-      // Matrix for nskip = 1 * 8 ^ 16:
-      { { 1481142942, 4120754772, 1088557292 },
-        { 265491023, 1481142942, 2860005744 },
-        { 301796252, 265491023, 1935975979 } },
-      // Matrix for nskip = 2 * 8 ^ 16:
-      { { 2111859033, 2813610100, 1001476468 },
-        { 73849832, 2111859033, 3980799998 },
-        { 3330206241, 73849832, 1933943506 } },
-      // Matrix for nskip = 3 * 8 ^ 16:
-      { { 4238802520, 1791251057, 3659825373 },
-        { 756158319, 4238802520, 1208877520 },
-        { 3666294602, 756158319, 1800377045 } },
-      // Matrix for nskip = 4 * 8 ^ 16:
-      { { 1781286360, 3661231931, 3509383709 },
-        { 2753158871, 1781286360, 3119883109 },
-        { 3576525143, 2753158871, 551079002 } },
-      // Matrix for nskip = 5 * 8 ^ 16:
-      { { 1150902763, 3730191199, 946744850 },
-        { 3422735839, 1150902763, 2750435170 },
-        { 3792794843, 3422735839, 808249292 } },
-      // Matrix for nskip = 6 * 8 ^ 16:
-      { { 429107478, 1467997203, 689359610 },
-        { 3244671951, 429107478, 2795337511 },
-        { 3397069741, 3244671951, 186846111 } },
-      // Matrix for nskip = 7 * 8 ^ 16:
-      { { 1453148331, 352897577, 3494583787 },
-        { 2340848640, 1453148331, 3699044308 },
-        { 3239904192, 2340848640, 209181640 } },
-      // Matrix for nskip = 1 * 8 ^ 17:
-      { { 1185024844, 587779104, 1004942725 },
-        { 3763632860, 1185024844, 947424568 },
-        { 3811666068, 3763632860, 2352253462 } },
-      // Matrix for nskip = 2 * 8 ^ 17:
-      { { 1310227170, 218138208, 3172947233 },
-        { 766129426, 1310227170, 1808643264 },
-        { 2226659371, 766129426, 3853798112 } },
-      // Matrix for nskip = 3 * 8 ^ 17:
-      { { 3141996820, 528748361, 1701083808 },
-        { 2360837423, 3141996820, 2513545590 },
-        { 1425244435, 2360837423, 4192496132 } },
-      // Matrix for nskip = 4 * 8 ^ 17:
-      { { 2230902378, 4243560874, 2491962392 },
-        { 3836629116, 2230902378, 3637515403 },
-        { 2846140932, 3836629116, 3083355464 } },
-      // Matrix for nskip = 5 * 8 ^ 17:
-      { { 506476814, 1267508030, 152968246 },
-        { 1117668151, 506476814, 2848688169 },
-        { 3001214254, 1117668151, 3940649164 } },
-      // Matrix for nskip = 6 * 8 ^ 17:
-      { { 1544421101, 772024440, 2364160468 },
-        { 2733679040, 1544421101, 965008581 },
-        { 2290142084, 2733679040, 3167919795 } },
-      // Matrix for nskip = 7 * 8 ^ 17:
-      { { 2195717687, 3299928213, 1911548095 },
-        { 3677807589, 2195717687, 2979544321 },
-        { 1288751520, 3677807589, 1379093393 } },
-      // Matrix for nskip = 1 * 8 ^ 18:
-      { { 999448569, 1464488480, 3344426626 },
-        { 946166795, 999448569, 340856814 },
-        { 3686999436, 946166795, 3231079441 } },
-      // Matrix for nskip = 2 * 8 ^ 18:
-      { { 1226155368, 3477563770, 550006884 },
-        { 2378667355, 1226155368, 1493409040 },
-        { 260364836, 2378667355, 4133888397 } },
-      // Matrix for nskip = 3 * 8 ^ 18:
-      { { 662024646, 2039234405, 3990280006 },
-        { 2342461604, 662024646, 17023679 },
-        { 1965981888, 2342461604, 1830518881 } },
-      // Matrix for nskip = 4 * 8 ^ 18:
-      { { 1277901832, 310796286, 2818511068 },
-        { 3088910653, 1277901832, 3303406025 },
-        { 2507911914, 3088910653, 3712928074 } },
-      // Matrix for nskip = 5 * 8 ^ 18:
-      { { 1103450261, 1722381279, 1394112836 },
-        { 640743651, 1103450261, 198700731 },
-        { 1095985628, 640743651, 2694625446 } },
-      // Matrix for nskip = 6 * 8 ^ 18:
-      { { 4043182751, 1859059885, 1911031801 },
-        { 2638851660, 4043182751, 4012210417 },
-        { 783591639, 2638851660, 2188651115 } },
-      // Matrix for nskip = 7 * 8 ^ 18:
-      { { 2318313639, 843870069, 2868175764 },
-        { 3777361816, 2318313639, 4070019017 },
-        { 2087410703, 3777361816, 2574355460 } },
-      // Matrix for nskip = 1 * 8 ^ 19:
-      { { 481918378, 339570348, 1728801469 },
-        { 1623163429, 481918378, 2209094694 },
-        { 3146982514, 1623163429, 508445538 } },
-      // Matrix for nskip = 2 * 8 ^ 19:
-      { { 3138921230, 2381863183, 1992357430 },
-        { 1024510915, 3138921230, 2122851650 },
-        { 1453455184, 1024510915, 941946604 } },
-      // Matrix for nskip = 3 * 8 ^ 19:
-      { { 3235663883, 499846706, 3251827412 },
-        { 801993191, 3235663883, 2207701640 },
-        { 1201194185, 801993191, 2705683748 } },
-      // Matrix for nskip = 4 * 8 ^ 19:
-      { { 2465372719, 1391015357, 3328905025 },
-        { 1821933605, 2465372719, 1343489680 },
-        { 3648970313, 1821933605, 1816599716 } },
-      // Matrix for nskip = 5 * 8 ^ 19:
-      { { 582796091, 1306170361, 1574617829 },
-        { 4167642903, 582796091, 284777447 },
-        { 3124784671, 4167642903, 2539713186 } },
-      // Matrix for nskip = 6 * 8 ^ 19:
-      { { 116486317, 2122591885, 1696181092 },
-        { 381403852, 116486317, 2932149608 },
-        { 3221291545, 381403852, 2742038256 } },
-      // Matrix for nskip = 7 * 8 ^ 19:
-      { { 3035480468, 2182693760, 2351066479 },
-        { 638141264, 3035480468, 100617977 },
-        { 478641834, 638141264, 479301469 } },
-      // Matrix for nskip = 1 * 8 ^ 20:
-      { { 118634664, 3358712512, 2492792220 },
-        { 348833376, 118634664, 2495544591 },
-        { 3235582254, 348833376, 4043157504 } },
-      // Matrix for nskip = 2 * 8 ^ 20:
-      { { 2303067090, 3371139074, 1967771133 },
-        { 598630070, 2303067090, 1819012637 },
-        { 2049250561, 598630070, 4093044926 } },
-      // Matrix for nskip = 3 * 8 ^ 20:
-      { { 897071837, 763331173, 3837362577 },
-        { 294683328, 897071837, 2496877097 },
-        { 2268904495, 294683328, 3496861697 } },
-      // Matrix for nskip = 4 * 8 ^ 20:
-      { { 3035321857, 3971176093, 226779704 },
-        { 3361614254, 3035321857, 2807125404 },
-        { 326640887, 3361614254, 3147308542 } },
-      // Matrix for nskip = 5 * 8 ^ 20:
-      { { 4010547095, 2725421511, 511986932 },
-        { 1545732164, 4010547095, 2643845410 },
-        { 2010134838, 1545732164, 3633977146 } },
-      // Matrix for nskip = 6 * 8 ^ 20:
-      { { 3118026103, 1037137281, 1600236290 },
-        { 2957620899, 3118026103, 433027378 },
-        { 2926759199, 2957620899, 3989342054 } },
-      // Matrix for nskip = 7 * 8 ^ 20:
-      { { 2423025801, 3089536821, 995021703 },
-        { 3613148280, 2423025801, 241254395 },
-        { 2857733472, 3613148280, 1868423350 } },
-      // Matrix for nskip = 1 * 8 ^ 21:
-      { { 1774298149, 4179629947, 3145006948 },
-        { 1688753503, 1774298149, 94869516 },
-        { 2327946901, 1688753503, 2786835219 } },
-      // Matrix for nskip = 2 * 8 ^ 21:
-      { { 185429251, 88142322, 3372328450 },
-        { 1198432931, 185429251, 1527068783 },
-        { 2880072915, 1198432931, 2782214191 } },
-      // Matrix for nskip = 3 * 8 ^ 21:
-      { { 2610521617, 1116660734, 2002689706 },
-        { 152508922, 2610521617, 2005955946 },
-        { 3106947611, 152508922, 239569623 } },
-      // Matrix for nskip = 4 * 8 ^ 21:
-      { { 127447080, 487724245, 2942566616 },
-        { 2180042365, 127447080, 1722814040 },
-        { 288658537, 2180042365, 4036691926 } },
-      // Matrix for nskip = 5 * 8 ^ 21:
-      { { 3269833722, 2788004771, 1482042877 },
-        { 834850082, 3269833722, 219243029 },
-        { 3704080414, 834850082, 2784167151 } },
-      // Matrix for nskip = 6 * 8 ^ 21:
-      { { 3956830949, 61587123, 1894752970 },
-        { 1989171734, 3956830949, 3197042083 },
-        { 457585003, 1989171734, 948838482 } },
-      // Matrix for nskip = 7 * 8 ^ 21:
-      { { 1982687998, 3610851352, 1902386191 },
-        { 2465097713, 1982687998, 1172472587 },
-        { 1202471365, 2465097713, 3151246066 } },
-      // Matrix for nskip = 1 * 8 ^ 22:
-      { { 1614979968, 1486547157, 1122661217 },
-        { 3976346810, 1614979968, 2343603502 },
-        { 3049605934, 3976346810, 440737492 } },
-      // Matrix for nskip = 2 * 8 ^ 22:
-      { { 613698149, 3416334823, 3832821180 },
-        { 1308958254, 613698149, 1338381534 },
-        { 4058246217, 1308958254, 2070907998 } },
-      // Matrix for nskip = 3 * 8 ^ 22:
-      { { 4069522778, 1558347771, 1555772973 },
-        { 2924102885, 4069522778, 561176530 },
-        { 566720713, 2924102885, 2660857604 } },
-      // Matrix for nskip = 4 * 8 ^ 22:
-      { { 2575546527, 1033712257, 125034191 },
-        { 2091411644, 2575546527, 226649669 },
-        { 1198488263, 2091411644, 1522580506 } },
-      // Matrix for nskip = 5 * 8 ^ 22:
-      { { 180639007, 1841709550, 234837148 },
-        { 2219662691, 180639007, 4181748462 },
-        { 3183232763, 2219662691, 2120135993 } },
-      // Matrix for nskip = 6 * 8 ^ 22:
-      { { 4275704717, 2295071345, 1852983492 },
-        { 3461773529, 4275704717, 417692359 },
-        { 1477011348, 3461773529, 1587362209 } },
-      // Matrix for nskip = 7 * 8 ^ 22:
-      { { 755069175, 2381439395, 890314398 },
-        { 3019982523, 755069175, 572921618 },
-        { 330076245, 3019982523, 2885887051 } },
-      // Matrix for nskip = 1 * 8 ^ 23:
-      { { 1051614737, 227719572, 3725579556 },
-        { 3910426444, 1051614737, 2075080920 },
-        { 3357426062, 3910426444, 1473179318 } },
-      // Matrix for nskip = 2 * 8 ^ 23:
-      { { 2999155498, 2971093563, 2685380188 },
-        { 93938118, 2999155498, 4035265564 },
-        { 3853931650, 93938118, 2034180250 } },
-      // Matrix for nskip = 3 * 8 ^ 23:
-      { { 3543842569, 1469908890, 519769416 },
-        { 3600765500, 3543842569, 1553393489 },
-        { 60922281, 3600765500, 1226136476 } },
-      // Matrix for nskip = 4 * 8 ^ 23:
-      { { 1253368368, 2860152458, 2836784419 },
-        { 1656084047, 1253368368, 646811031 },
-        { 3103367928, 1656084047, 3114448889 } },
-      // Matrix for nskip = 5 * 8 ^ 23:
-      { { 2205916258, 1604698588, 3155610724 },
-        { 2362004551, 2205916258, 181736283 },
-        { 3847535541, 2362004551, 3814972479 } },
-      // Matrix for nskip = 6 * 8 ^ 23:
-      { { 7725939, 1654580658, 4264117811 },
-        { 1274240457, 7725939, 2108223515 },
-        { 1813716775, 1274240457, 2141296207 } },
-      // Matrix for nskip = 7 * 8 ^ 23:
-      { { 1828440339, 726307104, 566806600 },
-        { 2069873554, 1828440339, 2003524657 },
-        { 2528019064, 2069873554, 868624934 } },
-      // Matrix for nskip = 1 * 8 ^ 24:
-      { { 2962469315, 4021086500, 2670244515 },
-        { 299199825, 2962469315, 3624275162 },
-        { 3634541206, 299199825, 1684552227 } },
-      // Matrix for nskip = 2 * 8 ^ 24:
-      { { 804213223, 438999528, 3143925885 },
-        { 1625976775, 804213223, 1494982903 },
-        { 3498104358, 1625976775, 881729466 } },
-      // Matrix for nskip = 3 * 8 ^ 24:
-      { { 2885386524, 2618720282, 4093772765 },
-        { 1140571071, 2885386524, 2989367205 },
-        { 2802821649, 1140571071, 742292537 } },
-      // Matrix for nskip = 4 * 8 ^ 24:
-      { { 1547173514, 490999994, 918013965 },
-        { 1312079237, 1547173514, 1905431135 },
-        { 3784344293, 1312079237, 3643511238 } },
-      // Matrix for nskip = 5 * 8 ^ 24:
-      { { 3363084915, 889964766, 2840623993 },
-        { 485137636, 3363084915, 1563107974 },
-        { 4117358359, 485137636, 2655518143 } },
-      // Matrix for nskip = 6 * 8 ^ 24:
-      { { 2014523666, 1476325540, 1550754572 },
-        { 588313388, 2014523666, 2691287218 },
-        { 4248816946, 588313388, 1568942409 } },
-      // Matrix for nskip = 7 * 8 ^ 24:
-      { { 2407332340, 3541076740, 1876171062 },
-        { 1127328556, 2407332340, 3702106930 },
-        { 1804600645, 1127328556, 2140373745 } },
-      // Matrix for nskip = 1 * 8 ^ 25:
-      { { 3846994569, 2894966137, 1130633118 },
-        { 4115190113, 3846994569, 777098754 },
-        { 3088495692, 4115190113, 2193427908 } },
-      // Matrix for nskip = 2 * 8 ^ 25:
-      { { 1511326704, 3759209742, 1610795712 },
-        { 4292754251, 1511326704, 3889917532 },
-        { 3859662829, 4292754251, 3708466080 } },
-      // Matrix for nskip = 3 * 8 ^ 25:
-      { { 2721725192, 3847490931, 444351073 },
-        { 429225403, 2721725192, 673508566 },
-        { 387279730, 429225403, 3104869093 } },
-      // Matrix for nskip = 4 * 8 ^ 25:
-      { { 972103006, 964807713, 878035866 },
-        { 4248550197, 972103006, 1926628839 },
-        { 1448629089, 4248550197, 3196114006 } },
-      // Matrix for nskip = 5 * 8 ^ 25:
-      { { 549140019, 2935386277, 4206854109 },
-        { 459549553, 549140019, 1011901572 },
-        { 821145437, 459549553, 302470082 } },
-      // Matrix for nskip = 6 * 8 ^ 25:
-      { { 907238901, 2926293232, 2865846472 },
-        { 840689212, 907238901, 1249197731 },
-        { 4278768404, 840689212, 3331097822 } },
-      // Matrix for nskip = 7 * 8 ^ 25:
-      { { 105585154, 3513063153, 2552212444 },
-        { 379969606, 105585154, 378686420 },
-        { 3414457398, 379969606, 3084470277 } },
-      // Matrix for nskip = 1 * 8 ^ 26:
-      { { 3497384788, 3174249442, 3182508868 },
-        { 3864816447, 3497384788, 3038399593 },
-        { 2546884738, 3864816447, 2980208068 } },
-      // Matrix for nskip = 2 * 8 ^ 26:
-      { { 1776335558, 1189944887, 4095757548 },
-        { 3813600746, 1776335558, 789475914 },
-        { 4119698302, 3813600746, 2145357457 } },
-      // Matrix for nskip = 3 * 8 ^ 26:
-      { { 1736653518, 945282763, 3568863651 },
-        { 2539405616, 1736653518, 3870991887 },
-        { 1676082014, 2539405616, 4282213129 } },
-      // Matrix for nskip = 4 * 8 ^ 26:
-      { { 4022832294, 4130146837, 1942923647 },
-        { 1675130777, 4022832294, 916677004 },
-        { 4089786548, 1675130777, 116540512 } },
-      // Matrix for nskip = 5 * 8 ^ 26:
-      { { 3414208535, 1938436883, 1996617380 },
-        { 3508342845, 3414208535, 3024221061 },
-        { 863275511, 3508342845, 3926625937 } },
-      // Matrix for nskip = 6 * 8 ^ 26:
-      { { 943060309, 1550884686, 1524180490 },
-        { 1603911046, 943060309, 659956132 },
-        { 3864471824, 1603911046, 1981894197 } },
-      // Matrix for nskip = 7 * 8 ^ 26:
-      { { 4039258344, 2877267458, 1263654722 },
-        { 2264646264, 4039258344, 866786660 },
-        { 3436002161, 2264646264, 1103279181 } },
-      // Matrix for nskip = 1 * 8 ^ 27:
-      { { 165639584, 1205513289, 2037453462 },
-        { 1444587280, 165639584, 161923120 },
-        { 2617085459, 1444587280, 2006913311 } },
-      // Matrix for nskip = 2 * 8 ^ 27:
-      { { 3458099202, 3062421748, 4052486999 },
-        { 1064270720, 3458099202, 230768332 },
-        { 4056228301, 1064270720, 2219267779 } },
-      // Matrix for nskip = 3 * 8 ^ 27:
-      { { 4130534548, 3958841381, 2978123129 },
-        { 3549040929, 4130534548, 624596665 },
-        { 3007893075, 3549040929, 2033981581 } },
-      // Matrix for nskip = 4 * 8 ^ 27:
-      { { 296275263, 3452455838, 2081462173 },
-        { 1789143993, 296275263, 3463234943 },
-        { 2097389984, 1789143993, 3447191459 } },
-      // Matrix for nskip = 5 * 8 ^ 27:
-      { { 3690699991, 194807645, 3499022088 },
-        { 895650639, 3690699991, 202155710 },
-        { 3063493626, 895650639, 2818867049 } },
-      // Matrix for nskip = 6 * 8 ^ 27:
-      { { 775854673, 2918396394, 2709062415 },
-        { 2684216609, 775854673, 721391189 },
-        { 4036938266, 2684216609, 1742271124 } },
-      // Matrix for nskip = 7 * 8 ^ 27:
-      { { 3150458758, 4126093705, 1386916196 },
-        { 3083923483, 3150458758, 2299677089 },
-        { 1576871217, 3083923483, 1393814954 } },
-      // Matrix for nskip = 1 * 8 ^ 28:
-      { { 2828288883, 3866690251, 410553827 },
-        { 1587005542, 2828288883, 1469478670 },
-        { 2766486018, 1587005542, 2627363449 } },
-      // Matrix for nskip = 2 * 8 ^ 28:
-      { { 3288027530, 412403981, 2458742268 },
-        { 4267121909, 3288027530, 138566505 },
-        { 420803572, 4267121909, 4094554844 } },
-      // Matrix for nskip = 3 * 8 ^ 28:
-      { { 2136361676, 3398888999, 2068559481 },
-        { 3790597750, 2136361676, 3281478755 },
-        { 4056706273, 3790597750, 1765993677 } },
-      // Matrix for nskip = 4 * 8 ^ 28:
-      { { 3844599430, 2430152838, 3283485436 },
-        { 2486244684, 3844599430, 4252427633 },
-        { 3560842909, 2486244684, 3960267499 } },
-      // Matrix for nskip = 5 * 8 ^ 28:
-      { { 3419145577, 107246070, 429885456 },
-        { 1381214928, 3419145577, 1111366755 },
-        { 767007913, 1381214928, 2270459619 } },
-      // Matrix for nskip = 6 * 8 ^ 28:
-      { { 1494013447, 1485743041, 931794028 },
-        { 3674972444, 1494013447, 2085831739 },
-        { 62603161, 3674972444, 555083053 } },
-      // Matrix for nskip = 7 * 8 ^ 28:
-      { { 1677686741, 1049056456, 3063490072 },
-        { 3432517708, 1677686741, 1550912558 },
-        { 3096606227, 3432517708, 349068991 } },
-      // Matrix for nskip = 1 * 8 ^ 29:
-      { { 67933059, 1294996291, 2657888382 },
-        { 513233413, 67933059, 1379805031 },
-        { 44564058, 513233413, 86971645 } },
-      // Matrix for nskip = 2 * 8 ^ 29:
-      { { 2732588524, 1866530072, 818237694 },
-        { 2540507736, 2732588524, 3257104212 },
-        { 1164400003, 2540507736, 1124501551 } },
-      // Matrix for nskip = 3 * 8 ^ 29:
-      { { 1412660773, 1524580236, 2800129005 },
-        { 3198153122, 1412660773, 3904718713 },
-        { 2546401509, 3198153122, 386568104 } },
-      // Matrix for nskip = 4 * 8 ^ 29:
-      { { 4199239222, 3155848463, 2121388468 },
-        { 1135554501, 4199239222, 2056492193 },
-        { 3251740389, 1135554501, 2343537248 } },
-      // Matrix for nskip = 5 * 8 ^ 29:
-      { { 3239971958, 3891714065, 1807213249 },
-        { 3694822198, 3239971958, 3557488352 },
-        { 2750758637, 3694822198, 163867522 } },
-      // Matrix for nskip = 6 * 8 ^ 29:
-      { { 884974087, 1753139982, 2087168228 },
-        { 2226758301, 884974087, 1590955204 },
-        { 1886560387, 2226758301, 4000127015 } },
-      // Matrix for nskip = 7 * 8 ^ 29:
-      { { 3230269711, 3957529982, 3575750396 },
-        { 3930348525, 3230269711, 2594598825 },
-        { 3785901658, 3930348525, 4178374892 } },
-      // Matrix for nskip = 1 * 8 ^ 30:
-      { { 550710036, 500329021, 1075236085 },
-        { 356444753, 550710036, 1634965500 },
-        { 58733535, 356444753, 1261552815 } },
-      // Matrix for nskip = 2 * 8 ^ 30:
-      { { 708689546, 419139045, 2012018174 },
-        { 706488081, 708689546, 1113760995 },
-        { 585555005, 706488081, 76092226 } },
-      // Matrix for nskip = 3 * 8 ^ 30:
-      { { 2584730290, 103417098, 2018833769 },
-        { 831116151, 2584730290, 1919249397 },
-        { 1036497162, 831116151, 2546254144 } },
-      // Matrix for nskip = 4 * 8 ^ 30:
-      { { 1293182265, 3168473803, 366230236 },
-        { 3319068849, 1293182265, 1085259665 },
-        { 1675229290, 3319068849, 3912300371 } },
-      // Matrix for nskip = 5 * 8 ^ 30:
-      { { 2602420349, 3992244735, 1543754813 },
-        { 3770060220, 2602420349, 1407637442 },
-        { 944746705, 3770060220, 2920440850 } },
-      // Matrix for nskip = 6 * 8 ^ 30:
-      { { 1601703108, 619857159, 1219413461 },
-        { 2824672719, 1601703108, 3707169777 },
-        { 3352413650, 2824672719, 1098132331 } },
-      // Matrix for nskip = 7 * 8 ^ 30:
-      { { 3630967154, 3444173778, 3289446159 },
-        { 1769199423, 3630967154, 2021155330 },
-        { 1478978985, 1769199423, 1976131087 } },
-      // Matrix for nskip = 1 * 8 ^ 31:
-      { { 3186089068, 4188864734, 1211781402 },
-        { 756122322, 3186089068, 578262892 },
-        { 2518961174, 756122322, 1658665581 } },
-      // Matrix for nskip = 2 * 8 ^ 31:
-      { { 1347291439, 2050427676, 736113023 },
-        { 4102191254, 1347291439, 878627148 },
-        { 1293500383, 4102191254, 745646810 } },
-      // Matrix for nskip = 3 * 8 ^ 31:
-      { { 1428398286, 758558167, 59314928 },
-        { 2615508955, 1428398286, 3061138405 },
-        { 1098162878, 2615508955, 2401469211 } },
-      // Matrix for nskip = 4 * 8 ^ 31:
-      { { 4196897331, 3436564969, 1900167098 },
-        { 3108887846, 4196897331, 2697923227 },
-        { 1405263476, 3108887846, 314631094 } },
-      // Matrix for nskip = 5 * 8 ^ 31:
-      { { 3004743607, 2733058282, 4202297421 },
-        { 956778663, 3004743607, 1815192601 },
-        { 2211295748, 956778663, 3626831178 } },
-      // Matrix for nskip = 6 * 8 ^ 31:
-      { { 3694919563, 2520419703, 731922800 },
-        { 540077867, 3694919563, 2433069844 },
-        { 2129238146, 540077867, 301939378 } },
-      // Matrix for nskip = 7 * 8 ^ 31:
-      { { 2475140271, 37335008, 2778457406 },
-        { 2217587145, 2475140271, 1363889163 },
-        { 135344313, 2217587145, 1707617706 } },
-      // Matrix for nskip = 1 * 8 ^ 32:
-      { { 958383622, 3694638688, 1150087061 },
-        { 3770009830, 958383622, 793326651 },
-        { 533700213, 3770009830, 1513734026 } },
-      // Matrix for nskip = 2 * 8 ^ 32:
-      { { 4119603367, 3479396923, 3534176399 },
-        { 3765397477, 4119603367, 1458031003 },
-        { 3380901602, 3765397477, 2684083587 } },
-      // Matrix for nskip = 3 * 8 ^ 32:
-      { { 178016378, 1184002529, 789650986 },
-        { 389885259, 178016378, 3729279189 },
-        { 1268575347, 389885259, 4091367000 } },
-      // Matrix for nskip = 4 * 8 ^ 32:
-      { { 980937351, 2094378936, 448446028 },
-        { 1421333909, 980937351, 3405683645 },
-        { 323724368, 1421333909, 338680738 } },
-      // Matrix for nskip = 5 * 8 ^ 32:
-      { { 2381808660, 341372255, 146194193 },
-        { 4185254045, 2381808660, 1244677534 },
-        { 2006223188, 4185254045, 3589653882 } },
-      // Matrix for nskip = 6 * 8 ^ 32:
-      { { 1104593159, 2457034166, 4243190272 },
-        { 2690000574, 1104593159, 3592133108 },
-        { 3935039161, 2690000574, 2028886430 } },
-      // Matrix for nskip = 7 * 8 ^ 32:
-      { { 798595991, 3072704016, 1453032677 },
-        { 3595149031, 798595991, 1556294726 },
-        { 775957906, 3595149031, 208124234 } },
-      // Matrix for nskip = 1 * 8 ^ 33:
-      { { 2942968846, 4293637338, 3549906544 },
-        { 527851489, 2942968846, 3852871282 },
-        { 4209198933, 527851489, 1091268872 } },
-      // Matrix for nskip = 2 * 8 ^ 33:
-      { { 1975983015, 2092556693, 611187071 },
-        { 3982652344, 1975983015, 3001736262 },
-        { 2055073597, 3982652344, 1875181995 } },
-      // Matrix for nskip = 3 * 8 ^ 33:
-      { { 1752967931, 1167063522, 3817182484 },
-        { 3760899628, 1752967931, 2808655727 },
-        { 3110603267, 3760899628, 1832178008 } },
-      // Matrix for nskip = 4 * 8 ^ 33:
-      { { 2970221269, 880904779, 2447465272 },
-        { 2888742196, 2970221269, 3521651749 },
-        { 3019977656, 2888742196, 2712717326 } },
-      // Matrix for nskip = 5 * 8 ^ 33:
-      { { 604958655, 442191761, 1996070625 },
-        { 1269454015, 604958655, 814754560 },
-        { 507433046, 1269454015, 2488458391 } },
-      // Matrix for nskip = 6 * 8 ^ 33:
-      { { 710612185, 99734716, 3956229929 },
-        { 2137129319, 710612185, 2895847378 },
-        { 1727032860, 2137129319, 1001260701 } },
-      // Matrix for nskip = 7 * 8 ^ 33:
-      { { 1066664047, 4152765348, 1734907969 },
-        { 2968154336, 1066664047, 2381691001 },
-        { 1497199245, 2968154336, 3563839605 } },
-      // Matrix for nskip = 1 * 8 ^ 34:
-      { { 419134859, 2976059897, 747864206 },
-        { 4101695717, 419134859, 4264593116 },
-        { 2657991148, 4101695717, 2542621682 } },
-      // Matrix for nskip = 2 * 8 ^ 34:
-      { { 4043135299, 1612983166, 1149778656 },
-        { 1267010518, 4043135299, 3496325546 },
-        { 3094232897, 1267010518, 2949176293 } },
-      // Matrix for nskip = 3 * 8 ^ 34:
-      { { 3214297332, 2846434362, 4106231685 },
-        { 1780972559, 3214297332, 1132838092 },
-        { 1348023856, 1780972559, 537227984 } },
-      // Matrix for nskip = 4 * 8 ^ 34:
-      { { 3949395794, 1774568686, 2123036003 },
-        { 2182983404, 3949395794, 2355671350 },
-        { 2820933455, 2182983404, 513963325 } },
-      // Matrix for nskip = 5 * 8 ^ 34:
-      { { 1877604589, 3803366824, 2927718923 },
-        { 2817972608, 1877604589, 901177092 },
-        { 1008515195, 2817972608, 1900906578 } },
-      // Matrix for nskip = 6 * 8 ^ 34:
-      { { 2247365780, 1508191753, 929996525 },
-        { 2014701429, 2247365780, 2906849518 },
-        { 1864911773, 2014701429, 634217040 } },
-      // Matrix for nskip = 7 * 8 ^ 34:
-      { { 3200692723, 3246632578, 3558417384 },
-        { 733273917, 3200692723, 715293224 },
-        { 3878803573, 733273917, 3720987401 } },
-      // Matrix for nskip = 1 * 8 ^ 35:
-      { { 3046911698, 2576744453, 2492729814 },
-        { 4277866093, 3046911698, 3146977604 },
-        { 2249371766, 4277866093, 3622293976 } },
-      // Matrix for nskip = 2 * 8 ^ 35:
-      { { 1391529818, 423458502, 2587125255 },
-        { 3536237833, 1391529818, 985347517 },
-        { 157623850, 3536237833, 1015566287 } },
-      // Matrix for nskip = 3 * 8 ^ 35:
-      { { 2768170623, 2671124421, 1038000683 },
-        { 2258964805, 2768170623, 3036723158 },
-        { 2454977948, 2258964805, 2502325941 } },
-      // Matrix for nskip = 4 * 8 ^ 35:
-      { { 48329260, 2599277669, 821961664 },
-        { 902187690, 48329260, 1716556555 },
-        { 4019658974, 902187690, 950730510 } },
-      // Matrix for nskip = 5 * 8 ^ 35:
-      { { 3100975771, 1019061132, 1844417430 },
-        { 1634016885, 3100975771, 2161076681 },
-        { 378757639, 1634016885, 4124897232 } },
-      // Matrix for nskip = 6 * 8 ^ 35:
-      { { 1045387495, 796030826, 1236131839 },
-        { 2328291482, 1045387495, 2884310858 },
-        { 3863948457, 2328291482, 465921502 } },
-      // Matrix for nskip = 7 * 8 ^ 35:
-      { { 3483511399, 741205873, 1920164372 },
-        { 1105604243, 3483511399, 2420741811 },
-        { 2484220821, 1105604243, 2513215163 } },
-      // Matrix for nskip = 1 * 8 ^ 36:
-      { { 1318489562, 1530977112, 3713577419 },
-        { 4270158447, 1318489562, 1654940598 },
-        { 2679964938, 4270158447, 1337075195 } },
-      // Matrix for nskip = 2 * 8 ^ 36:
-      { { 770600793, 3249576224, 3578552768 },
-        { 2710443459, 770600793, 2990852339 },
-        { 3098163705, 2710443459, 522138188 } },
-      // Matrix for nskip = 3 * 8 ^ 36:
-      { { 3299888517, 1806316064, 2474407987 },
-        { 3432253975, 3299888517, 3480703284 },
-        { 201692417, 3432253975, 1711417284 } },
-      // Matrix for nskip = 4 * 8 ^ 36:
-      { { 2803285489, 1922250286, 3164022812 },
-        { 477609731, 2803285489, 2140252218 },
-        { 2252852611, 477609731, 3058519788 } },
-      // Matrix for nskip = 5 * 8 ^ 36:
-      { { 3735324161, 860809210, 2792496593 },
-        { 1613420642, 3735324161, 651730634 },
-        { 3412387271, 1613420642, 2796594703 } },
-      // Matrix for nskip = 6 * 8 ^ 36:
-      { { 993539593, 3499265007, 3772074010 },
-        { 3213913829, 993539593, 3655831787 },
-        { 2561980091, 3213913829, 2164990937 } },
-      // Matrix for nskip = 7 * 8 ^ 36:
-      { { 76754721, 818311023, 1258273773 },
-        { 2914546594, 76754721, 3007787703 },
-        { 1554324281, 2914546594, 1645121444 } },
-      // Matrix for nskip = 1 * 8 ^ 37:
-      { { 208329741, 3633562083, 3548346666 },
-        { 3892091460, 208329741, 516833304 },
-        { 3440632377, 3892091460, 1638833719 } },
-      // Matrix for nskip = 2 * 8 ^ 37:
-      { { 1816075033, 3570111203, 959489356 },
-        { 3482051486, 1816075033, 861657108 },
-        { 3119495098, 3482051486, 2576849579 } },
-      // Matrix for nskip = 3 * 8 ^ 37:
-      { { 955576990, 607798602, 220457899 },
-        { 760121425, 955576990, 1155400464 },
-        { 1209136348, 760121425, 1165671753 } },
-      // Matrix for nskip = 4 * 8 ^ 37:
-      { { 4240216888, 2891584407, 2102314945 },
-        { 4064489450, 4240216888, 1427441010 },
-        { 2441164913, 4064489450, 3558527186 } },
-      // Matrix for nskip = 5 * 8 ^ 37:
-      { { 3943073787, 2113696223, 3840029496 },
-        { 42559030, 3943073787, 2203932271 },
-        { 638717597, 42559030, 3208053933 } },
-      // Matrix for nskip = 6 * 8 ^ 37:
-      { { 714331518, 510361535, 3438751245 },
-        { 2783614947, 714331518, 666348656 },
-        { 4028058908, 2783614947, 2994150339 } },
-      // Matrix for nskip = 7 * 8 ^ 37:
-      { { 3978295779, 1441779930, 4249164235 },
-        { 1006134725, 3978295779, 2022224066 },
-        { 1257228544, 1006134725, 3563676111 } },
-      // Matrix for nskip = 1 * 8 ^ 38:
-      { { 2918371295, 65155283, 3469357011 },
-        { 3579773554, 2918371295, 3494391959 },
-        { 3266584309, 3579773554, 3837485479 } },
-      // Matrix for nskip = 2 * 8 ^ 38:
-      { { 2959420453, 1365016881, 4082486022 },
-        { 236489012, 2959420453, 3802558529 },
-        { 2687043642, 236489012, 2547086826 } },
-      // Matrix for nskip = 3 * 8 ^ 38:
-      { { 3501988208, 1843500325, 3464182128 },
-        { 969269805, 3501988208, 2232088910 },
-        { 3829792024, 969269805, 2334756085 } },
-      // Matrix for nskip = 4 * 8 ^ 38:
-      { { 4185325422, 2762854843, 3200044912 },
-        { 3664909559, 4185325422, 3543921700 },
-        { 4240262918, 3664909559, 2853212443 } },
-      // Matrix for nskip = 5 * 8 ^ 38:
-      { { 3870531367, 2625370600, 1928035826 },
-        { 1477778653, 3870531367, 4167218005 },
-        { 2810379745, 1477778653, 1547435981 } },
-      // Matrix for nskip = 6 * 8 ^ 38:
-      { { 2166942438, 2045317959, 2862960125 },
-        { 1192305592, 2166942438, 2202186359 },
-        { 1282445014, 1192305592, 3680855685 } },
-      // Matrix for nskip = 7 * 8 ^ 38:
-      { { 4183888729, 1630438655, 1622555680 },
-        { 841523235, 4183888729, 266662726 },
-        { 1888300357, 841523235, 553070804 } },
-      // Matrix for nskip = 1 * 8 ^ 39:
-      { { 2618500928, 4237264351, 1470046497 },
-        { 1893990098, 2618500928, 2982567031 },
-        { 3017062825, 1893990098, 3195556801 } },
-      // Matrix for nskip = 2 * 8 ^ 39:
-      { { 1868464655, 3407681142, 1652841784 },
-        { 1678569574, 1868464655, 4162480901 },
-        { 1477016185, 1678569574, 4145063890 } },
-      // Matrix for nskip = 3 * 8 ^ 39:
-      { { 346858981, 2885211332, 1550050752 },
-        { 3168708136, 346858981, 2121517268 },
-        { 696413464, 3168708136, 2779761666 } },
-      // Matrix for nskip = 4 * 8 ^ 39:
-      { { 792188465, 4251338402, 2219407026 },
-        { 3840340879, 792188465, 3493367465 },
-        { 2979958414, 3840340879, 2338974139 } },
-      // Matrix for nskip = 5 * 8 ^ 39:
-      { { 3859433262, 3764728773, 1297631730 },
-        { 3833824001, 3859433262, 1333287789 },
-        { 1909447704, 3833824001, 2135933046 } },
-      // Matrix for nskip = 6 * 8 ^ 39:
-      { { 102264893, 4038432252, 2717349223 },
-        { 709433989, 102264893, 1807326569 },
-        { 2997676666, 709433989, 3722753261 } },
-      // Matrix for nskip = 7 * 8 ^ 39:
-      { { 4020257258, 1217293203, 2346103599 },
-        { 3809824315, 4020257258, 576285090 },
-        { 3162683019, 3809824315, 2652264596 } },
-      // Matrix for nskip = 1 * 8 ^ 40:
-      { { 478845700, 2378167062, 882114621 },
-        { 1674533845, 478845700, 3572905305 },
-        { 3571222880, 1674533845, 1242316901 } },
-      // Matrix for nskip = 2 * 8 ^ 40:
-      { { 2636090868, 1972761498, 71690719 },
-        { 1228103463, 2636090868, 1280685025 },
-        { 3741735502, 1228103463, 994061750 } },
-      // Matrix for nskip = 3 * 8 ^ 40:
-      { { 2765592972, 3759047976, 2089192298 },
-        { 2592791249, 2765592972, 2079317731 },
-        { 3195761319, 2592791249, 913428082 } },
-      // Matrix for nskip = 4 * 8 ^ 40:
-      { { 1156725261, 1100755307, 221922891 },
-        { 2892200461, 1156725261, 1505716533 },
-        { 2287613563, 2892200461, 3689457190 } },
-      // Matrix for nskip = 5 * 8 ^ 40:
-      { { 716602832, 851112058, 2726490354 },
-        { 328778061, 716602832, 2662750501 },
-        { 2300190858, 328778061, 2031908929 } },
-      // Matrix for nskip = 6 * 8 ^ 40:
-      { { 131535614, 3548535605, 1837882588 },
-        { 3257415168, 131535614, 1374937136 },
-        { 1879184234, 3257415168, 167534374 } },
-      // Matrix for nskip = 7 * 8 ^ 40:
-      { { 3131954528, 4223897546, 515553914 },
-        { 326215900, 3131954528, 644217952 },
-        { 934922655, 326215900, 2645770575 } },
-      // Matrix for nskip = 1 * 8 ^ 41:
-      { { 1387244644, 3135090808, 1243609165 },
-        { 1724967466, 1387244644, 3296353235 },
-        { 1064364031, 1724967466, 2107521044 } },
-      // Matrix for nskip = 2 * 8 ^ 41:
-      { { 2822471992, 2034317853, 2071407475 },
-        { 170903528, 2822471992, 1322162887 },
-        { 2524982332, 170903528, 2656231333 } },
-      // Matrix for nskip = 3 * 8 ^ 41:
-      { { 2401421275, 3219909065, 1167519964 },
-        { 3200856372, 2401421275, 2651362201 },
-        { 3150793696, 3200856372, 3740263529 } },
-      // Matrix for nskip = 4 * 8 ^ 41:
-      { { 3653936868, 3893194049, 2484299328 },
-        { 1313746234, 3653936868, 1705346273 },
-        { 1397638018, 1313746234, 4015529545 } },
-      // Matrix for nskip = 5 * 8 ^ 41:
-      { { 762850190, 2502708647, 3030789377 },
-        { 605169915, 762850190, 2517301940 },
-        { 2651641442, 605169915, 3739297479 } },
-      // Matrix for nskip = 6 * 8 ^ 41:
-      { { 4185157227, 3109351418, 2907095532 },
-        { 3981440524, 4185157227, 2447807956 },
-        { 1358765607, 3981440524, 2947483756 } },
-      // Matrix for nskip = 7 * 8 ^ 41:
-      { { 616351240, 2708761949, 3510102453 },
-        { 1192816102, 616351240, 3430261471 },
-        { 3769975746, 1192816102, 1092752722 } },
-      // Matrix for nskip = 1 * 8 ^ 42:
-      { { 4129760842, 1671665759, 1677834656 },
-        { 3200005334, 4129760842, 3486207172 },
-        { 2850728736, 3200005334, 3076201597 } },
-      // Matrix for nskip = 2 * 8 ^ 42:
-      { { 1464411153, 277697599, 1610723613 },
-        { 32183930, 1464411153, 1022607788 },
-        { 2824425944, 32183930, 2093834863 } },
-      // Matrix for nskip = 3 * 8 ^ 42:
-      { { 4289888328, 3225021158, 546274137 },
-        { 3161813725, 4289888328, 3178255601 },
-        { 811227116, 3161813725, 2040329321 } },
-      // Matrix for nskip = 4 * 8 ^ 42:
-      { { 3492361727, 1027004383, 3167429889 },
-        { 3674905362, 3492361727, 3572939265 },
-        { 4270409313, 3674905362, 698814233 } },
-      // Matrix for nskip = 5 * 8 ^ 42:
-      { { 1024068271, 2798745077, 2659447825 },
-        { 2040144100, 1024068271, 1035060877 },
-        { 2866843005, 2040144100, 787687659 } },
-      // Matrix for nskip = 6 * 8 ^ 42:
-      { { 2906151318, 3986151835, 2581649800 },
-        { 571744464, 2906151318, 1834943086 },
-        { 3448634312, 571744464, 290967548 } },
-      // Matrix for nskip = 7 * 8 ^ 42:
-      { { 1570041711, 1880130578, 2514738078 },
-        { 3388141786, 1570041711, 744775425 },
-        { 2735736928, 3388141786, 964597855 } },
-      // Matrix for nskip = 1 * 8 ^ 43:
-      { { 880482061, 205175925, 4070445105 },
-        { 2208329119, 880482061, 1933248566 },
-        { 3741227945, 2208329119, 3962062826 } },
-      // Matrix for nskip = 2 * 8 ^ 43:
-      { { 4184605179, 1189429800, 567967482 },
-        { 107217966, 4184605179, 784865788 },
-        { 549462420, 107217966, 3134382704 } },
-      // Matrix for nskip = 3 * 8 ^ 43:
-      { { 1386364785, 4079260578, 3001857777 },
-        { 3010784539, 1386364785, 3667065093 },
-        { 3692171012, 3010784539, 2361530061 } },
-      // Matrix for nskip = 4 * 8 ^ 43:
-      { { 2732536445, 1231107067, 3374588386 },
-        { 409954030, 2732536445, 1044831206 },
-        { 3398162498, 409954030, 3505648581 } },
-      // Matrix for nskip = 5 * 8 ^ 43:
-      { { 3249719425, 4215633308, 1637240461 },
-        { 151877124, 3249719425, 2638755179 },
-        { 3634975465, 151877124, 1546467979 } },
-      // Matrix for nskip = 6 * 8 ^ 43:
-      { { 2408251701, 89238831, 4165007723 },
-        { 4262743528, 2408251701, 4114669800 },
-        { 2878757823, 4262743528, 3182943863 } },
-      // Matrix for nskip = 7 * 8 ^ 43:
-      { { 1831049905, 2380192587, 325575207 },
-        { 2045407448, 1831049905, 3463310486 },
-        { 1637651789, 2045407448, 1889914987 } },
-      // Matrix for nskip = 1 * 8 ^ 44:
-      { { 2169560691, 1076348534, 637306236 },
-        { 3704346564, 2169560691, 293694496 },
-        { 632453145, 3704346564, 1609425246 } },
-      // Matrix for nskip = 2 * 8 ^ 44:
-      { { 372115891, 3928812480, 2830541169 },
-        { 3056527841, 372115891, 1924239834 },
-        { 3044937468, 3056527841, 547142630 } },
-      // Matrix for nskip = 3 * 8 ^ 44:
-      { { 3652440052, 1383186997, 3140353867 },
-        { 1157890357, 3652440052, 3280219833 },
-        { 2953685245, 1157890357, 481162011 } },
-      // Matrix for nskip = 4 * 8 ^ 44:
-      { { 1660852083, 3635660815, 1389092450 },
-        { 1025573319, 1660852083, 3276803366 },
-        { 4036331438, 1025573319, 4092197741 } },
-      // Matrix for nskip = 5 * 8 ^ 44:
-      { { 2683005143, 1323793242, 1291869629 },
-        { 2903240813, 2683005143, 3854329533 },
-        { 2695585089, 2903240813, 1426976484 } },
-      // Matrix for nskip = 6 * 8 ^ 44:
-      { { 56767734, 116994667, 111909274 },
-        { 3730950473, 56767734, 2191610434 },
-        { 1091419714, 3730950473, 718571338 } },
-      // Matrix for nskip = 7 * 8 ^ 44:
-      { { 336318787, 391538001, 10025372 },
-        { 3157633492, 336318787, 2821500332 },
-        { 3413552779, 3157633492, 4255875513 } },
-      // Matrix for nskip = 1 * 8 ^ 45:
-      { { 1360732901, 2887812973, 4101068693 },
-        { 52572783, 1360732901, 112458461 },
-        { 2636566855, 52572783, 1136777988 } },
-      // Matrix for nskip = 2 * 8 ^ 45:
-      { { 3455696508, 536919193, 3978804036 },
-        { 3094157668, 3455696508, 3821833900 },
-        { 2278849016, 3094157668, 2531965909 } },
-      // Matrix for nskip = 3 * 8 ^ 45:
-      { { 105839550, 1126024816, 287198647 },
-        { 351807867, 105839550, 643672297 },
-        { 1483330368, 351807867, 3781751861 } },
-      // Matrix for nskip = 4 * 8 ^ 45:
-      { { 2125991744, 890897326, 3790557569 },
-        { 1433592392, 2125991744, 3671109604 },
-        { 808215503, 1433592392, 2446306581 } },
-      // Matrix for nskip = 5 * 8 ^ 45:
-      { { 3640380877, 422210679, 1510633752 },
-        { 1569172639, 3640380877, 3192250064 },
-        { 1376060847, 1569172639, 2027936709 } },
-      // Matrix for nskip = 6 * 8 ^ 45:
-      { { 3177388361, 1344488735, 2994552097 },
-        { 284988983, 3177388361, 3227966904 },
-        { 2044803401, 284988983, 4277058832 } },
-      // Matrix for nskip = 7 * 8 ^ 45:
-      { { 3412413108, 4186230758, 3922996456 },
-        { 3683836901, 3412413108, 271458827 },
-        { 3964969101, 3683836901, 539759068 } },
-      // Matrix for nskip = 1 * 8 ^ 46:
-      { { 3524411799, 932865240, 1838275365 },
-        { 1789634890, 3524411799, 4130736474 },
-        { 2252266098, 1789634890, 3048775967 } },
-      // Matrix for nskip = 2 * 8 ^ 46:
-      { { 1773339925, 948403862, 1999624391 },
-        { 983864203, 1773339925, 3734776305 },
-        { 314407045, 983864203, 2648614071 } },
-      // Matrix for nskip = 3 * 8 ^ 46:
-      { { 1928167136, 2078532030, 1690025039 },
-        { 2529043017, 1928167136, 1858653225 },
-        { 2142588179, 2529043017, 2188623418 } },
-      // Matrix for nskip = 4 * 8 ^ 46:
-      { { 321802921, 1099164995, 2112167358 },
-        { 3760936985, 321802921, 1003573324 },
-        { 3758858458, 3760936985, 4014658840 } },
-      // Matrix for nskip = 5 * 8 ^ 46:
-      { { 774593807, 1711411238, 3653945922 },
-        { 1751249890, 774593807, 10024535 },
-        { 9872042, 1751249890, 2762944894 } },
-      // Matrix for nskip = 6 * 8 ^ 46:
-      { { 2825735696, 1396615016, 3702967335 },
-        { 3652693925, 2825735696, 4120492766 },
-        { 1992385943, 3652693925, 686943862 } },
-      // Matrix for nskip = 7 * 8 ^ 46:
-      { { 2314946087, 4102352240, 989909889 },
-        { 459855750, 2314946087, 1424771850 },
-        { 1469834717, 459855750, 2094187769 } },
-      // Matrix for nskip = 1 * 8 ^ 47:
-      { { 2196438580, 805386227, 4266375092 },
-        { 4124675351, 2196438580, 2527961345 },
-        { 94452540, 4124675351, 2825656399 } },
-      // Matrix for nskip = 2 * 8 ^ 47:
-      { { 66735368, 2228005807, 4186703168 },
-        { 2624855312, 66735368, 2708679078 },
-        { 4098470056, 2624855312, 1773862183 } },
-      // Matrix for nskip = 3 * 8 ^ 47:
-      { { 320933009, 1915174474, 3744070526 },
-        { 562558814, 320933009, 1706424966 },
-        { 413766233, 562558814, 2881230326 } },
-      // Matrix for nskip = 4 * 8 ^ 47:
-      { { 3072642883, 2746897053, 2690305546 },
-        { 1105106652, 3072642883, 4047666135 },
-        { 2862886282, 1105106652, 3597347398 } },
-      // Matrix for nskip = 5 * 8 ^ 47:
-      { { 1498353481, 3428325510, 1424606567 },
-        { 372840925, 1498353481, 1901161856 },
-        { 1201903815, 372840925, 1622747589 } },
-      // Matrix for nskip = 6 * 8 ^ 47:
-      { { 3754310983, 2829438112, 3947637114 },
-        { 2617184648, 3754310983, 3119630359 },
-        { 2102395010, 2617184648, 2313448358 } },
-      // Matrix for nskip = 7 * 8 ^ 47:
-      { { 2033651727, 3918276995, 2324222273 },
-        { 2517499860, 2033651727, 3237758154 },
-        { 3966641526, 2517499860, 2296152269 } },
-      // Matrix for nskip = 1 * 8 ^ 48:
-      { { 232906611, 3873338256, 4051554873 },
-        { 3027413363, 232906611, 3159432673 },
-        { 3872967050, 3027413363, 987156327 } },
-      // Matrix for nskip = 2 * 8 ^ 48:
-      { { 1160686753, 3676603152, 1635979789 },
-        { 1447386846, 1160686753, 2670438424 },
-        { 816212890, 1447386846, 4288868534 } },
-      // Matrix for nskip = 3 * 8 ^ 48:
-      { { 232406022, 1438391315, 351811028 },
-        { 792615675, 232406022, 2249558877 },
-        { 4000461186, 792615675, 3773572468 } },
-      // Matrix for nskip = 4 * 8 ^ 48:
-      { { 3825238244, 1445162354, 2362389441 },
-        { 3440193648, 3825238244, 3520937545 },
-        { 2652790808, 3440193648, 405299994 } },
-      // Matrix for nskip = 5 * 8 ^ 48:
-      { { 1153297111, 1584881761, 3755481813 },
-        { 2565782177, 1153297111, 595979811 },
-        { 3520546605, 2565782177, 1485833084 } },
-      // Matrix for nskip = 6 * 8 ^ 48:
-      { { 2264796250, 1995295374, 4156333842 },
-        { 4182411213, 2264796250, 3692855966 },
-        { 2398102705, 4182411213, 135106935 } },
-      // Matrix for nskip = 7 * 8 ^ 48:
-      { { 1510709042, 3654924984, 4137143940 },
-        { 3411234559, 1510709042, 3713963703 },
-        { 3111723660, 3411234559, 3580357515 } },
-      // Matrix for nskip = 1 * 8 ^ 49:
-      { { 1984094858, 532165989, 2027397575 },
-        { 1455977136, 1984094858, 2433255524 },
-        { 1039994763, 1455977136, 2069333087 } },
-      // Matrix for nskip = 2 * 8 ^ 49:
-      { { 3680843319, 2332949611, 3516795313 },
-        { 2033851810, 3680843319, 3843367307 },
-        { 3686294589, 2033851810, 3912995069 } },
-      // Matrix for nskip = 3 * 8 ^ 49:
-      { { 2570307024, 165497191, 3880130435 },
-        { 540713030, 2570307024, 1096034689 },
-        { 3859799631, 540713030, 3714945286 } },
-      // Matrix for nskip = 4 * 8 ^ 49:
-      { { 967423689, 1724183394, 635932799 },
-        { 641380480, 967423689, 2145297779 },
-        { 1723000412, 641380480, 455633660 } },
-      // Matrix for nskip = 5 * 8 ^ 49:
-      { { 2807559499, 2180128950, 1968769828 },
-        { 1885526032, 2807559499, 3568246807 },
-        { 1874951461, 1885526032, 2399805320 } },
-      // Matrix for nskip = 6 * 8 ^ 49:
-      { { 743327961, 3817146458, 2078921540 },
-        { 752843557, 743327961, 3382133383 },
-        { 1546279541, 752843557, 4269455046 } },
-      // Matrix for nskip = 7 * 8 ^ 49:
-      { { 355924266, 3865252236, 3092467664 },
-        { 2414940441, 355924266, 3290161562 },
-        { 493050060, 2414940441, 2727946913 } },
-      // Matrix for nskip = 1 * 8 ^ 50:
-      { { 2130938335, 1534972306, 2511584766 },
-        { 273828453, 2130938335, 3112810093 },
-        { 4084843716, 273828453, 1399334152 } },
-      // Matrix for nskip = 2 * 8 ^ 50:
-      { { 168278549, 541167592, 190177712 },
-        { 403188859, 168278549, 2092073970 },
-        { 58789558, 403188859, 2777887189 } },
-      // Matrix for nskip = 3 * 8 ^ 50:
-      { { 664028138, 360061317, 3240810721 },
-        { 3427777045, 664028138, 589375738 },
-        { 1247469758, 3427777045, 4103288151 } },
-      // Matrix for nskip = 4 * 8 ^ 50:
-      { { 634843389, 4082275720, 2092828966 },
-        { 351187677, 634843389, 1312056270 },
-        { 3347241070, 351187677, 2417192332 } },
-      // Matrix for nskip = 5 * 8 ^ 50:
-      { { 3269976890, 3103127568, 907107523 },
-        { 3154851935, 3269976890, 1078491382 },
-        { 1129461097, 3154851935, 3960596933 } },
-      // Matrix for nskip = 6 * 8 ^ 50:
-      { { 1155790154, 89494164, 1039763155 },
-        { 393005763, 1155790154, 2648470077 },
-        { 2830413843, 393005763, 1280581785 } },
-      // Matrix for nskip = 7 * 8 ^ 50:
-      { { 2340682307, 3775335435, 3604492026 },
-        { 4198859651, 2340682307, 1392463605 },
-        { 1917833692, 4198859651, 2536657316 } },
-      // Matrix for nskip = 1 * 8 ^ 51:
-      { { 443276110, 1113643788, 271102234 },
-        { 3083745876, 443276110, 3370743767 },
-        { 4200577503, 3083745876, 3298601960 } },
-      // Matrix for nskip = 2 * 8 ^ 51:
-      { { 3533393557, 764977733, 3400275098 },
-        { 144639933, 3533393557, 2646475951 },
-        { 77963866, 144639933, 3794766611 } },
-      // Matrix for nskip = 3 * 8 ^ 51:
-      { { 914011908, 1379977154, 3635095314 },
-        { 4096393357, 914011908, 962932343 },
-        { 410940557, 4096393357, 2300259911 } },
-      // Matrix for nskip = 4 * 8 ^ 51:
-      { { 4064854722, 1198665008, 2872196602 },
-        { 3274748603, 4064854722, 4164637970 },
-        { 4238693771, 3274748603, 1981721347 } },
-      // Matrix for nskip = 5 * 8 ^ 51:
-      { { 658075764, 868441731, 631337149 },
-        { 3000164892, 658075764, 3213078611 },
-        { 2494369285, 3000164892, 1969086166 } },
-      // Matrix for nskip = 6 * 8 ^ 51:
-      { { 1202027740, 1218291611, 251455117 },
-        { 1904530179, 1202027740, 1121637945 },
-        { 2014861157, 1904530179, 3331497439 } },
-      // Matrix for nskip = 7 * 8 ^ 51:
-      { { 860183345, 3722900937, 2577917907 },
-        { 184407828, 860183345, 3959662009 },
-        { 1130199284, 184407828, 1996334021 } },
-      // Matrix for nskip = 1 * 8 ^ 52:
-      { { 2279220396, 2355957139, 1417574285 },
-        { 885864931, 2279220396, 1344421653 },
-        { 1895527787, 885864931, 3726919367 } },
-      // Matrix for nskip = 2 * 8 ^ 52:
-      { { 2898100178, 2427331008, 348923199 },
-        { 3175444953, 2898100178, 4290541487 },
-        { 246118669, 3175444953, 3410622769 } },
-      // Matrix for nskip = 3 * 8 ^ 52:
-      { { 55373162, 3987120186, 2739617092 },
-        { 488341106, 55373162, 3877861726 },
-        { 468535899, 488341106, 2277317349 } },
-      // Matrix for nskip = 4 * 8 ^ 52:
-      { { 284442065, 4064194676, 2295560707 },
-        { 4182706556, 284442065, 3696899246 },
-        { 1201342255, 4182706556, 1145356382 } },
-      // Matrix for nskip = 5 * 8 ^ 52:
-      { { 854963956, 3894612396, 2185360428 },
-        { 3161673906, 854963956, 1200638109 },
-        { 808492591, 3161673906, 1983142708 } },
-      // Matrix for nskip = 6 * 8 ^ 52:
-      { { 2146747531, 896368240, 1430380976 },
-        { 1613992473, 2146747531, 901075807 },
-        { 2390399884, 1613992473, 270201416 } },
-      // Matrix for nskip = 7 * 8 ^ 52:
-      { { 1033390767, 4214343810, 3176316290 },
-        { 238941078, 1033390767, 957806905 },
-        { 3045719234, 238941078, 3992043804 } },
-      // Matrix for nskip = 1 * 8 ^ 53:
-      { { 656615546, 442908965, 3724738272 },
-        { 1624967553, 656615546, 798014134 },
-        { 1157949454, 1624967553, 496247378 } },
-      // Matrix for nskip = 2 * 8 ^ 53:
-      { { 265689579, 675056541, 3009083380 },
-        { 3820679930, 265689579, 2961990151 },
-        { 562287964, 3820679930, 1853486796 } },
-      // Matrix for nskip = 3 * 8 ^ 53:
-      { { 3115797761, 1090045712, 399035362 },
-        { 452658959, 3115797761, 3053809839 },
-        { 3970000518, 452658959, 2899502994 } },
-      // Matrix for nskip = 4 * 8 ^ 53:
-      { { 1675739167, 2319843005, 760605578 },
-        { 4161492847, 1675739167, 226142150 },
-        { 1017447188, 4161492847, 3431158427 } },
-      // Matrix for nskip = 5 * 8 ^ 53:
-      { { 1814415714, 3446998641, 1659100687 },
-        { 299018378, 1814415714, 3661851369 },
-        { 2777381296, 299018378, 730677422 } },
-      // Matrix for nskip = 6 * 8 ^ 53:
-      { { 497640593, 3005114205, 2309875696 },
-        { 3522463659, 497640593, 590519806 },
-        { 855175401, 3522463659, 1973739759 } },
-      // Matrix for nskip = 7 * 8 ^ 53:
-      { { 2668363194, 344864589, 270881279 },
-        { 981182918, 2668363194, 1986955069 },
-        { 956851812, 981182918, 3901969881 } },
-      // Matrix for nskip = 1 * 8 ^ 54:
-      { { 1759873736, 2334568602, 2154570180 },
-        { 1812793060, 1759873736, 2111094408 },
-        { 1168460586, 1812793060, 2495653141 } },
-      // Matrix for nskip = 2 * 8 ^ 54:
-      { { 317621194, 868104288, 664971082 },
-        { 2340275074, 317621194, 2168960688 },
-        { 725706104, 2340275074, 3532023115 } },
-      // Matrix for nskip = 3 * 8 ^ 54:
-      { { 3585587043, 2378713321, 2463381051 },
-        { 2919944362, 3585587043, 1464119531 },
-        { 3588451359, 2919944362, 1912059035 } },
-      // Matrix for nskip = 4 * 8 ^ 54:
-      { { 3926931954, 2907684453, 615601328 },
-        { 1132340715, 3926931954, 676995757 },
-        { 1154819290, 1132340715, 1662727700 } },
-      // Matrix for nskip = 5 * 8 ^ 54:
-      { { 918221359, 2912639129, 1883551759 },
-        { 4114315731, 918221359, 1703365082 },
-        { 2391341541, 4114315731, 3946112236 } },
-      // Matrix for nskip = 6 * 8 ^ 54:
-      { { 2495152894, 362016218, 2659927506 },
-        { 1721141770, 2495152894, 2577006096 },
-        { 73701594, 1721141770, 2683266250 } },
-      // Matrix for nskip = 7 * 8 ^ 54:
-      { { 1978338540, 424481557, 341918993 },
-        { 3862312182, 1978338540, 436776944 },
-        { 566398653, 3862312182, 1196282660 } },
-      // Matrix for nskip = 1 * 8 ^ 55:
-      { { 3921782078, 3376494857, 2969567377 },
-        { 475345024, 3921782078, 4206379953 },
-        { 1795936544, 475345024, 934679595 } },
-      // Matrix for nskip = 2 * 8 ^ 55:
-      { { 3119292228, 741613041, 2083352304 },
-        { 1047885963, 3119292228, 1581078542 },
-        { 1065969969, 1047885963, 661718928 } },
-      // Matrix for nskip = 3 * 8 ^ 55:
-      { { 3193382049, 573569291, 3880461974 },
-        { 1401117517, 3193382049, 335339494 },
-        { 2267936793, 1401117517, 2098160992 } },
-      // Matrix for nskip = 4 * 8 ^ 55:
-      { { 3643472111, 2870554228, 3995474529 },
-        { 3804264051, 3643472111, 1366457944 },
-        { 1246805564, 3804264051, 993186530 } },
-      // Matrix for nskip = 5 * 8 ^ 55:
-      { { 2693567720, 1775121226, 3619720132 },
-        { 1859333754, 2693567720, 2377603858 },
-        { 2682882800, 1859333754, 532216705 } },
-      // Matrix for nskip = 6 * 8 ^ 55:
-      { { 2520305729, 3279882298, 2663387463 },
-        { 1160802169, 2520305729, 1363372142 },
-        { 92806587, 1160802169, 3842743664 } },
-      // Matrix for nskip = 7 * 8 ^ 55:
-      { { 1402382861, 2128689614, 967911190 },
-        { 1124729601, 1402382861, 1908361865 },
-        { 2731098528, 1124729601, 3607037865 } },
-      // Matrix for nskip = 1 * 8 ^ 56:
-      { { 796711791, 3878204845, 3160293932 },
-        { 255632881, 796711791, 3778927111 },
-        { 3472564181, 255632881, 388382377 } },
-      // Matrix for nskip = 2 * 8 ^ 56:
-      { { 1776984101, 1742284034, 3449763933 },
-        { 1349354417, 1776984101, 1264780832 },
-        { 715722511, 1349354417, 1213319489 } },
-      // Matrix for nskip = 3 * 8 ^ 56:
-      { { 3231284907, 2981539575, 3476263944 },
-        { 3070932389, 3231284907, 4183678140 },
-        { 4073569309, 3070932389, 1095273395 } },
-      // Matrix for nskip = 4 * 8 ^ 56:
-      { { 4261866865, 1914382786, 201872335 },
-        { 614207188, 4261866865, 1853554849 },
-        { 2046042882, 614207188, 3193186353 } },
-      // Matrix for nskip = 5 * 8 ^ 56:
-      { { 4179922982, 2821238835, 3720886954 },
-        { 1712333408, 4179922982, 2683472927 },
-        { 2838663503, 1712333408, 3967303913 } },
-      // Matrix for nskip = 6 * 8 ^ 56:
-      { { 2701381139, 3664845069, 2023182114 },
-        { 2420177830, 2701381139, 1924402503 },
-        { 3429706463, 2420177830, 2803635446 } },
-      // Matrix for nskip = 7 * 8 ^ 56:
-      { { 4122275824, 2032046756, 1051494202 },
-        { 2221023672, 4122275824, 722305627 },
-        { 547107197, 2221023672, 2228432272 } },
-      // Matrix for nskip = 1 * 8 ^ 57:
-      { { 2210205512, 2847073169, 3324925707 },
-        { 1251969297, 2210205512, 3491451503 },
-        { 470400916, 1251969297, 2184392547 } },
-      // Matrix for nskip = 2 * 8 ^ 57:
-      { { 1523590942, 2391111113, 68341529 },
-        { 295466806, 1523590942, 4143310876 },
-        { 3527253079, 295466806, 4059123142 } },
-      // Matrix for nskip = 3 * 8 ^ 57:
-      { { 3667945349, 431655152, 2687669798 },
-        { 1584045661, 3667945349, 2642149990 },
-        { 2169193555, 1584045661, 2115882504 } },
-      // Matrix for nskip = 4 * 8 ^ 57:
-      { { 1406902110, 3735012720, 1774518130 },
-        { 1814959027, 1406902110, 1560544267 },
-        { 346472965, 1814959027, 964257199 } },
-      // Matrix for nskip = 5 * 8 ^ 57:
-      { { 2718256179, 4102604932, 4277499868 },
-        { 3681834937, 2718256179, 4201441381 },
-        { 1715953284, 3681834937, 1112580533 } },
-      // Matrix for nskip = 6 * 8 ^ 57:
-      { { 992368492, 2710608111, 2674694909 },
-        { 3754191262, 992368492, 1060465580 },
-        { 2574962339, 3754191262, 60540513 } },
-      // Matrix for nskip = 7 * 8 ^ 57:
-      { { 1719209658, 2756912996, 4193028814 },
-        { 4256860235, 1719209658, 3552491408 },
-        { 1070852068, 4256860235, 3586319939 } },
-      // Matrix for nskip = 1 * 8 ^ 58:
-      { { 855309653, 4208503105, 1518467541 },
-        { 2025248418, 855309653, 4148125749 },
-        { 1349947330, 2025248418, 1168504873 } },
-      // Matrix for nskip = 2 * 8 ^ 58:
-      { { 2375338156, 3629519168, 409696181 },
-        { 252401654, 2375338156, 3992097193 },
-        { 2793725401, 252401654, 1350184085 } },
-      // Matrix for nskip = 3 * 8 ^ 58:
-      { { 2856909490, 1191427722, 3088217623 },
-        { 3529719882, 2856909490, 204704202 },
-        { 1918223997, 3529719882, 2282426993 } },
-      // Matrix for nskip = 4 * 8 ^ 58:
-      { { 873141039, 3885583138, 361604799 },
-        { 3554143374, 873141039, 894746180 },
-        { 1919765327, 3554143374, 876210854 } },
-      // Matrix for nskip = 5 * 8 ^ 58:
-      { { 652228317, 107568976, 2576316170 },
-        { 790910548, 652228317, 1352723275 },
-        { 1091561936, 790910548, 1291982092 } },
-      // Matrix for nskip = 6 * 8 ^ 58:
-      { { 3452179482, 4206785268, 2363956864 },
-        { 2619693001, 3452179482, 54522393 },
-        { 4241208723, 2619693001, 2583115784 } },
-      // Matrix for nskip = 7 * 8 ^ 58:
-      { { 547180410, 904354606, 3387638559 },
-        { 2429997228, 547180410, 1350013492 },
-        { 4258335371, 2429997228, 1689405508 } },
-      // Matrix for nskip = 1 * 8 ^ 59:
-      { { 246368794, 1703793169, 2317362874 },
-        { 2300930144, 246368794, 2560214589 },
-        { 2016163623, 2300930144, 1504276775 } },
-      // Matrix for nskip = 2 * 8 ^ 59:
-      { { 1574610921, 2147546631, 4103450226 },
-        { 107416526, 1574610921, 1773803959 },
-        { 1402542742, 107416526, 550063800 } },
-      // Matrix for nskip = 3 * 8 ^ 59:
-      { { 2364572364, 3566983915, 468574833 },
-        { 3825719596, 2364572364, 3679744745 },
-        { 2445832362, 3825719596, 1752846470 } },
-      // Matrix for nskip = 4 * 8 ^ 59:
-      { { 363388665, 592194244, 1746615522 },
-        { 2637234667, 363388665, 4031408742 },
-        { 2895130475, 2637234667, 296510335 } },
-      // Matrix for nskip = 5 * 8 ^ 59:
-      { { 208003776, 91247399, 1566440482 },
-        { 2144494056, 208003776, 1022614336 },
-        { 2439698058, 2144494056, 4292230862 } },
-      // Matrix for nskip = 6 * 8 ^ 59:
-      { { 2823846657, 4257316854, 3340983277 },
-        { 218486499, 2823846657, 3142931989 },
-        { 2351513088, 218486499, 3471595726 } },
-      // Matrix for nskip = 7 * 8 ^ 59:
-      { { 3562083579, 3058668461, 1588504573 },
-        { 2047897620, 3562083579, 1674831117 },
-        { 965798968, 2047897620, 1212961148 } },
-      // Matrix for nskip = 1 * 8 ^ 60:
-      { { 3997368560, 3047771871, 3178383826 },
-        { 1160174754, 3997368560, 4027094919 },
-        { 1234984211, 1160174754, 4226264344 } },
-      // Matrix for nskip = 2 * 8 ^ 60:
-      { { 3303179301, 4243968063, 3235964171 },
-        { 1776841674, 3303179301, 2867287469 },
-        { 1500495759, 1776841674, 1708226553 } },
-      // Matrix for nskip = 3 * 8 ^ 60:
-      { { 1859001036, 2962890971, 2391336228 },
-        { 1694166096, 1859001036, 593465055 },
-        { 1377070160, 1694166096, 2513927224 } },
-      // Matrix for nskip = 4 * 8 ^ 60:
-      { { 1482944153, 3192311574, 354466071 },
-        { 3932773012, 1482944153, 389193591 },
-        { 3350181058, 3932773012, 3398059015 } },
-      // Matrix for nskip = 5 * 8 ^ 60:
-      { { 3478906695, 565159378, 3563812138 },
-        { 2637114657, 3478906695, 1117546206 },
-        { 909882870, 2637114657, 2819889512 } },
-      // Matrix for nskip = 6 * 8 ^ 60:
-      { { 3406907174, 3949116664, 536198867 },
-        { 3969663510, 3406907174, 915271858 },
-        { 1537382635, 3969663510, 1154112679 } },
-      // Matrix for nskip = 7 * 8 ^ 60:
-      { { 1488624292, 2799268852, 4148140705 },
-        { 2326140461, 1488624292, 2413540258 },
-        { 3071215524, 2326140461, 1918378675 } },
-      // Matrix for nskip = 1 * 8 ^ 61:
-      { { 640968550, 3226860971, 922372912 },
-        { 1254989667, 640968550, 2383815228 },
-        { 2027371896, 1254989667, 2925300409 } },
-      // Matrix for nskip = 2 * 8 ^ 61:
-      { { 2313146046, 3910187183, 1377591475 },
-        { 1689291784, 2313146046, 4255405993 },
-        { 1650609719, 1689291784, 1897624297 } },
-      // Matrix for nskip = 3 * 8 ^ 61:
-      { { 3547277681, 272901338, 2842437455 },
-        { 1746901015, 3547277681, 4272690944 },
-        { 2000451168, 1746901015, 417326012 } },
-      // Matrix for nskip = 4 * 8 ^ 61:
-      { { 3656310954, 882924050, 2702189958 },
-        { 3185020283, 3656310954, 1923190496 },
-        { 2449669145, 3185020283, 4235849984 } },
-      // Matrix for nskip = 5 * 8 ^ 61:
-      { { 3659342577, 1641516630, 2539516650 },
-        { 2275633679, 3659342577, 167207049 },
-        { 1798452176, 2275633679, 1651075902 } },
-      // Matrix for nskip = 6 * 8 ^ 61:
-      { { 1932812117, 4060977130, 4129096120 },
-        { 4247470915, 1932812117, 1398719693 },
-        { 101546088, 4247470915, 103612315 } },
-      // Matrix for nskip = 7 * 8 ^ 61:
-      { { 3420997084, 2682742609, 1335389027 },
-        { 3883479775, 3420997084, 1501959755 },
-        { 1647828648, 3883479775, 3801963100 } },
-      // Matrix for nskip = 1 * 8 ^ 62:
-      { { 377232416, 1498446142, 4229103619 },
-        { 3926377906, 377232416, 600268838 },
-        { 511317726, 3926377906, 216160452 } },
-      // Matrix for nskip = 2 * 8 ^ 62:
-      { { 1969399344, 3273966859, 4220943579 },
-        { 3952111894, 1969399344, 575096961 },
-        { 3815277103, 3952111894, 792177412 } },
-      // Matrix for nskip = 3 * 8 ^ 62:
-      { { 1779275464, 2781126556, 2466688033 },
-        { 1573179329, 1779275464, 2922475892 },
-        { 3416534728, 1573179329, 2830179495 } },
-      // Matrix for nskip = 4 * 8 ^ 62:
-      { { 2957238169, 1410010554, 1523740068 },
-        { 3949237584, 2957238169, 74149658 },
-        { 2564746147, 3949237584, 2557663578 } },
-      // Matrix for nskip = 5 * 8 ^ 62:
-      { { 2132274169, 3311898863, 3609324462 },
-        { 3719565953, 2132274169, 3678195166 },
-        { 284265108, 3719565953, 4278461540 } },
-      // Matrix for nskip = 6 * 8 ^ 62:
-      { { 2540404064, 675336157, 1264339488 },
-        { 29787664, 2540404064, 3475225382 },
-        { 591030331, 29787664, 1242712946 } },
-      // Matrix for nskip = 7 * 8 ^ 62:
-      { { 3161673998, 796026877, 3360592842 },
-        { 1326727008, 3161673998, 3697232048 },
-        { 330692835, 1326727008, 3520194976 } },
-      // Matrix for nskip = 1 * 8 ^ 63:
-      { { 3377318569, 1927835240, 2556102508 },
-        { 3022040116, 3377318569, 2549406364 },
-        { 2387074241, 3022040116, 1477293711 } },
-      // Matrix for nskip = 2 * 8 ^ 63:
-      { { 257306870, 1748489735, 547809226 },
-        { 3708493374, 257306870, 4183546362 },
-        { 4435502, 3708493374, 1607696753 } },
-      // Matrix for nskip = 3 * 8 ^ 63:
-      { { 2404623323, 4132820260, 1615062394 },
-        { 1844725476, 2404623323, 570318859 },
-        { 2839043606, 1844725476, 1375837008 } },
-      // Matrix for nskip = 4 * 8 ^ 63:
-      { { 4076910933, 930542270, 3433720143 },
-        { 675898567, 4076910933, 892406741 },
-        { 5625977, 675898567, 2412946221 } },
-      // Matrix for nskip = 5 * 8 ^ 63:
-      { { 3143857447, 1394551864, 4202002846 },
-        { 973255696, 3143857447, 3968325674 },
-        { 2327635494, 973255696, 1217794308 } },
-      // Matrix for nskip = 6 * 8 ^ 63:
-      { { 2448094751, 2840824567, 1627957632 },
-        { 1469753239, 2448094751, 4063581553 },
-        { 3388871077, 1469753239, 3521935017 } },
-      // Matrix for nskip = 7 * 8 ^ 63:
-      { { 1593620760, 1002861683, 2173731154 },
-        { 3577868319, 1593620760, 39982755 },
-        { 3566899985, 3577868319, 207847804 } },
-      // Matrix for nskip = 1 * 8 ^ 64:
-      { { 2146755704, 2635194649, 1512299181 },
-        { 3860948634, 2146755704, 3641948767 },
-        { 3872596381, 3860948634, 1350534123 } },
-      // Matrix for nskip = 2 * 8 ^ 64:
-      { { 2650974852, 2792146306, 1334806440 },
-        { 3511147120, 2650974852, 3467471104 },
-        { 2826608091, 3511147120, 3185213777 } },
-      // Matrix for nskip = 3 * 8 ^ 64:
-      { { 4154591539, 929373784, 2614972987 },
-        { 617404183, 4154591539, 1283899280 },
-        { 637243382, 617404183, 1889016496 } },
-      // Matrix for nskip = 4 * 8 ^ 64:
-      { { 1735625475, 2923145251, 885546512 },
-        { 926645131, 1735625475, 2358202840 },
-        { 3503695789, 926645131, 2511917556 } },
-      // Matrix for nskip = 5 * 8 ^ 64:
-      { { 3169405477, 2071788237, 2197719325 },
-        { 3454276765, 3169405477, 354513440 },
-        { 3433509316, 3454276765, 3884018107 } },
-      // Matrix for nskip = 6 * 8 ^ 64:
-      { { 154139786, 961249414, 3740576106 },
-        { 1113118249, 154139786, 3880685356 },
-        { 177260972, 1113118249, 1811433812 } },
-      // Matrix for nskip = 7 * 8 ^ 64:
-      { { 2636917497, 3922853891, 3167851814 },
-        { 911696899, 2636917497, 1449426394 },
-        { 2845905825, 911696899, 1062710260 } } }
-};
-
-} // namespace mrg32k3a_impl
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_MRG32K3A_SKIP_AHEAD_MATRIX_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/philox4x32x10_impl.hpp b/include/oneapi/mkl/rng/device/detail/philox4x32x10_impl.hpp
deleted file mode 100644
index f061bb754..000000000
--- a/include/oneapi/mkl/rng/device/detail/philox4x32x10_impl.hpp
+++ /dev/null
@@ -1,552 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_PHILOX4X32X10_IMPL_HPP_
-#define _MKL_RNG_DEVICE_PHILOX4X32X10_IMPL_HPP_
-
-#include <utility> // std::pair
-
-namespace oneapi::mkl::rng::device {
-
-template <std::int32_t VecSize = 1>
-class philox4x32x10;
-
-namespace detail {
-
-template <std::int32_t VecSize>
-struct engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>> {
-    std::uint32_t key[2];
-    std::uint32_t counter[4];
-    std::uint32_t part;
-    std::uint32_t result[4];
-};
-
-namespace philox4x32x10_impl {
-
-static inline void add128(std::uint32_t* a, std::uint64_t b) {
-    std::uint64_t tmp = ((static_cast<std::uint64_t>(a[1]) << 32) | a[0]);
-
-    tmp += b;
-
-    a[0] = static_cast<std::uint32_t>(tmp);
-    a[1] = static_cast<std::uint32_t>(tmp >> 32);
-
-    if (tmp < b) {
-        tmp = ((static_cast<std::uint64_t>(a[3]) << 32) | a[2]) + 1;
-
-        a[2] = static_cast<std::uint32_t>(tmp);
-        a[3] = static_cast<std::uint32_t>(tmp >> 32);
-    }
-    return;
-}
-
-static inline void add128_1(std::uint32_t* a) {
-    if (++a[0]) {
-        return;
-    }
-    if (++a[1]) {
-        return;
-    }
-    if (++a[2]) {
-        return;
-    }
-    ++a[3];
-}
-
-static inline std::pair<std::uint32_t, std::uint32_t> mul_hilo_32(std::uint32_t a,
-                                                                  std::uint32_t b) {
-    std::uint64_t res_64 = static_cast<std::uint64_t>(a) * static_cast<std::uint64_t>(b);
-    return std::make_pair(static_cast<std::uint32_t>(res_64),
-                          static_cast<std::uint32_t>(res_64 >> 32));
-}
-
-static inline void round(std::uint32_t* cnt, std::uint32_t* k) {
-    auto [L0, H0] = mul_hilo_32(0xD2511F53, cnt[0]);
-    auto [L1, H1] = mul_hilo_32(0xCD9E8D57, cnt[2]);
-
-    cnt[0] = H1 ^ cnt[1] ^ k[0];
-    cnt[1] = L1;
-    cnt[2] = H0 ^ cnt[3] ^ k[1];
-    cnt[3] = L0;
-}
-
-static inline void round_10(std::uint32_t* cnt, std::uint32_t* k) {
-    round(cnt, k); // 1
-    // increasing keys with philox4x32x10 constants
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 2
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 3
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 4
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 5
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 6
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 7
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 8
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 9
-    k[0] += 0x9E3779B9;
-    k[1] += 0xBB67AE85;
-    round(cnt, k); // 10
-}
-
-template <std::int32_t VecSize>
-static inline void skip_ahead(engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>>& state,
-                              std::uint64_t num_to_skip) {
-    std::uint64_t num_to_skip_tmp = num_to_skip;
-    std::uint64_t c_inc;
-    std::uint32_t counter[4];
-    std::uint32_t key[2];
-    std::uint64_t tail;
-    if (num_to_skip_tmp <= state.part) {
-        state.part -= num_to_skip_tmp;
-    }
-    else {
-        tail = num_to_skip % 4;
-        if ((tail == 0) && (state.part == 0)) {
-            add128(state.counter, num_to_skip / 4);
-        }
-        else {
-            num_to_skip_tmp = num_to_skip_tmp - state.part;
-            state.part = 0;
-            c_inc = (num_to_skip_tmp - 1) / 4;
-            state.part = (4 - num_to_skip_tmp % 4) % 4;
-            add128(state.counter, c_inc);
-            counter[0] = state.counter[0];
-            counter[1] = state.counter[1];
-            counter[2] = state.counter[2];
-            counter[3] = state.counter[3];
-            key[0] = state.key[0];
-            key[1] = state.key[1];
-            round_10(counter, key);
-            state.result[0] = counter[0];
-            state.result[1] = counter[1];
-            state.result[2] = counter[2];
-            state.result[3] = counter[3];
-            add128_1(state.counter);
-        }
-    }
-}
-
-template <std::int32_t VecSize>
-static inline void skip_ahead(engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>>& state,
-                              std::uint64_t n, const std::uint64_t* num_to_skip_ptr) {
-    constexpr std::uint64_t uint_max = 0xFFFFFFFFFFFFFFFF;
-    std::uint64_t post_buffer, pre_buffer;
-    std::int32_t num_elements = 0;
-    std::int32_t remained_counter;
-    std::uint64_t tmp_skip_array[3] = { 0, 0, 0 };
-
-    for (std::uint64_t i = 0; (i < 3) && (i < n); i++) {
-        tmp_skip_array[i] = num_to_skip_ptr[i];
-        if (tmp_skip_array[i]) {
-            num_elements = i + 1;
-        }
-    }
-
-    if (num_elements == 0) {
-        return;
-    }
-    if ((num_elements == 1) && (tmp_skip_array[0] <= state.part)) {
-        state.part -= static_cast<std::uint32_t>(tmp_skip_array[0]);
-        return;
-    }
-    std::uint32_t counter[4];
-    std::uint32_t key[2];
-
-    if ((tmp_skip_array[0] - state.part) <= tmp_skip_array[0]) {
-        tmp_skip_array[0] = tmp_skip_array[0] - state.part;
-    }
-    else if ((num_elements == 2) || (tmp_skip_array[1] - 1 < tmp_skip_array[1])) {
-        tmp_skip_array[1] = tmp_skip_array[1] - 1;
-        tmp_skip_array[0] = uint_max - state.part + tmp_skip_array[0];
-    }
-    else {
-        tmp_skip_array[2] = tmp_skip_array[2] - 1;
-        tmp_skip_array[1] = uint_max - 1;
-        tmp_skip_array[0] = uint_max - state.part + tmp_skip_array[0];
-    }
-
-    state.part = 0;
-
-    post_buffer = 0;
-
-    remained_counter = static_cast<std::uint32_t>(tmp_skip_array[0] % 4);
-
-    for (int i = num_elements - 1; i >= 0; i--) {
-        pre_buffer = (tmp_skip_array[i] << 62);
-        tmp_skip_array[i] >>= 2;
-        tmp_skip_array[i] |= post_buffer;
-        post_buffer = pre_buffer;
-    }
-
-    state.part = 4 - remained_counter;
-
-    std::uint64_t counter64[] = { state.counter[1], state.counter[3] };
-    counter64[0] = ((counter64[0] << 32ull) | state.counter[0]);
-    counter64[1] = ((counter64[1] << 32ull) | state.counter[2]);
-
-    counter64[0] += tmp_skip_array[0];
-
-    if (counter64[0] < tmp_skip_array[0]) {
-        counter64[1]++;
-    }
-
-    counter64[1] += tmp_skip_array[1];
-
-    counter[0] = static_cast<std::uint32_t>(counter64[0]);
-    counter[1] = static_cast<std::uint32_t>(counter64[0] >> 32);
-    counter[2] = static_cast<std::uint32_t>(counter64[1]);
-    counter[3] = static_cast<std::uint32_t>(counter64[1] >> 32);
-
-    key[0] = state.key[0];
-    key[1] = state.key[1];
-
-    round_10(counter, key);
-
-    state.result[0] = counter[0];
-    state.result[1] = counter[1];
-    state.result[2] = counter[2];
-    state.result[3] = counter[3];
-
-    counter64[0]++;
-
-    if (counter64[0] < 1) {
-        counter64[1]++;
-    }
-
-    state.counter[0] = static_cast<std::uint32_t>(counter64[0]);
-    state.counter[1] = static_cast<std::uint32_t>(counter64[0] >> 32);
-    state.counter[2] = static_cast<std::uint32_t>(counter64[1]);
-    state.counter[3] = static_cast<std::uint32_t>(counter64[1] >> 32);
-}
-
-template <std::int32_t VecSize>
-static inline void init(engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>>& state,
-                        std::uint64_t n, const std::uint64_t* seed_ptr, std::uint64_t offset) {
-    state.key[0] = static_cast<std::uint32_t>(seed_ptr[0]);
-    state.key[1] = static_cast<std::uint32_t>(seed_ptr[0] >> 32);
-
-    state.counter[0] = (n >= 2 ? static_cast<std::uint32_t>(seed_ptr[1]) : 0);
-    state.counter[1] = (n >= 2 ? static_cast<std::uint32_t>(seed_ptr[1] >> 32) : 0);
-
-    state.counter[2] = (n >= 3 ? static_cast<std::uint32_t>(seed_ptr[2]) : 0);
-    state.counter[3] = (n >= 3 ? static_cast<std::uint32_t>(seed_ptr[2] >> 32) : 0);
-
-    state.part = 0;
-    state.result[0] = 0;
-    state.result[1] = 0;
-    state.result[2] = 0;
-    state.result[3] = 0;
-    skip_ahead(state, offset);
-}
-
-template <std::int32_t VecSize>
-static inline void init(engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>>& state,
-                        std::uint64_t n, const std::uint64_t* seed_ptr, std::uint64_t n_offset,
-                        const std::uint64_t* offset_ptr) {
-    state.key[0] = static_cast<std::uint32_t>(seed_ptr[0]);
-    state.key[1] = static_cast<std::uint32_t>(seed_ptr[0] >> 32);
-
-    state.counter[0] = (n >= 2 ? static_cast<std::uint32_t>(seed_ptr[1]) : 0);
-    state.counter[1] = (n >= 2 ? static_cast<std::uint32_t>(seed_ptr[1] >> 32) : 0);
-
-    state.counter[2] = (n >= 3 ? static_cast<std::uint32_t>(seed_ptr[2]) : 0);
-    state.counter[3] = (n >= 3 ? static_cast<std::uint32_t>(seed_ptr[2] >> 32) : 0);
-
-    state.part = 0;
-    state.result[0] = 0;
-    state.result[1] = 0;
-    state.result[2] = 0;
-    state.result[3] = 0;
-    skip_ahead(state, n_offset, offset_ptr);
-}
-
-// for VecSize > 4
-template <std::int32_t VecSize>
-__attribute__((always_inline)) static inline sycl::vec<std::uint32_t, VecSize> generate_full(
-    engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>>& state) {
-    const std::int32_t num_elements = VecSize;
-    sycl::vec<std::uint32_t, VecSize> res;
-
-    std::uint32_t counter[4];
-
-    int i = 0;
-    int part = (int)state.part;
-    while (part && (i < num_elements)) {
-        res[i++] = state.result[3 - (--part)];
-    }
-    if (i == num_elements) {
-        skip_ahead(state, num_elements);
-        return res;
-    }
-
-    counter[0] = state.counter[0];
-    counter[1] = state.counter[1];
-    counter[2] = state.counter[2];
-    counter[3] = state.counter[3];
-
-    std::uint32_t cntTmp[4];
-    std::uint32_t keyTmp[2];
-    for (; i < num_elements; i += 4) {
-        cntTmp[0] = counter[0];
-        cntTmp[1] = counter[1];
-        cntTmp[2] = counter[2];
-        cntTmp[3] = counter[3];
-
-        keyTmp[0] = state.key[0];
-        keyTmp[1] = state.key[1];
-
-        round_10(cntTmp, keyTmp);
-
-        if (i + 4 <= num_elements) {
-            for (int j = 0; j < 4; j++) {
-                res[i + j] = cntTmp[j];
-            }
-            add128_1(counter);
-        }
-        else {
-            // here if last iteration
-            for (int j = 0; i < num_elements; i++, j++) {
-                res[i] = cntTmp[j];
-            }
-        }
-    }
-    skip_ahead(state, num_elements);
-    return res;
-}
-
-// for VecSize <= 4
-template <std::int32_t VecSize>
-__attribute__((always_inline)) static inline sycl::vec<std::uint32_t, VecSize> generate_small(
-    engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>>& state) {
-    const std::int32_t num_elements = VecSize;
-    sycl::vec<std::uint32_t, VecSize> res;
-
-    std::uint32_t counter[4];
-    std::uint32_t key[2];
-
-    int i = 0;
-    int part = (int)state.part;
-    while (part && (i < num_elements)) {
-        res[i++] = state.result[3 - (--part)];
-    }
-    if (i == num_elements) {
-        skip_ahead(state, num_elements);
-        return res;
-    }
-
-    counter[0] = state.counter[0];
-    counter[1] = state.counter[1];
-    counter[2] = state.counter[2];
-    counter[3] = state.counter[3];
-    key[0] = state.key[0];
-    key[1] = state.key[1];
-
-    round_10(counter, key);
-
-    for (int j = 0; i < num_elements; i++, j++) {
-        res[i] = counter[j];
-    }
-
-    skip_ahead(state, num_elements);
-    return res;
-}
-
-template <int VecSize>
-__attribute__((always_inline)) static inline std::uint32_t generate_single(
-    engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>>& state) {
-    std::uint32_t res;
-
-    std::uint32_t counter[4];
-    std::uint32_t key[2];
-
-    std::int32_t part = static_cast<std::int32_t>(state.part);
-    if (part != 0) {
-        res = state.result[3 - (--part)];
-        skip_ahead(state, 1);
-        return res;
-    }
-    counter[0] = state.counter[0];
-    counter[1] = state.counter[1];
-    counter[2] = state.counter[2];
-    counter[3] = state.counter[3];
-    key[0] = state.key[0];
-    key[1] = state.key[1];
-
-    round_10(counter, key);
-
-    res = counter[0];
-
-    skip_ahead(state, 1);
-    return res;
-}
-
-} // namespace philox4x32x10_impl
-
-template <std::int32_t VecSize>
-class engine_base<oneapi::mkl::rng::device::philox4x32x10<VecSize>> {
-protected:
-    engine_base(std::uint64_t seed, std::uint64_t offset = 0) {
-        philox4x32x10_impl::init(this->state_, 1, &seed, offset);
-    }
-
-    engine_base(std::uint64_t n, const std::uint64_t* seed, std::uint64_t offset = 0) {
-        philox4x32x10_impl::init(this->state_, n, seed, offset);
-    }
-
-    engine_base(std::uint64_t seed, std::uint64_t n_offset, const std::uint64_t* offset_ptr) {
-        philox4x32x10_impl::init(this->state_, 1, &seed, n_offset, offset_ptr);
-    }
-
-    engine_base(std::uint64_t n, const std::uint64_t* seed, std::uint64_t n_offset,
-                const std::uint64_t* offset_ptr) {
-        philox4x32x10_impl::init(this->state_, n, seed, n_offset, offset_ptr);
-    }
-
-    template <typename RealType>
-    __attribute__((always_inline)) inline auto generate(RealType a, RealType b) ->
-        typename std::conditional<VecSize == 1, RealType, sycl::vec<RealType, VecSize>>::type {
-        sycl::vec<RealType, VecSize> res;
-        sycl::vec<std::uint32_t, VecSize> res_uint;
-        RealType a1;
-        RealType c1;
-
-        c1 = (b - a) / (static_cast<RealType>((std::numeric_limits<std::uint32_t>::max)()) + 1);
-        a1 = (b + a) / static_cast<RealType>(2.0);
-
-        if constexpr (VecSize > 4) {
-            res_uint = philox4x32x10_impl::generate_full(this->state_);
-        }
-        else {
-            res_uint = philox4x32x10_impl::generate_small(this->state_);
-        }
-        for (int i = 0; i < VecSize; i++) {
-            res[i] = static_cast<RealType>(static_cast<std::int32_t>(res_uint[i])) * c1 + a1;
-        }
-        return res;
-    }
-
-    __attribute__((always_inline)) inline auto generate() ->
-        typename std::conditional<VecSize == 1, std::uint32_t,
-                                  sycl::vec<std::uint32_t, VecSize>>::type {
-        if constexpr (VecSize > 4) {
-            return philox4x32x10_impl::generate_full(this->state_);
-        }
-        return philox4x32x10_impl::generate_small(this->state_);
-    }
-
-    template <typename UIntType>
-    __attribute__((always_inline)) inline auto generate_uniform_bits() ->
-        typename std::conditional<VecSize == 1, UIntType, sycl::vec<UIntType, VecSize>>::type {
-        if constexpr (std::is_same<UIntType, std::uint32_t>::value) {
-            return generate();
-        }
-        else {
-            auto uni_res1 = generate();
-            auto uni_res2 = generate();
-
-            if constexpr (VecSize == 1) {
-                return (static_cast<std::uint64_t>(uni_res2) << 32) + uni_res1;
-            }
-            else {
-                sycl::vec<std::uint64_t, VecSize> vec_out;
-
-                if constexpr (VecSize != 3) {
-                    for (int i = 0; i < VecSize / 2; i++) {
-                        vec_out[i] = (static_cast<std::uint64_t>(uni_res1[2 * i + 1]) << 32) +
-                                     uni_res1[2 * i];
-                        vec_out[i + VecSize / 2] =
-                            (static_cast<std::uint64_t>(uni_res2[2 * i + 1]) << 32) +
-                            uni_res2[2 * i];
-                    }
-                }
-                else {
-                    vec_out[0] = (static_cast<std::uint64_t>(uni_res1[1]) << 32) + uni_res1[0];
-                    vec_out[1] = (static_cast<std::uint64_t>(uni_res2[0]) << 32) + uni_res1[2];
-                    vec_out[2] = (static_cast<std::uint64_t>(uni_res2[2]) << 32) + uni_res2[1];
-                }
-
-                return vec_out;
-            }
-        }
-    }
-
-    template <typename RealType>
-    RealType generate_single(RealType a, RealType b) {
-        RealType res;
-        std::uint32_t res_uint;
-        RealType a1;
-        RealType c1;
-
-        c1 = (b - a) / (static_cast<RealType>((std::numeric_limits<std::uint32_t>::max)()) + 1);
-        a1 = (b + a) / static_cast<RealType>(2.0);
-
-        res_uint = philox4x32x10_impl::generate_single(this->state_);
-
-        res = static_cast<RealType>(static_cast<std::int32_t>(res_uint)) * c1 + a1;
-
-        return res;
-    }
-
-    __attribute__((always_inline)) inline std::uint32_t generate_single() {
-        return philox4x32x10_impl::generate_single(this->state_);
-    }
-
-    template <typename UIntType>
-    __attribute__((always_inline)) inline auto generate_single_uniform_bits() {
-        if constexpr (std::is_same<UIntType, std::uint32_t>::value) {
-            return philox4x32x10_impl::generate_single(this->state_);
-        }
-        else {
-            auto uni_res1 = philox4x32x10_impl::generate_single(this->state_);
-            auto uni_res2 = philox4x32x10_impl::generate_single(this->state_);
-
-            return (static_cast<std::uint64_t>(uni_res2) << 32) + uni_res1;
-        }
-    }
-
-    void skip_ahead(std::uint64_t num_to_skip) {
-        detail::philox4x32x10_impl::skip_ahead(this->state_, num_to_skip);
-    }
-
-    void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) {
-        detail::philox4x32x10_impl::skip_ahead(this->state_, num_to_skip.size(),
-                                               num_to_skip.begin());
-    }
-
-    engine_state<oneapi::mkl::rng::device::philox4x32x10<VecSize>> state_;
-};
-
-} // namespace detail
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_PHILOX4X32X10_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/poisson_impl.hpp b/include/oneapi/mkl/rng/device/detail/poisson_impl.hpp
deleted file mode 100644
index 9fa9b26ec..000000000
--- a/include/oneapi/mkl/rng/device/detail/poisson_impl.hpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_POISSON_IMPL_HPP_
-#define _MKL_RNG_DEVICE_POISSON_IMPL_HPP_
-
-#include <limits>
-
-namespace oneapi::mkl::rng::device::detail {
-
-// Implementation of Poisson distribution uses 3 methods depending on lambda parameter:
-//    - table-lookup method [1] for small lambdas (lambda < 60)
-//    - Devroye's method [2] for medium lambdas (60 <= lambda < 1000)
-//    - Gaussian approximation [1] for huge lambdas (lambda >= 1000)
-//
-// References:
-// [1] Michael B. Giles // Algorithm 955: approximation of the inverse Poisson cumulative
-// distribution function
-// [2] Devroye, L. Non-Uniform Random Variates Generation. Springer-Verlag,
-// New York, 1986, Ch. X, Sects. 3.3 & 3.4 + Errata
-
-#define RNG_POISSON_LAMBDA_HUGE_BOUND 1000.0
-#define RNG_POISSON_LAMBDA_LOW_BOUND  60.0
-#define RNG_POISSON_N_PRECOMPUTED_CDF 32
-
-struct poisson_parameters {
-    void set_lambda(double lambda) {
-        if (lambda >= RNG_POISSON_LAMBDA_HUGE_BOUND) {
-            sqrt_lambda_ = sycl::sqrt(lambda);
-        }
-        else if (lambda >= RNG_POISSON_LAMBDA_LOW_BOUND) {
-            floored_lambda_ = sycl::floor(lambda);
-            log_lambda_ = sycl::log(lambda);
-            lgamma_floored_lambda_ = sycl::lgamma(floored_lambda_ + 1.0);
-            sqrt_floored_lambda_ = sycl::sqrt(floored_lambda_);
-            dx_ = sycl::sqrt(2.0 * floored_lambda_ * sycl::log(32.0 * floored_lambda_ / pi_4_));
-            delta_ = sycl::round((sycl::max)(6.0, (sycl::min)(floored_lambda_, dx_)));
-            dpdfl_ = delta_ + 2.0 * floored_lambda_;
-            sqrt_half_dpdfl_ = sycl::sqrt(dpdfl_ / 2.0);
-            inv_dpdfl_ = 1.0 / dpdfl_;
-            c2_add_coeff_ = sycl::sqrt(pi_4_ * dpdfl_) * sycl::exp(inv_dpdfl_);
-            c_add_coeff_ =
-                2.0 * dpdfl_ * sycl::exp(-delta_ * inv_dpdfl_ * (1.0 + delta_ / 2.0)) / delta_;
-            c1_ = sqrt_floored_lambda_ * spi_2_;
-            c2_ = c2_add_coeff_ + c1_;
-            c3_ = c2_ + 1.0;
-            c4_ = c2_ + 2.0;
-            c5_ = c4_ + exp_one_by_78;
-            c_ = c5_ + c_add_coeff_;
-        }
-        else {
-            prob[0] = sycl::exp(-lambda);
-            double tmp = prob[0];
-            for (int i = 1; i < RNG_POISSON_N_PRECOMPUTED_CDF; ++i) {
-                tmp *= lambda / (double)i;
-                prob[i] = prob[i - 1] + tmp;
-            }
-        }
-    }
-
-    poisson_parameters& operator=(const poisson_parameters& other) {
-        if (this == &other) {
-            return *this;
-        }
-        for (int i = 0; i < RNG_POISSON_N_PRECOMPUTED_CDF; i++) {
-            prob[i] = other.prob[i];
-        }
-        floored_lambda_ = other.floored_lambda_;
-        log_lambda_ = other.log_lambda_;
-        lgamma_floored_lambda_ = other.lgamma_floored_lambda_;
-        sqrt_lambda_ = other.sqrt_lambda_;
-        sqrt_floored_lambda_ = other.sqrt_floored_lambda_;
-        dx_ = other.dx_;
-        delta_ = other.delta_;
-        dpdfl_ = other.dpdfl_;
-        sqrt_half_dpdfl_ = other.sqrt_half_dpdfl_;
-        inv_dpdfl_ = other.inv_dpdfl_;
-        c2_add_coeff_ = other.c2_add_coeff_;
-        c_add_coeff_ = other.c_add_coeff_;
-        c1_ = other.c1_;
-        c2_ = other.c2_;
-        c3_ = other.c3_;
-        c4_ = other.c4_;
-        c5_ = other.c5_;
-        c_ = other.c_;
-        return *this;
-    }
-    double prob[RNG_POISSON_N_PRECOMPUTED_CDF];
-    double floored_lambda_ = 0.0;
-    double log_lambda_ = 0.0;
-    double lgamma_floored_lambda_ = 0.0;
-    double sqrt_lambda_ = 0.0;
-    double sqrt_floored_lambda_ = 0.0;
-    double dx_ = 0.0;
-    double delta_ = 0.0;
-    double dpdfl_ = 0.0;
-    double sqrt_half_dpdfl_ = 0.0;
-    double inv_dpdfl_ = 0.0;
-    double c2_add_coeff_ = 0.0;
-    double c_add_coeff_ = 0.0;
-    double c1_ = 0.0;
-    double c2_ = 0.0;
-    double c3_ = 0.0;
-    double c4_ = 0.0;
-    double c5_ = 0.0;
-    double c_ = 0.0;
-    const double exp_one_by_78 = 1.0129030479320018583185514777512983L;
-    const double pi_4_ = 0.7853981633974483096156608458198757L;
-    const double spi_2_ = 1.2533141373155002512078826424055226L;
-};
-
-template <typename IntType>
-class distribution_base<oneapi::mkl::rng::device::poisson<IntType, poisson_method::devroye>> {
-public:
-    struct param_type {
-        param_type(double lambda) : lambda_(lambda) {}
-        double lambda_;
-    };
-
-    distribution_base(double lambda) : lambda_(lambda) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (lambda_ <= 0.0) {
-            throw oneapi::mkl::invalid_argument("rng", "poisson", "lambda <= 0");
-        }
-#endif
-        params_.set_lambda(lambda_);
-    }
-
-    double lambda() const {
-        return lambda_;
-    }
-
-    param_type param() const {
-        return param_type(lambda_);
-    }
-
-    void param(const param_type& pt) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (pt.lambda_ <= 0.0) {
-            throw oneapi::mkl::invalid_argument("rng", "poisson", "lambda <= 0");
-        }
-#endif
-        lambda_ = pt.lambda_;
-        params_.set_lambda(lambda_);
-    }
-
-protected:
-    IntType get_one_num_small_lambdas(double uniform_var) {
-        IntType res = 0;
-        if (uniform_var < params_.prob[0]) {
-            return res;
-        }
-        else {
-            for (res = 1; res < RNG_POISSON_N_PRECOMPUTED_CDF; ++res) {
-                if (uniform_var < params_.prob[res]) {
-                    return res;
-                }
-            }
-            // in case uniform_var is still bigger than CDF[31] compute additional CDF coefficients
-            double prob_less_than_k = params_.prob[--res];
-            double prob_that_k = prob_less_than_k - params_.prob[res - 1];
-            do {
-                prob_that_k *= lambda_ / (double)(res++ + 1);
-                prob_less_than_k += prob_that_k;
-            } while (uniform_var >= prob_less_than_k);
-
-            return res;
-        }
-    }
-    template <typename EngineType>
-    IntType get_one_num_med_lambdas(EngineType& engine) {
-        const double rounding_coeff = (1.0 - std::numeric_limits<double>::epsilon()) / 2.0;
-        const double max_inttype_val = (std::numeric_limits<IntType>::max)() + rounding_coeff;
-        double res_;
-        bool rejection_flag = true;
-        do {
-            const double uniform_var = params_.c_ * engine.generate_single(0.0, 1.0);
-            const double exponential_var = exponential_.generate_single(engine);
-            double w = 0.0;
-            if (uniform_var <= params_.c1_) {
-                const double gaussian_var = gaussian_.generate_single(engine);
-                const double y = -sycl::fabs(gaussian_var) * params_.sqrt_floored_lambda_ - 1.0;
-                res_ = sycl::floor(y);
-                w = -gaussian_var * gaussian_var / 2.0;
-                if (res_ < -params_.floored_lambda_)
-                    continue;
-            }
-            else if (uniform_var <= params_.c2_) {
-                const double gaussian_var = gaussian_.generate_single(engine);
-                const double y = 1.0 + sycl::fabs(gaussian_var) * params_.sqrt_half_dpdfl_;
-                res_ = sycl::ceil(y);
-                w = y * (2.0 - y) * params_.inv_dpdfl_;
-                if (res_ > params_.delta_)
-                    continue;
-            }
-            else if (uniform_var <= params_.c3_)
-                res_ = -1.0;
-            else if (uniform_var <= params_.c4_)
-                res_ = 0.0;
-            else if (uniform_var <= params_.c5_)
-                res_ = 1.0;
-            else {
-                const double exponential_var_1 = exponential_.generate_single(engine);
-                const double y =
-                    params_.delta_ + exponential_var_1 * 2.0 * params_.dpdfl_ / params_.delta_;
-                res_ = sycl::ceil(y);
-                w = -params_.delta_ * params_.inv_dpdfl_ * (1.0 + y / 2.0);
-            }
-
-            rejection_flag = ((w - exponential_var - res_ * params_.log_lambda_) >
-                              (params_.lgamma_floored_lambda_ -
-                               sycl::lgamma(res_ + params_.floored_lambda_ + 1.0)));
-
-            rejection_flag |= (res_ + params_.floored_lambda_) >= max_inttype_val;
-
-        } while (rejection_flag);
-
-        return ((IntType)(res_ + params_.floored_lambda_ + rounding_coeff));
-    }
-
-    template <typename EngineType>
-    auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, IntType,
-                                  sycl::vec<IntType, EngineType::vec_size>>::type {
-        using OutType = typename std::conditional<EngineType::vec_size == 1, IntType,
-                                                  sycl::vec<IntType, EngineType::vec_size>>::type;
-        OutType res;
-        if constexpr (EngineType::vec_size == 1) {
-            res = 0;
-            if (lambda_ < RNG_POISSON_LAMBDA_LOW_BOUND) {
-                double uniform_var = engine.generate(0.0, 1.0);
-                return get_one_num_small_lambdas(uniform_var);
-            }
-            else if (lambda_ < RNG_POISSON_LAMBDA_HUGE_BOUND) {
-                const double rounding_coeff = (1.0 - std::numeric_limits<double>::epsilon()) / 2.0;
-                const double max_inttype_val =
-                    (std::numeric_limits<IntType>::max)() + rounding_coeff;
-                double res_;
-                bool rejection_flag = true;
-                do {
-                    const double uniform_var = params_.c_ * engine.generate(0.0, 1.0);
-                    const double exponential_var = exponential_.generate(engine);
-                    double w = 0.0;
-                    if (uniform_var <= params_.c1_) {
-                        const double gaussian_var = gaussian_.generate(engine);
-                        const double y =
-                            -sycl::fabs(gaussian_var) * params_.sqrt_floored_lambda_ - 1.0;
-                        res_ = sycl::floor(y);
-                        w = -gaussian_var * gaussian_var / 2.0;
-                        if (res_ < -params_.floored_lambda_)
-                            continue;
-                    }
-                    else if (uniform_var <= params_.c2_) {
-                        const double gaussian_var = gaussian_.generate(engine);
-                        const double y = 1.0 + sycl::fabs(gaussian_var) * params_.sqrt_half_dpdfl_;
-                        res_ = sycl::ceil(y);
-                        w = y * (2.0 - y) * params_.inv_dpdfl_;
-                        if (res_ > params_.delta_)
-                            continue;
-                    }
-                    else if (uniform_var <= params_.c3_)
-                        res_ = -1.0;
-                    else if (uniform_var <= params_.c4_)
-                        res_ = 0.0;
-                    else if (uniform_var <= params_.c5_)
-                        res_ = 1.0;
-                    else {
-                        const double exponential_var_1 = exponential_.generate(engine);
-                        const double y = params_.delta_ +
-                                         exponential_var_1 * 2.0 * params_.dpdfl_ / params_.delta_;
-                        res_ = sycl::ceil(y);
-                        w = -params_.delta_ * params_.inv_dpdfl_ * (1.0 + y / 2.0);
-                    }
-
-                    rejection_flag = ((w - exponential_var - res_ * params_.log_lambda_) >
-                                      (params_.lgamma_floored_lambda_ -
-                                       sycl::lgamma(res_ + params_.floored_lambda_ + 1.0)));
-
-                    rejection_flag |= (res_ + params_.floored_lambda_) >= max_inttype_val;
-
-                } while (rejection_flag);
-
-                return ((IntType)(res_ + params_.floored_lambda_ + rounding_coeff));
-            }
-            else {
-                res = static_cast<IntType>(lambda_ +
-                                           params_.sqrt_lambda_ * gaussian_.generate(engine));
-            }
-        }
-        else {
-            if (lambda_ < RNG_POISSON_LAMBDA_LOW_BOUND) {
-                auto uniform_var = engine.generate(0.0, 1.0);
-                for (int i = 0; i < EngineType::vec_size; ++i) {
-                    res[i] = get_one_num_small_lambdas(uniform_var[i]);
-                }
-                return res;
-            }
-            else if (lambda_ < RNG_POISSON_LAMBDA_HUGE_BOUND) {
-                for (int i = 0; i < EngineType::vec_size; ++i) {
-                    res[i] = get_one_num_med_lambdas(engine);
-                }
-                return res;
-            }
-            else {
-                sycl::vec<double, EngineType::vec_size> res_fp =
-                    lambda_ + params_.sqrt_lambda_ * gaussian_.generate(engine);
-                res_fp = sycl::floor(res_fp);
-                res = res_fp.template convert<IntType>();
-            }
-        }
-        return res;
-    }
-
-    template <typename EngineType>
-    IntType generate_single(EngineType& engine) {
-        IntType res = 0;
-        if (lambda_ < RNG_POISSON_LAMBDA_LOW_BOUND) {
-            double uniform_var = engine.generate_single(0.0, 1.0);
-            return get_one_num_small_lambdas(uniform_var);
-        }
-        else if (lambda_ < RNG_POISSON_LAMBDA_HUGE_BOUND) {
-            return get_one_num_med_lambdas(engine);
-        }
-        else {
-            res = static_cast<IntType>(lambda_ +
-                                       params_.sqrt_lambda_ * gaussian_.generate_single(engine));
-        }
-        return res;
-    }
-
-    distribution_base<oneapi::mkl::rng::device::gaussian<double>> gaussian_ = { 0.0, 1.0 };
-    distribution_base<oneapi::mkl::rng::device::exponential<double>> exponential_ = { 0.0, 1.0 };
-    poisson_parameters params_;
-    double lambda_;
-};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_POISSON_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/uniform_bits_impl.hpp b/include/oneapi/mkl/rng/device/detail/uniform_bits_impl.hpp
deleted file mode 100644
index cd3cd2eed..000000000
--- a/include/oneapi/mkl/rng/device/detail/uniform_bits_impl.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_UNIFORM_BITS_IMPL_HPP_
-#define _MKL_RNG_DEVICE_UNIFORM_BITS_IMPL_HPP_
-
-#include "engine_base.hpp"
-
-namespace oneapi::mkl::rng::device::detail {
-
-template <typename UIntType>
-class distribution_base<oneapi::mkl::rng::device::uniform_bits<UIntType>> {
-protected:
-    template <typename EngineType>
-    auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, UIntType,
-                                  sycl::vec<UIntType, EngineType::vec_size>>::type {
-        static_assert(std::is_same<EngineType, philox4x32x10<EngineType::vec_size>>::value ||
-                          std::is_same<EngineType, mcg59<EngineType::vec_size>>::value,
-                      "oneMKL: uniform_bits works only with philox4x32x10/mcg59 engines");
-        return engine.template generate_uniform_bits<UIntType>();
-    }
-
-    template <typename EngineType>
-    UIntType generate_single(EngineType& engine) {
-        static_assert(std::is_same<EngineType, philox4x32x10<EngineType::vec_size>>::value ||
-                          std::is_same<EngineType, mcg59<EngineType::vec_size>>::value,
-                      "oneMKL: uniform_bits works only with philox4x32x10/mcg59 engines");
-        return engine.template generate_single_uniform_bits<UIntType>();
-    }
-};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_UNIFORM_BITS_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/uniform_impl.hpp b/include/oneapi/mkl/rng/device/detail/uniform_impl.hpp
deleted file mode 100644
index bdd7f79d7..000000000
--- a/include/oneapi/mkl/rng/device/detail/uniform_impl.hpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_UNIFORM_IMPL_HPP_
-#define _MKL_RNG_DEVICE_UNIFORM_IMPL_HPP_
-
-namespace oneapi::mkl::rng::device::detail {
-
-template <typename Type, typename Method>
-class distribution_base<oneapi::mkl::rng::device::uniform<Type, Method>> {
-public:
-    struct param_type {
-        param_type(Type a, Type b) : a_(a), b_(b) {}
-        Type a_;
-        Type b_;
-    };
-
-    distribution_base(Type a, Type b) : a_(a), b_(b) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (a >= b) {
-            throw oneapi::mkl::invalid_argument("rng", "uniform", "a >= b");
-        }
-#endif
-    }
-
-    Type a() const {
-        return a_;
-    }
-
-    Type b() const {
-        return b_;
-    }
-
-    param_type param() const {
-        return param_type(a_, b_);
-    }
-
-    void param(const param_type& pt) {
-#ifndef __SYCL_DEVICE_ONLY__
-        if (pt.a_ >= pt.b_) {
-            throw oneapi::mkl::invalid_argument("rng", "uniform", "a >= b");
-        }
-#endif
-        a_ = pt.a_;
-        b_ = pt.b_;
-    }
-
-protected:
-    template <typename EngineType>
-    auto generate(EngineType& engine) ->
-        typename std::conditional<EngineType::vec_size == 1, Type,
-                                  sycl::vec<Type, EngineType::vec_size>>::type {
-        using OutType = typename std::conditional<EngineType::vec_size == 1, Type,
-                                                  sycl::vec<Type, EngineType::vec_size>>::type;
-        using FpType =
-            typename std::conditional<std::is_same<Method, uniform_method::accurate>::value, double,
-                                      float>::type;
-        OutType res;
-        if constexpr (std::is_integral<Type>::value) {
-            if constexpr (EngineType::vec_size == 1) {
-                FpType res_fp = engine.generate(static_cast<FpType>(a_), static_cast<FpType>(b_));
-                res_fp = sycl::floor(res_fp);
-                res = static_cast<Type>(res_fp);
-                return res;
-            }
-            else {
-                sycl::vec<FpType, EngineType::vec_size> res_fp;
-                res_fp = engine.generate(static_cast<FpType>(a_), static_cast<FpType>(b_));
-                res_fp = sycl::floor(res_fp);
-                res = res_fp.template convert<Type>();
-                return res;
-            }
-        }
-        else {
-            res = engine.generate(a_, b_);
-            if constexpr (std::is_same<Method, uniform_method::accurate>::value) {
-                res = sycl::fmax(res, OutType{ a_ });
-                res = sycl::fmin(res, OutType{ b_ });
-            }
-        }
-
-        return res;
-    }
-
-    template <typename EngineType>
-    Type generate_single(EngineType& engine) {
-        using FpType =
-            typename std::conditional<std::is_same<Method, uniform_method::accurate>::value, double,
-                                      float>::type;
-        Type res;
-        if constexpr (std::is_integral<Type>::value) {
-            FpType res_fp =
-                engine.generate_single(static_cast<FpType>(a_), static_cast<FpType>(b_));
-            res_fp = sycl::floor(res_fp);
-            res = static_cast<Type>(res_fp);
-            return res;
-        }
-        else {
-            res = engine.generate_single(a_, b_);
-            if constexpr (std::is_same<Method, uniform_method::accurate>::value) {
-                res = sycl::fmax(res, a_);
-                res = sycl::fmin(res, b_);
-            }
-        }
-
-        return res;
-    }
-
-    Type a_;
-    Type b_;
-};
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_UNIFORM_IMPL_HPP_
diff --git a/include/oneapi/mkl/rng/device/detail/vm_wrappers.hpp b/include/oneapi/mkl/rng/device/detail/vm_wrappers.hpp
deleted file mode 100644
index ec070c92c..000000000
--- a/include/oneapi/mkl/rng/device/detail/vm_wrappers.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_VM_WRAPPERS_HPP_
-#define _MKL_RNG_DEVICE_VM_WRAPPERS_HPP_
-
-#include <cmath>
-
-namespace oneapi::mkl::rng::device::detail {
-
-template <typename DataType>
-static inline DataType sqrt_wrapper(DataType a) {
-    return sycl::sqrt(a);
-}
-
-template <typename DataType>
-static inline DataType sinpi_wrapper(DataType a) {
-    return sycl::sinpi(a);
-}
-
-template <typename DataType>
-static inline DataType cospi_wrapper(DataType a) {
-    return sycl::cospi(a);
-}
-
-template <typename DataType>
-static inline DataType sincospi_wrapper(DataType a, DataType& b) {
-    b = sycl::cospi(a);
-    return sycl::sinpi(a);
-}
-
-template <typename DataType>
-static inline DataType ln_wrapper(DataType a) {
-    if (a == DataType(0)) {
-        if constexpr (std::is_same_v<DataType, double>)
-            return -0x1.74385446D71C3P+9; // ln(0.494065e-323) = -744.440072
-        else
-            return -0x1.9D1DA0P+6f; // ln(0.14012984e-44) = -103.278929
-    }
-    return sycl::log(a);
-}
-
-} // namespace oneapi::mkl::rng::device::detail
-
-#endif // _MKL_RNG_DEVICE_VM_WRAPPERS_HPP_
diff --git a/include/oneapi/mkl/rng/device/distributions.hpp b/include/oneapi/mkl/rng/device/distributions.hpp
deleted file mode 100644
index 21739f7f2..000000000
--- a/include/oneapi/mkl/rng/device/distributions.hpp
+++ /dev/null
@@ -1,480 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_DISTRIBUTIONS_HPP_
-#define _MKL_RNG_DEVICE_DISTRIBUTIONS_HPP_
-
-#include <limits>
-
-#include "oneapi/mkl/rng/device/detail/distribution_base.hpp"
-#include "oneapi/mkl/rng/device/functions.hpp"
-
-namespace oneapi::mkl::rng::device {
-
-// CONTINUOUS AND DISCRETE RANDOM NUMBER DISTRIBUTIONS
-
-// Class template oneapi::mkl::rng::device::uniform
-//
-// Represents continuous and discrete uniform random number distribution
-//
-// Supported types:
-//      float
-//      double
-//      std::int32_t
-//      std::uint32_t
-//
-// Supported methods:
-//      oneapi::mkl::rng::device::uniform_method::standard
-//      oneapi::mkl::rng::device::uniform_method::accurate
-//
-// Input arguments:
-//      a - left bound. 0.0 by default
-//      b - right bound. 1.0 by default (for std::(u)int32_t std::numeric_limits<std::int32_t>::max()
-//          is used for accurate method and 2^23 is used for standard method)
-//
-// Note: using (un)signed integer uniform distribution with uniform_method::standard method may
-// cause incorrect statistics of the produced random numbers (due to rounding error) if
-// (abs(b - a) > 2^23) || (abs(b) > 2^23) || (abs(a) > 2^23)
-// Please use uniform_method::accurate method instead
-//
-template <typename Type, typename Method>
-class uniform : detail::distribution_base<uniform<Type, Method>> {
-public:
-    static_assert(std::is_same<Method, uniform_method::standard>::value ||
-                      std::is_same<Method, uniform_method::accurate>::value,
-                  "oneMKL: rng/uniform: method is incorrect");
-
-    static_assert(std::is_same<Type, float>::value || std::is_same<Type, double>::value ||
-                      std::is_same<Type, std::int32_t>::value ||
-                      std::is_same<Type, std::uint32_t>::value,
-                  "oneMKL: rng/uniform: type is not supported");
-
-    using method_type = Method;
-    using result_type = Type;
-    using param_type = typename detail::distribution_base<uniform<Type, Method>>::param_type;
-
-    uniform()
-            : detail::distribution_base<uniform<Type, Method>>(
-                  static_cast<Type>(0.0),
-                  std::is_integral<Type>::value
-                      ? (std::is_same<Method, uniform_method::standard>::value
-                             ? (1 << 23)
-                             : (std::numeric_limits<Type>::max)())
-                      : static_cast<Type>(1.0)) {}
-
-    explicit uniform(Type a, Type b) : detail::distribution_base<uniform<Type, Method>>(a, b) {}
-    explicit uniform(const param_type& pt)
-            : detail::distribution_base<uniform<Type, Method>>(pt.a_, pt.b_) {}
-
-    Type a() const {
-        return detail::distribution_base<uniform<Type, Method>>::a();
-    }
-
-    Type b() const {
-        return detail::distribution_base<uniform<Type, Method>>::b();
-    }
-
-    param_type param() const {
-        return detail::distribution_base<uniform<Type, Method>>::param();
-    }
-
-    void param(const param_type& pt) {
-        detail::distribution_base<uniform<Type, Method>>::param(pt);
-    }
-
-private:
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-// Class template oneapi::mkl::rng::device::gaussian
-//
-// Represents continuous normal random number distribution
-//
-// Supported types:
-//      float
-//      double
-//
-// Supported methods:
-//      oneapi::mkl::rng::device::gaussian_method::box_muller2
-//      oneapi::mkl::rng::device::gaussian_method::icdf
-//
-// Input arguments:
-//      mean   - mean. 0 by default
-//      stddev - standard deviation. 1.0 by default
-//
-template <typename RealType, typename Method>
-class gaussian : detail::distribution_base<gaussian<RealType, Method>> {
-public:
-    static_assert(std::is_same<Method, gaussian_method::box_muller2>::value
-#if MKL_RNG_USE_BINARY_CODE
-                      || std::is_same<Method, gaussian_method::icdf>::value
-#endif
-                  ,
-                  "oneMKL: rng/gaussian: method is incorrect");
-#if !MKL_RNG_USE_BINARY_CODE
-    static_assert(!std::is_same<Method, gaussian_method::icdf>::value, "icdf method not supported");
-#endif
-    static_assert(std::is_same<RealType, float>::value || std::is_same<RealType, double>::value,
-                  "oneMKL: rng/gaussian: type is not supported");
-
-    using method_type = Method;
-    using result_type = RealType;
-    using param_type = typename detail::distribution_base<gaussian<RealType, Method>>::param_type;
-
-    gaussian()
-            : detail::distribution_base<gaussian<RealType, Method>>(static_cast<RealType>(0.0),
-                                                                    static_cast<RealType>(1.0)) {}
-
-    explicit gaussian(RealType mean, RealType stddev)
-            : detail::distribution_base<gaussian<RealType, Method>>(mean, stddev) {}
-    explicit gaussian(const param_type& pt)
-            : detail::distribution_base<gaussian<RealType, Method>>(pt.mean_, pt.stddev_) {}
-
-    RealType mean() const {
-        return detail::distribution_base<gaussian<RealType, Method>>::mean();
-    }
-
-    RealType stddev() const {
-        return detail::distribution_base<gaussian<RealType, Method>>::stddev();
-    }
-
-    param_type param() const {
-        return detail::distribution_base<gaussian<RealType, Method>>::param();
-    }
-
-    void param(const param_type& pt) {
-        detail::distribution_base<gaussian<RealType, Method>>::param(pt);
-    }
-
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-// Class template oneapi::mkl::rng::device::lognormal
-//
-// Represents continuous lognormal random number distribution
-//
-// Supported types:
-//      float
-//      double
-//
-// Supported methods:
-//      oneapi::mkl::rng::device::lognormal_method::box_muller2
-//
-// Input arguments:
-//      m     - mean of the subject normal distribution. 0.0 by default
-//      s     - standard deviation of the subject normal distribution. 1.0 by default
-//      displ - displacement. 0.0 by default
-//      scale - scalefactor. 1.0 by default
-//
-template <typename RealType, typename Method>
-class lognormal : detail::distribution_base<lognormal<RealType, Method>> {
-public:
-    static_assert(std::is_same<Method, lognormal_method::box_muller2>::value,
-                  "oneMKL: rng/lognormal: method is incorrect");
-
-    static_assert(std::is_same<RealType, float>::value || std::is_same<RealType, double>::value,
-                  "oneMKL: rng/lognormal: type is not supported");
-
-    using method_type = Method;
-    using result_type = RealType;
-    using param_type = typename detail::distribution_base<lognormal<RealType, Method>>::param_type;
-
-    lognormal()
-            : detail::distribution_base<lognormal<RealType, Method>>(
-                  static_cast<RealType>(0.0), static_cast<RealType>(1.0),
-                  static_cast<RealType>(0.0), static_cast<RealType>(1.0)) {}
-
-    explicit lognormal(RealType m, RealType s, RealType displ = static_cast<RealType>(0.0),
-                       RealType scale = static_cast<RealType>(1.0))
-            : detail::distribution_base<lognormal<RealType, Method>>(m, s, displ, scale) {}
-    explicit lognormal(const param_type& pt)
-            : detail::distribution_base<lognormal<RealType, Method>>(pt.m_, pt.s_, pt.displ_,
-                                                                     pt.scale_) {}
-
-    RealType m() const {
-        return detail::distribution_base<lognormal<RealType, Method>>::m();
-    }
-
-    RealType s() const {
-        return detail::distribution_base<lognormal<RealType, Method>>::s();
-    }
-
-    RealType displ() const {
-        return detail::distribution_base<lognormal<RealType, Method>>::displ();
-    }
-
-    RealType scale() const {
-        return detail::distribution_base<lognormal<RealType, Method>>::scale();
-    }
-
-    param_type param() const {
-        return detail::distribution_base<lognormal<RealType, Method>>::param();
-    }
-
-    void param(const param_type& pt) {
-        detail::distribution_base<lognormal<RealType, Method>>::param(pt);
-    }
-
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-// Class template oneapi::mkl::rng::device::uniform_bits
-//
-// Represents discrete uniform bits random number distribution
-//
-// Supported types:
-//      std::uint32_t
-//      std::uint64_t
-//
-template <typename UIntType>
-class uniform_bits : detail::distribution_base<uniform_bits<UIntType>> {
-public:
-    static_assert(std::is_same<UIntType, std::uint32_t>::value ||
-                      std::is_same<UIntType, std::uint64_t>::value,
-                  "oneMKL: rng/uniform_bits: type is not supported");
-    using result_type = UIntType;
-
-private:
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-// Class template oneapi::mkl::rng::device::bits
-//
-// Represents bits of underlying random number engine
-//
-// Supported types:
-//      std::uint32_t for philox4x32x10, mrg32k3a and mcg31m1
-//      std::uint64_t for mcg59 only
-//
-template <typename UIntType>
-class bits : detail::distribution_base<bits<UIntType>> {
-public:
-    static_assert(std::is_same<UIntType, std::uint32_t>::value ||
-                      std::is_same<UIntType, std::uint64_t>::value,
-                  "oneMKL: rng/bits: type is not supported");
-    using result_type = UIntType;
-
-private:
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-// Class template oneapi::mkl::rng::device::exponential
-//
-// Represents continuous exponential random number distribution
-//
-// Supported types:
-//      float
-//      double
-//
-// Supported methods:
-//      oneapi::mkl::rng::device::exponential_method::icdf
-//      oneapi::mkl::rng::device::exponential_method::icdf_accurate
-//
-// Input arguments:
-//      displ - displacement. 0.0 by default
-//      scale - scalefactor. 1.0 by default
-//
-template <typename RealType, typename Method>
-class exponential : detail::distribution_base<exponential<RealType, Method>> {
-public:
-    static_assert(std::is_same<Method, exponential_method::icdf>::value ||
-                      std::is_same<Method, exponential_method::icdf_accurate>::value,
-                  "oneMKL: rng/exponential: method is incorrect");
-
-    static_assert(std::is_same<RealType, float>::value || std::is_same<RealType, double>::value,
-                  "oneMKL: rng/exponential: type is not supported");
-
-    using method_type = Method;
-    using result_type = RealType;
-    using param_type =
-        typename detail::distribution_base<exponential<RealType, Method>>::param_type;
-
-    exponential()
-            : detail::distribution_base<exponential<RealType, Method>>(
-                  static_cast<RealType>(0.0), static_cast<RealType>(1.0)) {}
-
-    explicit exponential(RealType a, RealType beta)
-            : detail::distribution_base<exponential<RealType, Method>>(a, beta) {}
-
-    explicit exponential(const param_type& pt)
-            : detail::distribution_base<exponential<RealType, Method>>(pt.a_, pt.beta_) {}
-
-    RealType a() const {
-        return detail::distribution_base<exponential<RealType, Method>>::a();
-    }
-
-    RealType beta() const {
-        return detail::distribution_base<exponential<RealType, Method>>::beta();
-    }
-
-    param_type param() const {
-        return detail::distribution_base<exponential<RealType, Method>>::param();
-    }
-
-    void param(const param_type& pt) {
-        detail::distribution_base<exponential<RealType, Method>>::param(pt);
-    }
-
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-// Class template oneapi::mkl::rng::device::poisson
-//
-// Represents discrete poisson random number distribution
-//
-// Supported types:
-//      std::int32_t
-//      std::uint32_t
-//
-// Supported methods:
-//      oneapi::mkl::rng::device::poisson_method::devroye
-//
-// Input arguments:
-//      lambda - mean value. 1.0 by default
-//
-template <typename IntType, typename Method>
-class poisson : detail::distribution_base<poisson<IntType, Method>> {
-public:
-    static_assert(std::is_same<Method, poisson_method::devroye>::value,
-                  "oneMKL: rng/poisson: method is incorrect");
-
-    static_assert(std::is_same<IntType, std::int32_t>::value ||
-                      std::is_same<IntType, std::uint32_t>::value,
-                  "oneMKL: rng/poisson: type is not supported");
-
-    using method_type = Method;
-    using result_type = IntType;
-    using param_type = typename detail::distribution_base<poisson<IntType, Method>>::param_type;
-
-    poisson() : detail::distribution_base<poisson<IntType, Method>>(0.5) {}
-
-    explicit poisson(double lambda) : detail::distribution_base<poisson<IntType, Method>>(lambda) {}
-    explicit poisson(const param_type& pt)
-            : detail::distribution_base<poisson<IntType, Method>>(pt.lambda_) {}
-
-    double lambda() const {
-        return detail::distribution_base<poisson<IntType, Method>>::lambda();
-    }
-
-    param_type param() const {
-        return detail::distribution_base<poisson<IntType, Method>>::param();
-    }
-
-    void param(const param_type& pt) {
-        detail::distribution_base<poisson<IntType, Method>>::param(pt);
-    }
-
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-// Class template oneapi::mkl::rng::device::bernoulli
-//
-// Represents discrete Bernoulli random number distribution
-//
-// Supported types:
-//      std::uint32_t
-//      std::int32_t
-//
-// Supported methods:
-//      oneapi::mkl::rng::bernoulli_method::icdf;
-//
-// Input arguments:
-//      p - success probablity of a trial. 0.5 by default
-//
-template <typename IntType, typename Method>
-class bernoulli : detail::distribution_base<bernoulli<IntType, Method>> {
-public:
-    static_assert(std::is_same<Method, bernoulli_method::icdf>::value,
-                  "oneMKL: rng/bernoulli: method is incorrect");
-
-    static_assert(std::is_same<IntType, std::int32_t>::value ||
-                      std::is_same<IntType, std::uint32_t>::value,
-                  "oneMKL: rng/bernoulli: type is not supported");
-
-    using method_type = Method;
-    using result_type = IntType;
-    using param_type = typename detail::distribution_base<bernoulli<IntType, Method>>::param_type;
-
-    bernoulli() : detail::distribution_base<bernoulli<IntType, Method>>(0.5f) {}
-
-    explicit bernoulli(float p) : detail::distribution_base<bernoulli<IntType, Method>>(p) {}
-    explicit bernoulli(const param_type& pt)
-            : detail::distribution_base<bernoulli<IntType, Method>>(pt.p_) {}
-
-    float p() const {
-        return detail::distribution_base<bernoulli<IntType, Method>>::p();
-    }
-
-    param_type param() const {
-        return detail::distribution_base<bernoulli<IntType, Method>>::param();
-    }
-
-    void param(const param_type& pt) {
-        detail::distribution_base<bernoulli<IntType, Method>>::param(pt);
-    }
-
-    template <typename Distr, typename Engine>
-    friend auto generate(Distr& distr, Engine& engine) ->
-        typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                                  sycl::vec<typename Distr::result_type, Engine::vec_size>>::type;
-    template <typename Distr, typename Engine>
-    friend typename Distr::result_type generate_single(Distr& distr, Engine& engine);
-};
-
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_DISTRIBUTIONS_HPP_
diff --git a/include/oneapi/mkl/rng/device/engines.hpp b/include/oneapi/mkl/rng/device/engines.hpp
deleted file mode 100644
index f1bcfd1b0..000000000
--- a/include/oneapi/mkl/rng/device/engines.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_ENGINES_HPP_
-#define _MKL_RNG_DEVICE_ENGINES_HPP_
-
-#include <limits>
-
-#include "oneapi/mkl/rng/device/types.hpp"
-#include "oneapi/mkl/rng/device/functions.hpp"
-#include "oneapi/mkl/rng/device/detail/engine_base.hpp"
-
-namespace oneapi::mkl::rng::device {
-
-// PSEUDO-RANDOM NUMBER DEVICE-SIDE ENGINES
-
-// Class template oneapi::mkl::rng::device::philox4x32x10
-//
-// Represents Philox4x32-10 counter-based pseudorandom number generator
-//
-// Supported parallelization methods:
-//      skip_ahead
-//
-template <std::int32_t VecSize>
-class philox4x32x10 : detail::engine_base<philox4x32x10<VecSize>> {
-public:
-    static constexpr std::uint64_t default_seed = 0;
-
-    static constexpr std::int32_t vec_size = VecSize;
-
-    philox4x32x10() : detail::engine_base<philox4x32x10<VecSize>>(default_seed) {}
-
-    philox4x32x10(std::uint64_t seed, std::uint64_t offset = 0)
-            : detail::engine_base<philox4x32x10<VecSize>>(seed, offset) {}
-
-    philox4x32x10(std::initializer_list<std::uint64_t> seed, std::uint64_t offset = 0)
-            : detail::engine_base<philox4x32x10<VecSize>>(seed.size(), seed.begin(), offset) {}
-
-    philox4x32x10(std::uint64_t seed, std::initializer_list<std::uint64_t> offset)
-            : detail::engine_base<philox4x32x10<VecSize>>(seed, offset.size(), offset.begin()) {}
-
-    philox4x32x10(std::initializer_list<std::uint64_t> seed,
-                  std::initializer_list<std::uint64_t> offset)
-            : detail::engine_base<philox4x32x10<VecSize>>(seed.size(), seed.begin(), offset.size(),
-                                                          offset.begin()) {}
-
-private:
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::uint64_t num_to_skip);
-
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::initializer_list<std::uint64_t> num_to_skip);
-
-    template <typename DistrType>
-    friend class detail::distribution_base;
-};
-
-// Class oneapi::mkl::rng::device::mrg32k3a
-//
-// Represents the combined recurcive pseudorandom number generator
-//
-// Supported parallelization methods:
-//      skip_ahead
-//
-template <std::int32_t VecSize>
-class mrg32k3a : detail::engine_base<mrg32k3a<VecSize>> {
-public:
-    static constexpr std::uint32_t default_seed = 1;
-
-    static constexpr std::int32_t vec_size = VecSize;
-
-    mrg32k3a() : detail::engine_base<mrg32k3a<VecSize>>(default_seed) {}
-
-    mrg32k3a(std::uint32_t seed, std::uint64_t offset = 0)
-            : detail::engine_base<mrg32k3a<VecSize>>(seed, offset) {}
-
-    mrg32k3a(std::initializer_list<std::uint32_t> seed, std::uint64_t offset = 0)
-            : detail::engine_base<mrg32k3a<VecSize>>(seed.size(), seed.begin(), offset) {}
-
-    mrg32k3a(std::uint32_t seed, std::initializer_list<std::uint64_t> offset)
-            : detail::engine_base<mrg32k3a<VecSize>>(seed, offset.size(), offset.begin()) {}
-
-    mrg32k3a(std::initializer_list<std::uint32_t> seed, std::initializer_list<std::uint64_t> offset)
-            : detail::engine_base<mrg32k3a<VecSize>>(seed.size(), seed.begin(), offset.size(),
-                                                     offset.begin()) {}
-
-private:
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::uint64_t num_to_skip);
-
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::initializer_list<std::uint64_t> num_to_skip);
-
-    template <typename DistrType>
-    friend class detail::distribution_base;
-};
-
-// Class oneapi::mkl::rng::device::mcg31m1
-//
-//
-//
-// Supported parallelization methods:
-//      skip_ahead
-//
-template <std::int32_t VecSize>
-class mcg31m1 : detail::engine_base<mcg31m1<VecSize>> {
-public:
-    static constexpr std::uint32_t default_seed = 1;
-
-    static constexpr std::int32_t vec_size = VecSize;
-
-    mcg31m1() : detail::engine_base<mcg31m1<VecSize>>(default_seed) {}
-
-    mcg31m1(std::uint32_t seed, std::uint64_t offset = 0)
-            : detail::engine_base<mcg31m1<VecSize>>(seed, offset) {}
-
-private:
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::uint64_t num_to_skip);
-
-    template <typename DistrType>
-    friend class detail::distribution_base;
-};
-
-// Class oneapi::mkl::rng::device::mcg59
-//
-//
-//
-// Supported parallelization methods:
-//      skip_ahead
-//
-template <std::int32_t VecSize>
-class mcg59 : detail::engine_base<mcg59<VecSize>> {
-public:
-    static constexpr std::uint32_t default_seed = 1;
-
-    static constexpr std::int32_t vec_size = VecSize;
-
-    mcg59() : detail::engine_base<mcg59<VecSize>>(default_seed) {}
-
-    mcg59(std::uint64_t seed, std::uint64_t offset = 0)
-            : detail::engine_base<mcg59<VecSize>>(seed, offset) {}
-
-private:
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::uint64_t num_to_skip);
-
-    template <typename DistrType>
-    friend class detail::distribution_base;
-};
-
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_ENGINES_HPP_
diff --git a/include/oneapi/mkl/rng/device/functions.hpp b/include/oneapi/mkl/rng/device/functions.hpp
deleted file mode 100644
index d8542b836..000000000
--- a/include/oneapi/mkl/rng/device/functions.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_FUNCTIONS_HPP_
-#define _MKL_RNG_DEVICE_FUNCTIONS_HPP_
-
-#include <sycl/sycl.hpp>
-
-#include "oneapi/mkl/rng/device/detail/distribution_base.hpp"
-
-namespace oneapi::mkl::rng::device {
-
-// GENERATE FUNCTIONS
-
-template <typename Distr, typename Engine>
-auto generate(Distr& distr, Engine& engine) ->
-    typename std::conditional<Engine::vec_size == 1, typename Distr::result_type,
-                              sycl::vec<typename Distr::result_type, Engine::vec_size>>::type {
-    return distr.generate(engine);
-}
-
-// SERVICE FUNCTIONS
-
-template <typename Engine>
-void skip_ahead(Engine& engine, std::uint64_t num_to_skip) {
-    engine.skip_ahead(num_to_skip);
-}
-
-template <typename Engine>
-void skip_ahead(Engine& engine, std::initializer_list<std::uint64_t> num_to_skip) {
-    engine.skip_ahead(num_to_skip);
-}
-
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_FUNCTIONS_HPP_
diff --git a/include/oneapi/mkl/rng/device/types.hpp b/include/oneapi/mkl/rng/device/types.hpp
deleted file mode 100644
index e5f74e25b..000000000
--- a/include/oneapi/mkl/rng/device/types.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_RNG_DEVICE_TYPES_HPP_
-#define _MKL_RNG_DEVICE_TYPES_HPP_
-
-namespace oneapi::mkl::rng::device {
-
-// METHODS FOR DISTRIBUTIONS
-
-namespace uniform_method {
-struct standard {};
-struct accurate {};
-using by_default = standard;
-} // namespace uniform_method
-
-namespace gaussian_method {
-struct box_muller2 {};
-struct icdf {};
-using by_default = box_muller2;
-} // namespace gaussian_method
-
-namespace lognormal_method {
-struct box_muller2 {};
-using by_default = box_muller2;
-} // namespace lognormal_method
-
-namespace exponential_method {
-struct icdf {};
-struct icdf_accurate {};
-using by_default = icdf;
-} // namespace exponential_method
-
-namespace poisson_method {
-struct devroye {};
-using by_default = devroye;
-} // namespace poisson_method
-
-namespace bernoulli_method {
-struct icdf {};
-using by_default = icdf;
-} // namespace bernoulli_method
-
-} // namespace oneapi::mkl::rng::device
-
-#endif // _MKL_RNG_DEVICE_TYPES_HPP_
diff --git a/include/oneapi/mkl/rng/distributions.hpp b/include/oneapi/mkl/rng/distributions.hpp
deleted file mode 100644
index 88d1e46e7..000000000
--- a/include/oneapi/mkl/rng/distributions.hpp
+++ /dev/null
@@ -1,373 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_DISTRIBUTIONS_HPP_
-#define _ONEMKL_RNG_DISTRIBUTIONS_HPP_
-
-#include <cstdint>
-#include <limits>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-
-// Class template oneapi::mkl::rng::uniform
-//
-// Represents continuous and discrete uniform random number distribution
-//
-// Supported types:
-//      float
-//      double
-//      std::int32_t
-//
-// Supported methods:
-//      oneapi::mkl::rng::uniform_method::standard
-//      oneapi::mkl::rng::uniform_method::accurate - for float and double types only
-//
-// Input arguments:
-//      a - left bound. 0.0 by default
-//      b - right bound. 1.0 by default (std::numeric_limits<std::int32_t>::max() for std::int32_t)
-
-namespace uniform_method {
-struct standard {};
-struct accurate {};
-using by_default = standard;
-} // namespace uniform_method
-
-template <typename Type = float, typename Method = uniform_method::by_default>
-class uniform {
-public:
-    static_assert(std::is_same<Method, uniform_method::standard>::value ||
-                      (std::is_same<Method, uniform_method::accurate>::value &&
-                       !std::is_same<Type, std::int32_t>::value),
-                  "rng uniform distribution method is incorrect");
-
-    static_assert(std::is_same<Type, float>::value || std::is_same<Type, double>::value,
-                  "rng uniform distribution type is not supported");
-
-    using method_type = Method;
-    using result_type = Type;
-
-    uniform() : uniform(static_cast<Type>(0.0f), static_cast<Type>(1.0f)) {}
-
-    explicit uniform(Type a, Type b) : a_(a), b_(b) {
-        if (a >= b) {
-            throw oneapi::mkl::invalid_argument("rng", "uniform",
-                                                "parameters are incorrect, a >= b");
-        }
-    }
-
-    Type a() const {
-        return a_;
-    }
-
-    Type b() const {
-        return b_;
-    }
-
-private:
-    Type a_;
-    Type b_;
-};
-
-template <typename Method>
-class uniform<std::int32_t, Method> {
-public:
-    using method_type = Method;
-    using result_type = std::int32_t;
-
-    uniform() : uniform(0, std::numeric_limits<std::int32_t>::max()) {}
-
-    explicit uniform(std::int32_t a, std::int32_t b) : a_(a), b_(b) {
-        if (a >= b) {
-            throw oneapi::mkl::invalid_argument("rng", "uniform",
-                                                "parameters are incorrect, a >= b");
-        }
-    }
-
-    std::int32_t a() const {
-        return a_;
-    }
-
-    std::int32_t b() const {
-        return b_;
-    }
-
-private:
-    std::int32_t a_;
-    std::int32_t b_;
-};
-
-// Class template oneapi::mkl::rng::gaussian
-//
-// Represents continuous normal random number distribution
-//
-// Supported types:
-//      float
-//      double
-//
-// Supported methods:
-//      oneapi::mkl::rng::gaussian_method::box_muller2
-//      oneapi::mkl::rng::gaussian_method::icdf
-//
-// Input arguments:
-//      mean   - mean. 0 by default
-//      stddev - standard deviation. 1.0 by default
-
-namespace gaussian_method {
-struct icdf {};
-struct box_muller2 {};
-using by_default = box_muller2;
-} // namespace gaussian_method
-
-template <typename RealType = float, typename Method = gaussian_method::by_default>
-class gaussian {
-public:
-    static_assert(std::is_same<Method, gaussian_method::icdf>::value ||
-                      std::is_same<Method, gaussian_method::box_muller2>::value,
-                  "rng gaussian distribution method is incorrect");
-
-    static_assert(std::is_same<RealType, float>::value || std::is_same<RealType, double>::value,
-                  "rng gaussian distribution type is not supported");
-
-    using method_type = Method;
-    using result_type = RealType;
-
-    gaussian() : gaussian(static_cast<RealType>(0.0), static_cast<RealType>(1.0)) {}
-
-    explicit gaussian(RealType mean, RealType stddev) : mean_(mean), stddev_(stddev) {
-        if (stddev <= static_cast<RealType>(0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "gaussian",
-                                                "stddev parameter is incorrect, stddev <= 0.0");
-        }
-    }
-
-    RealType mean() const {
-        return mean_;
-    }
-
-    RealType stddev() const {
-        return stddev_;
-    }
-
-private:
-    RealType mean_;
-    RealType stddev_;
-};
-
-// Class template oneapi::mkl::rng::lognormal
-//
-// Represents continuous lognormal random number distribution
-//
-// Supported types:
-//      float
-//      double
-//
-// Supported methods:
-//      oneapi::mkl::rng::lognormal_method::box_muller2
-//      oneapi::mkl::rng::lognormal_method::icdf
-//
-// Input arguments:
-//      m     - mean of the subject normal distribution. 0.0 by default
-//      s     - standard deviation of the subject normal distribution. 1.0 by default
-//      displ - displacement. 0.0 by default
-//      scale - scalefactor. 1.0 by default
-
-namespace lognormal_method {
-struct icdf {};
-struct box_muller2 {};
-using by_default = box_muller2;
-} // namespace lognormal_method
-
-template <typename RealType = float, typename Method = lognormal_method::by_default>
-class lognormal {
-public:
-    static_assert(std::is_same<Method, lognormal_method::box_muller2>::value ||
-                      std::is_same<Method, lognormal_method::icdf>::value,
-                  "rng lognormal distribution method is incorrect");
-
-    static_assert(std::is_same<RealType, float>::value || std::is_same<RealType, double>::value,
-                  "rng lognormal distribution type is not supported");
-
-    using method_type = Method;
-    using result_type = RealType;
-
-    lognormal()
-            : lognormal(static_cast<RealType>(0.0), static_cast<RealType>(1.0),
-                        static_cast<RealType>(0.0), static_cast<RealType>(1.0)) {}
-
-    explicit lognormal(RealType m, RealType s, RealType displ = static_cast<RealType>(0.0),
-                       RealType scale = static_cast<RealType>(1.0))
-            : m_(m),
-              s_(s),
-              displ_(displ),
-              scale_(scale) {
-        if (s <= static_cast<RealType>(0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "lognormal", "s <= 0");
-        }
-        if (scale <= static_cast<RealType>(0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "lognormal", "scale <= 0");
-        }
-    }
-
-    RealType m() const {
-        return m_;
-    }
-
-    RealType s() const {
-        return s_;
-    }
-
-    RealType displ() const {
-        return displ_;
-    }
-
-    RealType scale() const {
-        return scale_;
-    }
-
-private:
-    RealType m_;
-    RealType s_;
-    RealType displ_;
-    RealType scale_;
-};
-
-// Class template oneapi::mkl::rng::bernoulli
-//
-// Represents discrete Bernoulli random number distribution
-//
-// Supported types:
-//      std::uint32_t
-//      std::int32_t
-//
-// Supported methods:
-//      oneapi::mkl::rng::bernoulli_method::icdf;
-//
-// Input arguments:
-//      p - success probability of a trial. 0.5 by default
-
-namespace bernoulli_method {
-struct icdf {};
-using by_default = icdf;
-} // namespace bernoulli_method
-
-template <typename IntType = std::uint32_t, typename Method = bernoulli_method::by_default>
-class bernoulli {
-public:
-    static_assert(std::is_same<Method, bernoulli_method::icdf>::value,
-                  "rng bernoulli method is incorrect");
-
-    static_assert(std::is_same<IntType, std::int32_t>::value ||
-                      std::is_same<IntType, std::uint32_t>::value,
-                  "rng bernoulli type is not supported");
-
-    using method_type = Method;
-    using result_type = IntType;
-
-    bernoulli() : bernoulli(0.5f) {}
-
-    explicit bernoulli(float p) : p_(p) {
-        if ((p > 1.0f) || (p < 0.0f)) {
-            throw oneapi::mkl::invalid_argument("rng", "bernoulli", "p > 1 or p < 0");
-        }
-    }
-
-    float p() const {
-        return p_;
-    }
-
-private:
-    float p_;
-};
-
-// Class template oneapi::mkl::rng::poisson
-//
-// Represents discrete Poisson random number distribution
-//
-// Supported types:
-//      std::int32_t
-//
-// Supported methods:
-//      oneapi::mkl::rng::poisson_method::gaussian_icdf_based
-//
-// Input arguments:
-//      lambda - distribution parameter. 0.5 by default
-
-namespace poisson_method {
-struct gaussian_icdf_based {};
-using by_default = gaussian_icdf_based;
-} // namespace poisson_method
-
-template <typename IntType = std::int32_t, typename Method = poisson_method::by_default>
-class poisson {
-public:
-    static_assert(std::is_same<Method, poisson_method::gaussian_icdf_based>::value,
-                  "rng poisson method is incorrect");
-
-    static_assert(std::is_same<IntType, std::int32_t>::value ||
-                      std::is_same<IntType, std::uint32_t>::value,
-                  "rng poisson type is not supported");
-
-    using method_type = Method;
-    using result_type = IntType;
-
-    poisson() : poisson(0.5) {}
-
-    explicit poisson(double lambda) : lambda_(lambda) {
-        if ((lambda <= 0.0)) {
-            throw oneapi::mkl::invalid_argument("rng", "poisson", "lamdba < 0");
-        }
-    }
-
-    double lambda() const {
-        return lambda_;
-    }
-
-private:
-    double lambda_;
-};
-
-// Class template oneapi::mkl::rng::bits
-//
-// Represents bits of underlying random number engine
-//
-// Supported types:
-//      std::uint32_t
-//
-
-template <typename UIntType = std::uint32_t>
-class bits {
-public:
-    static_assert(std::is_same<UIntType, std::uint32_t>::value, "rng bits type is not supported");
-    using result_type = UIntType;
-};
-
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_DISTRIBUTIONS_HPP_
diff --git a/include/oneapi/mkl/rng/engines.hpp b/include/oneapi/mkl/rng/engines.hpp
deleted file mode 100644
index c28ae2b45..000000000
--- a/include/oneapi/mkl/rng/engines.hpp
+++ /dev/null
@@ -1,243 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_ENGINES_HPP_
-#define _ONEMKL_RNG_ENGINES_HPP_
-
-#include <cstdint>
-#include <limits>
-#include <memory>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-#include "oneapi/mkl/rng/detail/rng_loader.hpp"
-
-#ifdef ENABLE_MKLCPU_BACKEND
-#include "oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp"
-#endif
-#ifdef ENABLE_MKLGPU_BACKEND
-#include "oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp"
-#endif
-#ifdef ENABLE_CURAND_BACKEND
-#include "oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp"
-#endif
-#ifdef ENABLE_ROCRAND_BACKEND
-#include "oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp"
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-
-// Class oneapi::mkl::rng::philox4x32x10
-//
-// Represents Philox4x32-10 counter-based pseudorandom number generator
-//
-// Supported parallelization methods:
-//      skip_ahead
-class philox4x32x10 {
-public:
-    static constexpr std::uint64_t default_seed = 0;
-
-    philox4x32x10(sycl::queue queue, std::uint64_t seed = default_seed)
-            : pimpl_(detail::create_philox4x32x10(get_device_id(queue), queue, seed)) {}
-
-    philox4x32x10(sycl::queue queue, std::initializer_list<std::uint64_t> seed)
-            : pimpl_(detail::create_philox4x32x10(get_device_id(queue), queue, seed)) {}
-
-#ifdef ENABLE_MKLCPU_BACKEND
-    philox4x32x10(backend_selector<backend::mklcpu> selector, std::uint64_t seed = default_seed)
-            : pimpl_(mklcpu::create_philox4x32x10(selector.get_queue(), seed)) {}
-
-    philox4x32x10(backend_selector<backend::mklcpu> selector,
-                  std::initializer_list<std::uint64_t> seed)
-            : pimpl_(mklcpu::create_philox4x32x10(selector.get_queue(), seed)) {}
-#endif
-
-#ifdef ENABLE_MKLGPU_BACKEND
-    philox4x32x10(backend_selector<backend::mklgpu> selector, std::uint64_t seed = default_seed)
-            : pimpl_(mklgpu::create_philox4x32x10(selector.get_queue(), seed)) {}
-
-    philox4x32x10(backend_selector<backend::mklgpu> selector,
-                  std::initializer_list<std::uint64_t> seed)
-            : pimpl_(mklgpu::create_philox4x32x10(selector.get_queue(), seed)) {}
-#endif
-
-#ifdef ENABLE_CURAND_BACKEND
-    philox4x32x10(backend_selector<backend::curand> selector, std::uint64_t seed = default_seed)
-            : pimpl_(curand::create_philox4x32x10(selector.get_queue(), seed)) {}
-
-    philox4x32x10(backend_selector<backend::curand> selector,
-                  std::initializer_list<std::uint64_t> seed)
-            : pimpl_(curand::create_philox4x32x10(selector.get_queue(), seed)) {}
-#endif
-#ifdef ENABLE_ROCRAND_BACKEND
-    philox4x32x10(backend_selector<backend::rocrand> selector, std::uint64_t seed = default_seed)
-            : pimpl_(rocrand::create_philox4x32x10(selector.get_queue(), seed)) {}
-
-    philox4x32x10(backend_selector<backend::rocrand> selector,
-                  std::initializer_list<std::uint64_t> seed)
-            : pimpl_(rocrand::create_philox4x32x10(selector.get_queue(), seed)) {}
-#endif
-
-    philox4x32x10(const philox4x32x10& other) {
-        pimpl_.reset(other.pimpl_.get()->copy_state());
-    }
-
-    philox4x32x10(philox4x32x10&& other) {
-        pimpl_ = std::move(other.pimpl_);
-    }
-
-    philox4x32x10& operator=(const philox4x32x10& other) {
-        if (this == &other)
-            return *this;
-        pimpl_.reset(other.pimpl_.get()->copy_state());
-        return *this;
-    }
-
-    philox4x32x10& operator=(philox4x32x10&& other) {
-        if (this == &other)
-            return *this;
-        pimpl_ = std::move(other.pimpl_);
-        return *this;
-    }
-
-private:
-    std::unique_ptr<detail::engine_impl> pimpl_;
-
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::uint64_t num_to_skip);
-
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::initializer_list<std::uint64_t> num_to_skip);
-
-    template <typename Distr, typename Engine>
-    friend void generate(const Distr& distr, Engine& engine, std::int64_t n,
-                         sycl::buffer<typename Distr::result_type, 1>& r);
-
-    template <typename Distr, typename Engine>
-    friend sycl::event generate(const Distr& distr, Engine& engine, std::int64_t n,
-                                typename Distr::result_type* r,
-                                const std::vector<sycl::event>& dependencies);
-};
-
-// Class oneapi::mkl::rng::mrg32k3a
-//
-// Represents the combined recurcive pseudorandom number generator
-//
-// Supported parallelization methods:
-//      skip_ahead
-class mrg32k3a {
-public:
-    static constexpr std::uint32_t default_seed = 1;
-
-    mrg32k3a(sycl::queue queue, std::uint32_t seed = default_seed)
-            : pimpl_(detail::create_mrg32k3a(get_device_id(queue), queue, seed)) {}
-
-    mrg32k3a(sycl::queue queue, std::initializer_list<std::uint32_t> seed)
-            : pimpl_(detail::create_mrg32k3a(get_device_id(queue), queue, seed)) {}
-
-#ifdef ENABLE_MKLCPU_BACKEND
-    mrg32k3a(backend_selector<backend::mklcpu> selector, std::uint32_t seed = default_seed)
-            : pimpl_(mklcpu::create_mrg32k3a(selector.get_queue(), seed)) {}
-
-    mrg32k3a(backend_selector<backend::mklcpu> selector, std::initializer_list<std::uint32_t> seed)
-            : pimpl_(mklcpu::create_mrg32k3a(selector.get_queue(), seed)) {}
-#endif
-
-#ifdef ENABLE_MKLGPU_BACKEND
-    mrg32k3a(backend_selector<backend::mklgpu> selector, std::uint32_t seed = default_seed)
-            : pimpl_(mklgpu::create_mrg32k3a(selector.get_queue(), seed)) {}
-
-    mrg32k3a(backend_selector<backend::mklgpu> selector, std::initializer_list<std::uint32_t> seed)
-            : pimpl_(mklgpu::create_mrg32k3a(selector.get_queue(), seed)) {}
-#endif
-
-#ifdef ENABLE_CURAND_BACKEND
-    mrg32k3a(backend_selector<backend::curand> selector, std::uint32_t seed = default_seed)
-            : pimpl_(curand::create_mrg32k3a(selector.get_queue(), seed)) {}
-
-    mrg32k3a(backend_selector<backend::curand> selector, std::initializer_list<std::uint32_t> seed)
-            : pimpl_(curand::create_mrg32k3a(selector.get_queue(), seed)) {}
-#endif
-
-#ifdef ENABLE_ROCRAND_BACKEND
-    mrg32k3a(backend_selector<backend::rocrand> selector, std::uint32_t seed = default_seed)
-            : pimpl_(rocrand::create_mrg32k3a(selector.get_queue(), seed)) {}
-
-    mrg32k3a(backend_selector<backend::rocrand> selector, std::initializer_list<std::uint32_t> seed)
-            : pimpl_(rocrand::create_mrg32k3a(selector.get_queue(), seed)) {}
-#endif
-
-    mrg32k3a(const mrg32k3a& other) {
-        pimpl_.reset(other.pimpl_.get()->copy_state());
-    }
-
-    mrg32k3a(mrg32k3a&& other) {
-        pimpl_ = std::move(other.pimpl_);
-    }
-
-    mrg32k3a& operator=(const mrg32k3a& other) {
-        if (this == &other)
-            return *this;
-        pimpl_.reset(other.pimpl_.get()->copy_state());
-        return *this;
-    }
-
-    mrg32k3a& operator=(mrg32k3a&& other) {
-        if (this == &other)
-            return *this;
-        pimpl_ = std::move(other.pimpl_);
-        return *this;
-    }
-
-private:
-    std::unique_ptr<detail::engine_impl> pimpl_;
-
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::uint64_t num_to_skip);
-
-    template <typename Engine>
-    friend void skip_ahead(Engine& engine, std::initializer_list<std::uint64_t> num_to_skip);
-
-    template <typename Distr, typename Engine>
-    friend void generate(const Distr& distr, Engine& engine, std::int64_t n,
-                         sycl::buffer<typename Distr::result_type, 1>& r);
-
-    template <typename Distr, typename Engine>
-    friend sycl::event generate(const Distr& distr, Engine& engine, std::int64_t n,
-                                typename Distr::result_type* r,
-                                const std::vector<sycl::event>& dependencies);
-};
-
-// Default engine to be used for common cases
-using default_engine = philox4x32x10;
-
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_ENGINES_HPP_
diff --git a/include/oneapi/mkl/rng/functions.hpp b/include/oneapi/mkl/rng/functions.hpp
deleted file mode 100644
index 028e13557..000000000
--- a/include/oneapi/mkl/rng/functions.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_FUNCTIONS_HPP_
-#define _ONEMKL_RNG_FUNCTIONS_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/predicates.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-
-// Function oneapi::mkl::rng::generate().Buffer API
-// Provides random numbers from a given engine with a given statistics
-//
-// Input parameters:
-//      const Distr& distr              - distribution object
-//      Engine& engine                   - engine object
-//      std::int64_t n                   - number of random values to be generated
-//
-// Output parameters:
-//      sycl::buffer<typename Distr::result_type, 1>& r - sycl::buffer to the output vector
-template <typename Distr, typename Engine>
-static inline void generate(const Distr& distr, Engine& engine, std::int64_t n,
-                            sycl::buffer<typename Distr::result_type, 1>& r) {
-    generate_precondition(distr, engine, n, r);
-    engine.pimpl_->generate(distr, n, r);
-}
-
-// Function oneapi::mkl::rng::generate(). USM API
-// Provides random numbers from a given engine with a given statistics
-//
-// Input parameters:
-//      const Distr& distr               - distribution object
-//      Engine& engine                   - engine object
-//      std::int64_t n                   - number of random values to be generated
-//      const std::vector<sycl::event>& dependencies - list of events to wait for
-//                  before starting computation, if any. If omitted, defaults to no dependencies
-//
-// Output parameters:
-//      typename Distr::result_type* - pointer to the output vector
-//
-// Returns:
-//      sycl::event - event for the submitted to the engine's queue task
-template <typename Distr, typename Engine>
-static inline sycl::event generate(const Distr& distr, Engine& engine, std::int64_t n,
-                                   typename Distr::result_type* r,
-                                   const std::vector<sycl::event>& dependencies = {}) {
-    generate_precondition(distr, engine, n, r, dependencies);
-    return engine.pimpl_->generate(distr, n, r, dependencies);
-}
-
-//  SERVICE FUNCTIONS
-
-// Function oneapi::mkl::rng::skip_ahead(). Common interface
-//
-// Proceeds state of engine using the skip-ahead method
-//
-// Input parameters:
-//      Engine& engine             - engine object
-//      const std::int64_t num_to_skip - number of skipped elements
-template <typename Engine>
-static inline void skip_ahead(Engine& engine, std::uint64_t num_to_skip) {
-    engine.pimpl_->skip_ahead(num_to_skip);
-}
-
-// Function oneapi::mkl::rng::skip_ahead(). Interface with partitioned number of skipped elements
-//
-// Proceeds state of engine using the skip-ahead method
-//
-// Input parameters:
-//      Engine& engine                               - engine object
-//      std::initializer_list<std::uint64_t> num_to_skip - number of skipped elements
-template <typename Engine>
-static inline void skip_ahead(Engine& engine, std::initializer_list<std::uint64_t> num_to_skip) {
-    engine.pimpl_->skip_ahead(num_to_skip);
-}
-
-// Function oneapi::mkl::rng::leapfrog()
-//
-// Proceeds state of engine using the leapfrog method
-//
-// Input parameters:
-//      Engine& engine  - engine object
-//      std::uint64_t idx    - index of the computational node
-//      std::uint64_t stride - largest number of computational nodes, or stride
-template <typename Engine>
-static inline void leapfrog(Engine& engine, std::uint64_t idx, std::uint64_t stride) {
-    engine.pimpl_->leapfrog(idx, stride);
-}
-
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_FUNCTIONS_HPP_
diff --git a/include/oneapi/mkl/rng/predicates.hpp b/include/oneapi/mkl/rng/predicates.hpp
deleted file mode 100644
index 10422e543..000000000
--- a/include/oneapi/mkl/rng/predicates.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RNG_PREDICATES_HPP_
-#define _ONEMKL_RNG_PREDICATES_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-
-// Buffer APIs
-
-template <typename Distr, typename Engine>
-inline void generate_precondition(const Distr& /*distr*/, Engine& /*engine*/, std::int64_t n,
-                                  sycl::buffer<typename Distr::result_type, 1>& r) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    if (n < 0 || n > r.size()) {
-        throw oneapi::mkl::invalid_argument("rng", "generate", "n");
-    }
-#endif
-}
-
-// USM APIs
-
-template <typename Distr, typename Engine>
-inline void generate_precondition(const Distr& /*distr*/, Engine& /*engine*/, std::int64_t n,
-                                  typename Distr::result_type* r,
-                                  const std::vector<sycl::event>& /*dependencies*/) {
-#ifndef ONEMKL_DISABLE_PREDICATES
-    if (n < 0) {
-        throw oneapi::mkl::invalid_argument("rng", "generate", "n");
-    }
-    if (r == nullptr) {
-        throw oneapi::mkl::invalid_argument("rng", "generate", "r is nullptr");
-    }
-#endif
-}
-
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_RNG_PREDICATES_HPP_
diff --git a/include/oneapi/mkl/sparse_blas.hpp b/include/oneapi/mkl/sparse_blas.hpp
deleted file mode 100644
index 912a20eb8..000000000
--- a/include/oneapi/mkl/sparse_blas.hpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_HPP_
-#define _ONEMKL_SPARSE_BLAS_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/detail/config.hpp"
-
-#ifdef ENABLE_MKLCPU_BACKEND
-#include "sparse_blas/detail/mklcpu/sparse_blas_ct.hpp"
-#endif
-#ifdef ENABLE_MKLGPU_BACKEND
-#include "sparse_blas/detail/mklgpu/sparse_blas_ct.hpp"
-#endif
-
-#include "sparse_blas/detail/sparse_blas_rt.hpp"
-
-#endif // _ONEMKL_SPARSE_BLAS_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp b/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp
deleted file mode 100644
index 4964b1eff..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_HELPER_TYPES_HPP_
-#define _ONEMKL_SPARSE_BLAS_DETAIL_HELPER_TYPES_HPP_
-
-#include <complex>
-#include <cstdint>
-#include <type_traits>
-
-namespace oneapi {
-namespace mkl {
-namespace sparse {
-namespace detail {
-
-struct matrix_handle;
-
-template <typename fpType>
-inline constexpr bool is_fp_supported_v =
-    std::is_same_v<fpType, float> || std::is_same_v<fpType, double> ||
-    std::is_same_v<fpType, std::complex<float>> || std::is_same_v<fpType, std::complex<double>>;
-
-template <typename intType>
-inline constexpr bool is_int_supported_v =
-    std::is_same_v<intType, std::int32_t> || std::is_same_v<intType, std::int64_t>;
-
-template <typename fpType, typename intType>
-inline constexpr bool are_fp_int_supported_v =
-    is_fp_supported_v<fpType>&& is_int_supported_v<intType>;
-
-} // namespace detail
-} // namespace sparse
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_SPARSE_BLAS_DETAIL_HELPER_TYPES_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp b/include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp
deleted file mode 100644
index 2535e61f6..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_ONEMKL_SPARSE_BLAS_MKLCPU_HPP_
-#define _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_ONEMKL_SPARSE_BLAS_MKLCPU_HPP_
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
-
-namespace oneapi::mkl::sparse::mklcpu {
-
-namespace detail = oneapi::mkl::sparse::detail;
-
-#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx"
-
-} // namespace oneapi::mkl::sparse::mklcpu
-
-#endif // _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_ONEMKL_SPARSE_BLAS_MKLCPU_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp
deleted file mode 100644
index bc0089c57..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_SPARSE_BLAS_CT_HPP_
-#define _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_SPARSE_BLAS_CT_HPP_
-
-#include "oneapi/mkl/sparse_blas/types.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-
-#include "onemkl_sparse_blas_mklcpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace sparse {
-
-#define BACKEND mklcpu
-#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx"
-#undef BACKEND
-
-} //namespace sparse
-} //namespace mkl
-} //namespace oneapi
-
-#endif // _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_SPARSE_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp b/include/oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp
deleted file mode 100644
index 1ca336b9b..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_MKLGPU_ONEMKL_SPARSE_BLAS_MKLGPU_HPP_
-#define _ONEMKL_SPARSE_BLAS_DETAIL_MKLGPU_ONEMKL_SPARSE_BLAS_MKLGPU_HPP_
-
-#include "oneapi/mkl/detail/export.hpp"
-#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
-
-namespace oneapi::mkl::sparse::mklgpu {
-
-namespace detail = oneapi::mkl::sparse::detail;
-
-#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx"
-
-} // namespace oneapi::mkl::sparse::mklgpu
-
-#endif // _ONEMKL_SPARSE_BLAS_DETAIL_MKLGPU_ONEMKL_SPARSE_BLAS_MKLGPU_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/mklgpu/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/mklgpu/sparse_blas_ct.hpp
deleted file mode 100644
index 00c01346f..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/mklgpu/sparse_blas_ct.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_MKLGPU_SPARSE_BLAS_CT_HPP_
-#define _ONEMKL_SPARSE_BLAS_DETAIL_MKLGPU_SPARSE_BLAS_CT_HPP_
-
-#include "oneapi/mkl/sparse_blas/types.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-
-#include "onemkl_sparse_blas_mklgpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace sparse {
-
-#define BACKEND mklgpu
-#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx"
-#undef BACKEND
-
-} //namespace sparse
-} //namespace mkl
-} //namespace oneapi
-
-#endif // _ONEMKL_SPARSE_BLAS_DETAIL_MKLGPU_SPARSE_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx b/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx
deleted file mode 100644
index 03beaa4b4..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
-*  Copyright(C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0(the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// This file is meant to be included in each backend onemkl_sparse_blas_BACKEND.hpp files.
-// It is used to exports each symbol to the onemkl_sparse_blas_BACKEND library.
-
-ONEMKL_EXPORT void init_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle);
-
-ONEMKL_EXPORT sycl::event release_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle,
-                                                const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType, typename intType>
-ONEMKL_EXPORT std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
-    sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols, intType nnz,
-    index_base index, sycl::buffer<intType, 1> &row_ptr, sycl::buffer<intType, 1> &col_ind,
-    sycl::buffer<fpType, 1> &val);
-
-template <typename fpType, typename intType>
-ONEMKL_EXPORT std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event>
-set_csr_data(sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols,
-             intType nnz, index_base index, intType *row_ptr, intType *col_ind, fpType *val,
-             const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event optimize_gemm(sycl::queue &queue, transpose transpose_A,
-                                        matrix_handle_t handle,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event optimize_gemm(sycl::queue &queue, transpose transpose_A,
-                                        transpose transpose_B, layout dense_matrix_layout,
-                                        const std::int64_t columns, matrix_handle_t handle,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event optimize_gemv(sycl::queue &queue, transpose transpose_val,
-                                        matrix_handle_t handle,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_EXPORT sycl::event optimize_trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val,
-                                        diag diag_val, matrix_handle_t handle,
-                                        const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType>
-ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
-    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
-    sycl::buffer<fpType, 1> &x, const fpType beta, sycl::buffer<fpType, 1> &y);
-
-template <typename fpType>
-ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
-    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
-    const fpType *x, const fpType beta, fpType *y,
-    const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType>
-ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(
-    sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
-    matrix_handle_t A_handle, sycl::buffer<fpType, 1> &x, sycl::buffer<fpType, 1> &y);
-
-template <typename fpType>
-ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
-    sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
-    matrix_handle_t A_handle, const fpType *x, fpType *y,
-    const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType>
-ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
-    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
-    const fpType alpha, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &B,
-    const std::int64_t columns, const std::int64_t ldb, const fpType beta,
-    sycl::buffer<fpType, 1> &C, const std::int64_t ldc);
-
-template <typename fpType>
-ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
-    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
-    const fpType alpha, matrix_handle_t A_handle, const fpType *B, const std::int64_t columns,
-    const std::int64_t ldb, const fpType beta, fpType *C, const std::int64_t ldc,
-    const std::vector<sycl::event> &dependencies = {});
diff --git a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx
deleted file mode 100644
index 41fe51c49..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx
+++ /dev/null
@@ -1,135 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-// This file is meant to be included in each backend sparse_blas_ct.hpp files
-// Each function calls the implementation from onemkl_sparse_blas_backends.hxx
-
-#ifndef BACKEND
-#error "BACKEND is not defined"
-#endif
-
-inline void init_matrix_handle(backend_selector<backend::BACKEND> selector,
-                               matrix_handle_t *p_handle) {
-    BACKEND::init_matrix_handle(selector.get_queue(), p_handle);
-}
-
-inline sycl::event release_matrix_handle(backend_selector<backend::BACKEND> selector,
-                                         matrix_handle_t *p_handle,
-                                         const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::release_matrix_handle(selector.get_queue(), p_handle, dependencies);
-}
-
-template <typename fpType, typename intType>
-std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
-    backend_selector<backend::BACKEND> selector, matrix_handle_t handle, intType num_rows,
-    intType num_cols, intType nnz, index_base index, sycl::buffer<intType, 1> &row_ptr,
-    sycl::buffer<intType, 1> &col_ind, sycl::buffer<fpType, 1> &val) {
-    BACKEND::set_csr_data(selector.get_queue(), handle, num_rows, num_cols, nnz, index, row_ptr,
-                          col_ind, val);
-}
-
-template <typename fpType, typename intType>
-std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event> set_csr_data(
-    backend_selector<backend::BACKEND> selector, matrix_handle_t handle, intType num_rows,
-    intType num_cols, intType nnz, index_base index, intType *row_ptr, intType *col_ind,
-    fpType *val, const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::set_csr_data(selector.get_queue(), handle, num_rows, num_cols, nnz, index,
-                                 row_ptr, col_ind, val, dependencies);
-}
-
-inline sycl::event optimize_gemm(backend_selector<backend::BACKEND> selector, transpose transpose_A,
-                                 matrix_handle_t handle,
-                                 const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::optimize_gemm(selector.get_queue(), transpose_A, handle, dependencies);
-}
-
-inline sycl::event optimize_gemm(backend_selector<backend::BACKEND> selector, transpose transpose_A,
-                                 transpose transpose_B, layout dense_matrix_layout,
-                                 const std::int64_t columns, matrix_handle_t handle,
-                                 const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::optimize_gemm(selector.get_queue(), transpose_A, transpose_B,
-                                  dense_matrix_layout, columns, handle, dependencies);
-}
-
-inline sycl::event optimize_gemv(backend_selector<backend::BACKEND> selector,
-                                 transpose transpose_val, matrix_handle_t handle,
-                                 const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::optimize_gemv(selector.get_queue(), transpose_val, handle, dependencies);
-}
-
-inline sycl::event optimize_trsv(backend_selector<backend::BACKEND> selector, uplo uplo_val,
-                                 transpose transpose_val, diag diag_val, matrix_handle_t handle,
-                                 const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::optimize_trsv(selector.get_queue(), uplo_val, transpose_val, diag_val, handle,
-                                  dependencies);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
-    backend_selector<backend::BACKEND> selector, transpose transpose_val, const fpType alpha,
-    matrix_handle_t A_handle, sycl::buffer<fpType, 1> &x, const fpType beta,
-    sycl::buffer<fpType, 1> &y) {
-    BACKEND::gemv(selector.get_queue(), transpose_val, alpha, A_handle, x, beta, y);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
-    backend_selector<backend::BACKEND> selector, transpose transpose_val, const fpType alpha,
-    matrix_handle_t A_handle, const fpType *x, const fpType beta, fpType *y,
-    const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::gemv(selector.get_queue(), transpose_val, alpha, A_handle, x, beta, y,
-                         dependencies);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(
-    backend_selector<backend::BACKEND> selector, uplo uplo_val, transpose transpose_val,
-    diag diag_val, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &x,
-    sycl::buffer<fpType, 1> &y) {
-    BACKEND::trsv(selector.get_queue(), uplo_val, transpose_val, diag_val, A_handle, x, y);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
-    backend_selector<backend::BACKEND> selector, uplo uplo_val, transpose transpose_val,
-    diag diag_val, matrix_handle_t A_handle, const fpType *x, fpType *y,
-    const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::trsv(selector.get_queue(), uplo_val, transpose_val, diag_val, A_handle, x, y,
-                         dependencies);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
-    backend_selector<backend::BACKEND> selector, layout dense_matrix_layout, transpose transpose_A,
-    transpose transpose_B, const fpType alpha, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &B,
-    const std::int64_t columns, const std::int64_t ldb, const fpType beta,
-    sycl::buffer<fpType, 1> &C, const std::int64_t ldc) {
-    BACKEND::gemm(selector.get_queue(), dense_matrix_layout, transpose_A, transpose_B, alpha,
-                  A_handle, B, columns, ldb, beta, C, ldc);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
-    backend_selector<backend::BACKEND> selector, layout dense_matrix_layout, transpose transpose_A,
-    transpose transpose_B, const fpType alpha, matrix_handle_t A_handle, const fpType *B,
-    const std::int64_t columns, const std::int64_t ldb, const fpType beta, fpType *C,
-    const std::int64_t ldc, const std::vector<sycl::event> &dependencies = {}) {
-    return BACKEND::gemm(selector.get_queue(), dense_matrix_layout, transpose_A, transpose_B, alpha,
-                         A_handle, B, columns, ldb, beta, C, ldc, dependencies);
-}
diff --git a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp
deleted file mode 100644
index 131e0545a..000000000
--- a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_SPARSE_BLAS_RT_HPP_
-#define _ONEMKL_SPARSE_BLAS_DETAIL_SPARSE_BLAS_RT_HPP_
-
-#include "oneapi/mkl/sparse_blas/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace sparse {
-
-void init_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle);
-
-sycl::event release_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle,
-                                  const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType, typename intType>
-std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
-    sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols, intType nnz,
-    index_base index, sycl::buffer<intType, 1> &row_ptr, sycl::buffer<intType, 1> &col_ind,
-    sycl::buffer<fpType, 1> &val);
-
-template <typename fpType, typename intType>
-std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event> set_csr_data(
-    sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols, intType nnz,
-    index_base index, intType *row_ptr, intType *col_ind, fpType *val,
-    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event optimize_gemm(sycl::queue &queue, transpose transpose_A, matrix_handle_t handle,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event optimize_gemm(sycl::queue &queue, transpose transpose_A, transpose transpose_B,
-                          layout dense_matrix_layout, const std::int64_t columns,
-                          matrix_handle_t handle,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event optimize_gemv(sycl::queue &queue, transpose transpose_val, matrix_handle_t handle,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event optimize_trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
-                          matrix_handle_t handle,
-                          const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
-    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
-    sycl::buffer<fpType, 1> &x, const fpType beta, sycl::buffer<fpType, 1> &y);
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
-    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
-    const fpType *x, const fpType beta, fpType *y,
-    const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(sycl::queue &queue, uplo uplo_val,
-                                                         transpose transpose_val, diag diag_val,
-                                                         matrix_handle_t A_handle,
-                                                         sycl::buffer<fpType, 1> &x,
-                                                         sycl::buffer<fpType, 1> &y);
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
-    sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
-    matrix_handle_t A_handle, const fpType *x, fpType *y,
-    const std::vector<sycl::event> &dependencies = {});
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
-    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
-    const fpType alpha, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &B,
-    const std::int64_t columns, const std::int64_t ldb, const fpType beta,
-    sycl::buffer<fpType, 1> &C, const std::int64_t ldc);
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
-    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
-    const fpType alpha, matrix_handle_t A_handle, const fpType *B, const std::int64_t columns,
-    const std::int64_t ldb, const fpType beta, fpType *C, const std::int64_t ldc,
-    const std::vector<sycl::event> &dependencies = {});
-
-} // namespace sparse
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_SPARSE_BLAS_DETAIL_SPARSE_BLAS_RT_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/types.hpp b/include/oneapi/mkl/sparse_blas/types.hpp
deleted file mode 100644
index 406c7dd1f..000000000
--- a/include/oneapi/mkl/sparse_blas/types.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_TYPES_HPP_
-#define _ONEMKL_SPARSE_BLAS_TYPES_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <vector>
-
-#include "oneapi/mkl/types.hpp"
-#include "detail/helper_types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace sparse {
-
-using matrix_handle_t = detail::matrix_handle*;
-
-} // namespace sparse
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_SPARSE_BLAS_TYPES_HPP_
diff --git a/include/oneapi/mkl/types.hpp b/include/oneapi/mkl/types.hpp
deleted file mode 100644
index 32d336e11..000000000
--- a/include/oneapi/mkl/types.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_TYPES_HPP_
-#define _ONEMKL_TYPES_HPP_
-
-#ifdef __HIPSYCL__
-#include "oneapi/mkl/bfloat16.hpp"
-#endif
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-
-#ifndef __HIPSYCL__
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-#endif
-
-// BLAS flag types.
-enum class transpose : char { nontrans = 0, trans = 1, conjtrans = 3, N = 0, T = 1, C = 3 };
-
-enum class uplo : char { upper = 0, lower = 1, U = 0, L = 1 };
-
-enum class diag : char { nonunit = 0, unit = 1, N = 0, U = 1 };
-
-enum class side : char { left = 0, right = 1, L = 0, R = 1 };
-
-enum class offset : char { row = 0, column = 1, fix = 2, R = 0, C = 1, F = 2 };
-
-enum class layout : char { row_major = 0, col_major = 1, R = 0, C = 1 };
-
-enum class index_base : char {
-    zero = 0,
-    one = 1,
-};
-
-// LAPACK flag types.
-enum class job : char {
-    novec = 0,
-    vec = 1,
-    updatevec = 2,
-    allvec = 3,
-    somevec = 4,
-    overwritevec = 5,
-    N = 0,
-    V = 1,
-    U = 2,
-    A = 3,
-    S = 4,
-    O = 5
-};
-enum class jobsvd : char {
-    novec = 0,
-    vectors = 1,
-    vectorsina = 2,
-    somevec = 3,
-    N = 0,
-    A = 1,
-    O = 2,
-    S = 3
-};
-enum class generate : char { q = 0, p = 1, none = 2, both = 3, Q = 0, P = 1, N = 2, V = 3 };
-enum class compz : char {
-    novectors = 0,
-    vectors = 1,
-    initvectors = 2,
-    N = 0,
-    V = 1,
-    I = 2,
-};
-enum class direct : char {
-    forward = 0,
-    backward = 1,
-    F = 0,
-    B = 1,
-};
-enum class storev : char {
-    columnwise = 0,
-    rowwise = 1,
-    C = 0,
-    R = 1,
-};
-enum class rangev : char {
-    all = 0,
-    values = 1,
-    indices = 2,
-    A = 0,
-    V = 1,
-    I = 2,
-};
-enum class order : char {
-    block = 0,
-    entire = 1,
-    B = 0,
-    E = 1,
-};
-
-} //namespace mkl
-} //namespace oneapi
-
-#endif //_ONEMKL_TYPES_HPP_
diff --git a/legal_information.md b/legal_information.md
deleted file mode 100644
index 120c41043..000000000
--- a/legal_information.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Trademark Information
-
-3D XPoint, Altera, APEX, AnyWAN, Arria, Avalon, Axxia, BunnyPeople, Celeron, 
-Cilk, CONVERGATE, Cyclone, Docea, eASIC, easicopy, Enpirion, Hyperflex, Intel, 
-the Intel logo, Intel Adaptix, Intel Agilex, Intel Atom, Intel CoFluent, Intel 
-Core, Intel Inside, the Intel Inside logo, Intel Nervana, Intel Optane, Intel 
-RealSense, Intel Shooting Star, Intel Sirius, Intel SpeedStep, Intel Unite, 
-Intel vPro, Intel Xeon Phi, Iris, Itanium, MAX, Movidius, Myriad, neon, Nios, 
-OpenVINO, the OpenVINO logo, Pentium, Puma, Quark, Quartus, SICOFI, Simics, 
-SoftSilicon, Sound Mark, StarPro, Stratix, the Stratix logo, Stay With It, the 
-Engineering Stay With It logo, StreamSight, Tarari, The Journey Inside, 
-Thunderbolt, the Thunderbolt logo, Transcede, Ultrabook, VTune, and Xeon are 
-trademarks of Intel Corporation or its subsidiaries.
-
-*Other names and brands may be claimed as the property of others.
-
-Microsoft, Windows, and the Windows logo are trademarks, or registered 
-trademarks of Microsoft Corporation in the United States and/or other countries.
-
-Java is a registered trademark of Oracle and/or its affiliates.
-
-The Bluetooth(R) word mark and logos are registered trademarks owned by the 
-Bluetooth SIG, Inc. and any use of such marks by Intel Corporation is under 
-license.
-
-Intel Corporation uses the Palm OS* Ready mark under license from Palm, Inc.
-
-OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by 
-Khronos.
-
-Radeon and the Radeon RX Vega logo are trademarks of Advanced Micro Devices, 
-Inc.
-
-SPEC(R), SPECrate(R) and SPEC CPU(R) are registered trademarks of the Standard 
-Performance Evaluation Corporation. See http://www.spec.org/spec/trademarks.html
-for more information.
\ No newline at end of file
diff --git a/rfcs/template.md b/rfcs/template.md
new file mode 100644
index 000000000..12340e6dc
--- /dev/null
+++ b/rfcs/template.md
@@ -0,0 +1,62 @@
+# Design Document
+
+### Revision
+
+
+|Date       |Revision| Comments                                                                 |
+|-----------|--------|--------------------------------------------------------------------------|
+|  YYYYMMDD |  1.0   | Initial version                                                          |
+|  YYYYMMDD |  X.Y   |                                                                          |
+
+
+
+## Motivation
+
+Short and clear explanation of the main reason for this design document.
+* What problem do we solve?
+   * Concise statement of the problem.
+* What is the user impact if we don't do this?
+   * Explain user impact if it is not implemented.
+* What is the timeline?
+   * Desired timeline.
+* Do we have existing feature requests or issues from users?
+   * List existing feature requests/issues related to the problem.
+
+## Outline
+
+1. [Introduction](#introduction)
+2. [Proposal](#proposal)
+3. [Changes in the Project](#changes-in-the-project)
+4. [Examples](#examples)
+5. [User impact](#user-impact)
+6. [Open questions](#open-questions)
+
+## Introduction
+
+Background and detailed description of the problem.
+
+## Proposal
+
+A full and detailed description of the proposal.
+
+### Other Considered Approaches
+
+Any other approaches that were considered with pros and cons.
+
+## Changes in the Project
+
+List of changes in the project, timelines, affected components, etc.
+Please request a version update if this proposal contains major changes.
+
+## Examples
+
+Small examples, if applicable, e.g., to demonstrate how a new API can be called from the application,
+or how to build a new component.
+
+## User impact
+
+If applicable, add information about user impact for the proposed changes.
+
+## Open questions
+
+List of open questions, potential problems, and other important things to consider in the future.
diff --git a/scripts/blas_list.txt b/scripts/blas_list.txt
deleted file mode 100644
index cc7f0a05a..000000000
--- a/scripts/blas_list.txt
+++ /dev/null
@@ -1,178 +0,0 @@
-asum
-asum
-asum
-asum
-axpy
-axpy
-axpy
-axpy
-copy
-copy
-copy
-copy
-dot
-dot
-dot
-dotc
-dotc
-dotu
-dotu
-iamin
-iamin
-iamin
-iamin
-iamax
-iamax
-iamax
-iamax
-nrm2
-nrm2
-nrm2
-nrm2
-rot
-rot
-rot
-rot
-rotg
-rotg
-rotg
-rotg
-rotm
-rotm
-rotmg
-rotmg
-scal
-scal
-scal
-scal
-scal
-scal
-sdsdot
-swap
-swap
-swap
-swap
-gbmv
-gbmv
-gbmv
-gbmv
-gemv
-gemv
-gemv
-gemv
-ger
-ger
-gerc
-gerc
-geru
-geru
-hbmv
-hbmv
-hemv
-hemv
-her
-her
-her2
-her2
-hpmv
-hpmv
-hpr
-hpr
-hpr2
-hpr2
-sbmv
-sbmv
-spmv
-spmv
-spr
-spr
-spr2
-spr2
-symv
-symv
-syr
-syr
-syr2
-syr2
-tbmv
-tbmv
-tbmv
-tbmv
-tbsv
-tbsv
-tbsv
-tbsv
-tpmv
-tpmv
-tpmv
-tpmv
-tpsv
-tpsv
-tpsv
-tpsv
-trmv
-trmv
-trmv
-trmv
-trsv
-trsv
-trsv
-trsv
-gemm
-gemm
-gemm
-gemm
-gemm
-hemm
-hemm
-herk
-herk
-her2k
-her2k
-symm
-symm
-symm
-symm
-syrk
-syrk
-syrk
-syrk
-syr2k
-syr2k
-syr2k
-syr2k
-trmm
-trmm
-trmm
-trmm
-trsm
-trsm
-trsm
-trsm
-gemm_batch
-gemm_batch
-gemm_batch
-gemm_batch
-gemm_batch
-gemm_batch
-gemm_batch
-gemm_batch
-trsm_batch
-trsm_batch
-trsm_batch
-trsm_batch
-trsm_batch
-trsm_batch
-trsm_batch
-trsm_batch
-gemmt
-gemmt
-gemmt
-gemmt
-gemm_ext
-gemm_ext
-gemm_ext
-gemm_ext
-gemm_ext
-gemm_ext
-gemm_ext
diff --git a/scripts/func_parser.py b/scripts/func_parser.py
deleted file mode 100755
index cbaa26142..000000000
--- a/scripts/func_parser.py
+++ /dev/null
@@ -1,190 +0,0 @@
-#!/usr/bin/env python
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-from sys import argv, exit, stdin
-from collections import defaultdict
-import re
-
-def parse_item(item):
-    func_type_and_name, par_str = item.split('(', 1)
-    ret_type, func_name = func_type_and_name.strip().rsplit(' ', 1)
-    """remove macros callback"""
-    ret_type = re.sub('^[A-Z_]+ ','',ret_type)
-    """remove macros calling convention"""
-    ret_type = re.sub(' [A-Z_]+$','',ret_type.strip())
-    """remove templates"""
-    ret_type = re.sub('template\s+<[a-zA-Z, _*:]*>','',ret_type.strip())
-    if func_name[0] == '*':
-            ret_type += ' *'
-            func_name = func_name[1:]
-
-    par_str = re.sub('^\s*(void)\s*$', '', par_str.strip(');'))
-
-    """Extract callback calls from parameter list and replace them by temp 'cllbck' param"""
-    clbck_type = None
-    clbck_param = None
-    if par_str.find('(') != -1:
-        par_str = re.sub('[)]\s*[(]', ')(', par_str)
-        clbck_re = re.compile(r'\w*\s*[(]\s*[A-Z_]*\s*[*]\s*\w*\s*[)]\s*[(][\w*,\[\]\s]*[)]')
-        clbck_list = clbck_re.findall(par_str)
-        clbck_type = [ re.sub('[*]\s*\w*', '*', x.split(')(')[0]) for x in clbck_list]
-        clbck_param = [ x.split(')(')[1] for x in clbck_list]
-
-        par_str = re.sub('[)]\s*[(][\w*,\[\]\s]*[)]', '', par_str)
-        par_str = re.sub('\w*\s*[(]\s*[A-Z_]*\s*[\w]*\s*[*]\s*', 'cllbck ', par_str)
-    par_str_ = re.sub('[,]+(?![^<]+>)', '@', par_str)
-    par_list = [x.strip() for x in par_str_.split('@') \
-                    if len(x.strip()) > 0 ]
-    """Split list of parameters to types and names"""
-    if len(par_list) > 0:
-            """Add parameter names (param1, param2, etc) if the declaration includes only types"""
-            if re.search('(,|^)+\s*(const)*\s*[\w:]+\s*[*]*\s*(,|$)+', re.sub('<[\s\w\d,]*>', '', par_str)) is not None:
-                par_list = [(x + ' param' + str(idx)).replace(" * ", " *" \
-                     ).replace("[] param" + str(idx), "param" + str(idx) + "[]") \
-                     for idx, x in enumerate(par_list)]
-
-            """Extract names to call_list"""
-            call_list = [x.split('=', 1)[0].strip().rsplit(' ', 1)[1].strip(' *').strip('\[\]').strip('&') \
-                            for x in par_list]
-
-            """Extract types to sig_list"""
-            par_list_wo_st_arrays = [(x.rsplit(' ', 1)[0] + \
-                    (lambda x: '* ' if x.find('[]') != -1 else ' ')(x.rsplit(' ', 1)[1]) + \
-                    (x.rsplit(' ', 1)[1]).strip('\[\]')) for x in par_list]
-            sig_list = [(x.rsplit(' ', 1)[0] + \
-                                    (x.rsplit(' ', 1)[1].startswith('*') \
-                                    and (' ' + x.rsplit(' ', 1)[1].count('*') * '*') or '')) \
-                                    for x in par_list_wo_st_arrays]
-    else:
-            call_list = list()
-            sig_list = list()
-    par_str = '(' + ', '.join(par_list) + ')'
-    call_str = '(' + ', '.join(call_list) + ')'
-    sig_str = '(' + ', '.join(sig_list) + ')'
-
-    """Put real callback call types back to the param_list and sig_str """
-    if clbck_param is not None:
-        for idx, x in enumerate(clbck_param):
-            par_str = re.sub(r'(cllbck\s*\w*)[,]', r'\1(' + x + ',', par_str, idx)
-            sig_str = re.sub(r'(cllbck\s*\w*)[,]', r'\1(' + x + ',', sig_str, idx)
-
-    if clbck_type is not None:
-        for idx, x in enumerate(clbck_type):
-            par_str = re.sub(r'cllbck(\s*\w*)', x + r'\1)', par_str, idx)
-            sig_str = re.sub(r'cllbck(\s*\w*)', x + r'\1)', sig_str, idx)
-    return func_name, ret_type, func_name, par_str, call_str, sig_str, call_list, sig_list
-
-
-def to_dict(func_data):
-    """ convert (ret_type, 'name', par_str, call_str, sig_str, call_list, sig_list) tuple to
-        dict with corresponding keys """
-    return dict(zip(('ret_type', 'name', 'par_str', 'call_str', 'sig_str', 'call_list', 'sig_list'), func_data))
-
-is_comment = 0
-is_wrapperbody = 0
-def strip_line(l):
-    """ remove global variables"""
-    if re.search('^\s*\w+\s*\w+[;]', l) is not None:
-        l = ''
-    """ remove namespaces"""
-    if re.search('^\s*namespace\s*\w+\s*[{]', l) is not None:
-        l = ''
-    """ remove declaration keywords """
-    l = re.sub("^extern ", "", l)
-    l = re.sub("^static ", "", l)
-    l = re.sub("^inline ", "", l)
-    """ remove extra whitespace and comments from input line """
-    l = re.sub("[)][A-Za-z0-9\s_]*[;]", ");", l)
-
-    """ remove simple wrapper function body"""
-    global is_wrapperbody
-    if is_wrapperbody == 1:
-        if re.search('^\s*}', l) is not None:
-            l = l.split('}', 1)[1].strip()
-            is_wrapperbody = 0
-        else:
-            return ""
-
-    m = re.search(r'[)]\s*\n*\s*[{]', l)
-    if m is not None:
-        l = l[:m.end()].strip('{').strip() + ";"
-        is_wrapperbody = 1
-
-    global is_comment
-    if is_comment == 1:
-        if re.search('\*/', l) is not None:
-            l = l.split('*/', 1)[1].strip()
-            is_comment = 0
-        else:
-            return ""
-    """ Delete comments """
-    l1 = l.split('#', 1)[0].strip()
-    l2 = l1.split('//', 1)[0].strip()
-    l3 = l2.split('/*', 1)[0].strip()
-    if re.search('/\*', l2) is not None:
-        is_comment = 1
-        if re.search('\*/', l2) is not None:
-            is_comment = 0
-            l4 = l2.split('*/', 1)[1].strip()
-            l3 += l4
-    """ Delete comments if there are several of them in one line """
-    l3 = re.sub("[/][*][\w\s]*[*][/]", "", l3);
-    """Delete all tabs"""
-    return re.sub(' +',' ', l3)
-
-def create_func_db(filename):
-    with open(filename, 'r') as f:
-        data = f.readlines()
-    funcs_db = defaultdict(list)
-    whole_line = ""
-    idx = 0
-    for l in data:
-        stripped = strip_line(l)
-        if not stripped:
-            continue
-        """ Check if function contains 1 line """
-        whole_line += stripped + ' '
-        """ Check if there is function """
-        if re.search('[(][\w\s\*/\&,_\[\]():<>={}]*[)]\s*[;]', whole_line) is None:
-            """ Check if there is some other staff before the function """
-            if re.search('[;{}]\s*$', whole_line) is not None:
-                whole_line = ""
-            continue
-        else:
-            stripped = whole_line.strip()
-            whole_line = ""
-        parsed = parse_item(stripped)
-        func_name, func_data = parsed[0], parsed[1:]
-        funcs_db[func_name].append(to_dict(func_data))
-        idx = idx + 1
-    return funcs_db
-
-def get_namespaces(filename):
-    with open(filename, 'r') as f:
-        data = f.readlines()
-    namespace_list = list()
-    for l in data:
-        stripped = strip_line(l)
-        if re.search('^\s*namespace\s*\w+\s*[{]', l) is not None:
-           l = l.split("namespace", 1)[1]
-           l = l.split("{", 1)[0]
-           namespace_list.append(l.strip())
-    return namespace_list
-
diff --git a/scripts/generate_backend_api.py b/scripts/generate_backend_api.py
deleted file mode 100755
index c3866270e..000000000
--- a/scripts/generate_backend_api.py
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/env python
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-from sys import argv, exit, stdin
-from subprocess import call
-from pprint import pprint
-from collections import defaultdict
-import errno
-import re
-import os
-
-from func_parser import create_func_db, get_namespaces
-
-def usage(err = None):
-    if err:
-        print('error: %s' % err)
-    print('''\
-Script to generate backend library header based on base_header.h
-Note: requires clang-format 9.0.0 tool to be installed
-Usage:
-
-    {script} <path/to/base_header.hpp> <path/to/backend_include.hpp> <namespace>
-
-Example:
-The command below will generate:
-"onemkl_blas_mklgpu.hpp" header with declaration of all backend library APIs.
-API from backend library will be called from "oneapi::mkl::mklgpu::blas" namespace.
-
-{script}  include/oneapi/mkl/blas.hpp include/oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp oneapi::mkl::mklgpu::blas
-'''.format(script = argv[0]))
-
-if len(argv) < 3:
-    usage()
-    exit(0)
-
-if re.search(r'[-]*\b[h]([e][l][p])?\b' ,argv[1]):
-    usage()
-    exit(0)
-
-in_filename = argv[1]
-out_headername = argv[2]
-namespace = argv[3]
-
-namespace_list=namespace.split("::")
-
-header_db = create_func_db(in_filename)
-
-print("Generate " + out_headername)
-
-def print_declaration(func_list):
-    code=""
-    for data in func_list:
-        code +="""
-{ret_type} {name}{par_str};
-
-""".format(**data)
-    return code
-
-try:
-    os.makedirs(os.path.dirname(out_headername))
-except OSError as exc:
-    if exc.errno != errno.EEXIST:
-        raise
-
-out_file = open(out_headername, "w+")
-out_file.write("""//
-// Generated based on {in_filename}
-//
-
-#pragma once
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-""".format(in_filename=in_filename))
-
-for nmsp in namespace_list:
-    out_file.write("""namespace {name} {{
-""".format(name=nmsp))
-
-for func_name, func_list in header_db.items():
-    out_file.write("""
-{funcs}""".format(funcs=print_declaration(func_list)))
-
-for nmsp in reversed(namespace_list):
-    out_file.write("""}} // namespace {name} {{
-""".format(name=nmsp))
-
-out_file.close()
-
-print("Formatting with clang-format " + out_headername)
-try:
-    lc = ["clang-format", "-style=file", "-i", out_headername]
-    call(lc)
-except OSError as exc:
-    if exc.errno == errno.ENOENT:
-        print("Error: clang-format is not found")
-
diff --git a/scripts/generate_cmake.py b/scripts/generate_cmake.py
deleted file mode 100644
index 02efedb25..000000000
--- a/scripts/generate_cmake.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-from sys import argv, exit, stdin
-from subprocess import call
-from pprint import pprint
-from collections import defaultdict
-import re
-import os
-
-from func_parser import create_func_db, get_namespaces
-
-def usage(err = None):
-    if err:
-        print('error: %s' % err)
-    print('''\
-Script to generate CMakeLists.txt for all files in the specified directory
-Usage:
-
-    {script} <path/to/directory> <libname>
-
-Example:
-
-    {script}  include/oneapi/mkl/blas/detail/mklgpu mklgpu
-'''.format(script = argv[0]))
-
-if len(argv) <= 2:
-    usage()
-    exit(0)
-
-if re.search(r'[-]*\b[h]([e][l][p])?\b' ,argv[1]):
-    usage()
-    exit(0)
-
-in_dir = argv[1]
-libname = argv[2]
-
-if not os.path.exists(in_dir):
-    print("Error: directory " + in_dir + " doesn't exist\n")
-    exit(1)
-
-cmake_file = in_dir + "/CMakeLists.txt"
-
-if os.path.exists(cmake_file):
-    print("Error: file " + cmake_file + " already exists\n")
-    exit(1)
-else:
-    print("Generate " + cmake_file)
-
-file_list = os.listdir(in_dir)
-
-out_file = open(cmake_file, "w+")
-
-out_file.write("""#
-# generated file
-#
-
-set(LIB_NAME onemkl_blas_{libname})
-set(LIB_OBJ ${{LIB_NAME}}_obj)
-
-# Add third-party library
-# find_package(XXX REQUIRED)
-
-add_library(${{LIB_NAME}})
-add_library(${{LIB_OBJ}} OBJECT
-""".format(libname=libname))
-
-for f in file_list:
-    if re.search('_dyn.c', f):
-        out_file.write("""  $<$<BOOL:${{BUILD_SHARED_LIBS}}>: {filename}>
-""".format(filename=f))
-    else:
-        out_file.write("""  {filename}
-""".format(filename=f))
-
-out_file.write("""
-)
-
-target_include_directories(${{LIB_OBJ}}
-  PRIVATE ${{PROJECT_SOURCE_DIR}}/include
-          ${{PROJECT_SOURCE_DIR}}/src
-)
-
-target_link_libraries(${{LIB_OBJ}}
-    PUBLIC ONEMKL::SYCL::SYCL
-    # Add third party library to link with here
-)
-
-target_compile_features(${{LIB_OBJ}} PUBLIC cxx_std_14)
-set_target_properties(${{LIB_OBJ}} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${{LIB_NAME}} PUBLIC ${{LIB_OBJ}})
-
-# Add major version to the library
-set_target_properties(${{LIB_NAME}} PROPERTIES
-  SOVERSION ${{PROJECT_VERSION_MAJOR}}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${{LIB_NAME}}>)
-
-# Add the library to install package
-install(TARGETS ${{LIB_OBJ}} EXPORT oneMKLTargets)
-install(TARGETS ${{LIB_NAME}} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
-""".format())
-
-out_file.close()
diff --git a/scripts/generate_ct_instant.py b/scripts/generate_ct_instant.py
deleted file mode 100755
index acc02e3ea..000000000
--- a/scripts/generate_ct_instant.py
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/usr/bin/env python
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-from sys import argv, exit, stdin
-from subprocess import call
-from pprint import pprint
-from collections import defaultdict
-import errno
-import re
-import os
-
-from func_parser import create_func_db, get_namespaces
-
-def usage(err = None):
-    if err:
-        print('error: %s' % err)
-    print('''\
-Script to generate CT API instantiations for backend based on general_ct_templates.hpp
-Note: requires clang-format 9.0.0 tool to be installed
-Usage:
-
-    {script} <path/to/general_ct_templates.hpp> <path/to/out_ct_header.hpp> <path/to/backend_include.hpp> <backend> <namespace>
-
-Example:
-The command below will generate:
-"blas_ct.hpp" header with compile-time BLAS API based on "blas_ct_templates.hpp" for "mklgpu" backend.
-API from the backend library will be called from "oneapi::mkl::mklgpu::blas" namespace.
-
-{script}  include/oneapi/mkl/blas/detail/blas_ct_templates.hpp include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hpp include/oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp mklgpu oneapi::mkl::mklgpu::blas
-'''.format(script = argv[0]))
-
-if len(argv) < 6:
-    usage()
-    exit(0)
-
-if re.search(r'[-]*\b[h]([e][l][p])?\b' ,argv[1]):
-    usage()
-    exit(0)
-
-in_filename = argv[1]
-out_filename = argv[2]
-include = argv[3]
-backend = argv[4]
-namespace = argv[5]
-
-namespace_list=namespace.split("::")
-
-header_db = create_func_db(in_filename)
-external_namespace_list=get_namespaces(in_filename)
-
-print("Generate " + out_filename)
-
-def print_funcs(func_list):
-    code=""
-    for data in func_list:
-        code +="""
-template<>
-{ret_type} {name}<backend::{backend}>{par_str} {{
-    {name}_precondition{call_str};
-    {namespace}::{name}{call_str};
-    {name}_postcondition{call_str};
-}}
-""".format(namespace=namespace, backend=backend, **data)
-    return code
-
-try:
-    os.makedirs(os.path.dirname(out_filename))
-except OSError as exc:
-    if exc.errno != errno.EEXIST:
-        raise
-
-out_file = open(out_filename, "w+")
-out_file.write("""//
-// Generated based on {in_header}
-//
-
-#pragma once
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-#include "{internal_api}"
-#include "{ct_teplates}"
-
-""".format(in_header=in_filename, ct_teplates=in_filename.strip("include/"), internal_api=include.strip("include/")))
-
-
-for nmsp in external_namespace_list:
-    out_file.write("""namespace {name} {{
-""".format(name=nmsp))
-
-for func_name, func_list in header_db.items():
-    out_file.write("""
-{funcs}""".format(funcs=print_funcs(func_list)))
-
-
-for nmsp in reversed(external_namespace_list):
-    out_file.write("""}} // namespace {name} {{
-""".format(name=nmsp))
-
-out_file.close()
-
-print("Formatting with clang-format " + out_filename)
-try:
-    lc = ["clang-format", "-style=file", "-i", out_filename]
-    call(lc)
-except OSError as exc:
-    if exc.errno == errno.ENOENT:
-        print("Error: clang-format is not found")
-
diff --git a/scripts/generate_ct_templates.py b/scripts/generate_ct_templates.py
deleted file mode 100755
index 7f54d4f08..000000000
--- a/scripts/generate_ct_templates.py
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/env python
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-from sys import argv, exit, stdin
-from subprocess import call
-from pprint import pprint
-from collections import defaultdict
-import errno
-import re
-import os
-
-from func_parser import create_func_db, get_namespaces
-
-def usage(err = None):
-    if err:
-        print('error: %s' % err)
-    print('''\
-Script to generate header file for templated compile-time API based on base_header.h
-Note: requires clang-format 9.0.0 tool to be installed
-Usage:
-
-    {script} <path/to/base_header.hpp> <path/to/out_headername.hpp>
-
-Example:
-The command below will generate:
-"blas_ct_templates.hpp" header with general templates for compile-time BLAS API based on "blas.hpp".
-
-    {script}  include/oneapi/mkl/blas.hpp include/oneapi/mkl/blas/detail/blas_ct_templates.hpp
-'''.format(script = argv[0]))
-
-if len(argv) < 2:
-    usage()
-    exit(0)
-
-if re.search(r'[-]*\b[h]([e][l][p])?\b' ,argv[1]):
-    usage()
-    exit(0)
-
-in_filename = argv[1]
-out_filename = argv[2]
-
-header_db = create_func_db(in_filename)
-external_namespace_list=get_namespaces(in_filename)
-
-print("Generate " + out_filename)
-
-def print_funcs(func_list):
-    code=""
-    for data in func_list:
-        code +="""
-template <oneapi::mkl::backend backend> static inline {ret_type} {name}{par_str};
-""".format(**data)
-    return code
-
-try:
-    os.makedirs(os.path.dirname(out_filename))
-except OSError as exc:
-    if exc.errno != errno.EEXIST:
-        raise
-
-out_file = open(out_filename, "w+")
-out_file.write("""//
-// Generated based on {in_header}
-//
-
-#pragma once
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-#include <cstdint>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-
-""".format(in_header=in_filename))
-
-
-for nmsp in external_namespace_list:
-    out_file.write("""namespace {name} {{
-""".format(name=nmsp))
-
-for func_name, func_list in header_db.items():
-    out_file.write("""
-{funcs}""".format(funcs=print_funcs(func_list)))
-
-
-for nmsp in reversed(external_namespace_list):
-    out_file.write("""}} // namespace {name} {{
-""".format(name=nmsp))
-
-out_file.close()
-
-print("Formatting with clang-format " + out_filename)
-retcode = 1
-try:
-    lc = ["clang-format", "-style=file", "-i", out_filename]
-    retcode=call(lc)
-except OSError as exc:
-    if exc.errno == errno.ENOENT:
-        print("Error: clang-format is not found")
-
diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py
deleted file mode 100755
index 29947273f..000000000
--- a/scripts/generate_wrappers.py
+++ /dev/null
@@ -1,172 +0,0 @@
-#!/usr/bin/env python
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-from sys import argv, exit, stdin
-from subprocess import call
-from pprint import pprint
-from collections import defaultdict
-import errno
-import re
-import os
-
-from func_parser import create_func_db, get_namespaces
-
-def usage(err = None):
-    if err:
-        print('error: %s' % err)
-    print('''\
-Script to generate blank wrappers and pointers table based on header.hpp
-Note: requires clang-format 9.0.0 tool
-Usage:
-
-    {script} <path/to/header.hpp> <path/to/table.hpp> <path/to/out_wrappers.cpp> <libname>
-
-Example:
-
-    {script}  include/oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp src/blas/function_table.hpp src/blas/backend/mklgpu/wrappers.cpp mklgpu
-'''.format(script = argv[0]))
-
-if len(argv) <= 4:
-    usage()
-    exit(0)
-
-if re.search(r'[-]*\b[h]([e][l][p])?\b' ,argv[1]):
-    usage()
-    exit(0)
-
-in_filename = argv[1]
-in_table = argv[2]
-out_filename = argv[3]
-libname = argv[4]
-
-table_list = argv[0].rsplit('/', 1)[0] + "/blas_list.txt"
-table_file = out_filename.rsplit('/', 1)[0] + "/" + libname + "_wrappers_table_dyn.cpp"
-
-cmake_file = out_filename.rsplit('/', 1)[0] + "/CMakeLists.txt"
-
-header_db = create_func_db(in_filename)
-namespace_list = get_namespaces(in_filename)
-
-# Generate wrappers
-print("Generate " + out_filename)
-
-def print_funcs(func_list):
-    code=""
-    for data in func_list:
-        code +="""
-{ret_type} {name}{par_str} {{
-    throw std::runtime_error("Not implemented for {libname}");
-}}
-""".format(libname=libname, **data)
-    return code
-
-try:
-    os.makedirs(os.path.dirname(out_filename))
-except OSError as exc:
-    if exc.errno != errno.EEXIST:
-        raise
-
-out_file = open(out_filename, "w+")
-out_file.write("""//
-// generated file
-//
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-
-#include "{header}"
-
-""".format(header=in_filename.strip("include/")))
-
-for nmsp in namespace_list:
-    out_file.write("""namespace {name} {{
-""".format(name=nmsp))
-
-for func_name, func_list in header_db.items():
-    out_file.write("""
-{funcs}""".format(funcs=print_funcs(func_list)))
-
-out_file.write("\n")
-for nmsp in reversed(namespace_list):
-    out_file.write("""}} // namespace {name} {{
-""".format(name=nmsp))
-
-out_file.close()
-
-print("Formatting with clang-format " + out_filename)
-try:
-    lc = ["clang-format", "-style=file", "-i", out_filename]
-    call(lc)
-except OSError as exc:
-    if exc.errno == errno.ENOENT:
-        print("Error: clang-format is not found")
-    else:
-        raise
-
-# Generate table
-print("Generate " + table_file)
-
-try:
-    os.makedirs(os.path.dirname(table_file))
-except OSError as exc:
-    if exc.errno != errno.EEXIST:
-        raise
-
-out_file = open(table_file, "w+")
-out_file.write("""//
-// generated file
-//
-
-#include "{header}"
-#include "{table}"
-
-#define WRAPPER_VERSION 1
-
-extern "C" function_table_t mkl_blas_table = {{
-    WRAPPER_VERSION,
-""".format(table=in_table.strip('src/'), header=in_filename.strip('include/')))
-
-namespace = ""
-for nmsp in namespace_list:
-    namespace = namespace + nmsp.strip() + "::"
-with open(table_list, "r") as f:
-    table = f.readlines()
-
-for t in table:
-    out_file.write("    " + namespace + t.strip() + ",\n")
-
-
-out_file.write("\n};\n")
-out_file.close()
-
-print("Formatting with clang-format " + table_file)
-try:
-    lc = ["clang-format", "-style=file", "-i", table_file]
-    call(lc)
-except OSError as exc:
-    if exc.errno == errno.ENOENT:
-        print("Error: clang-format is not found")
-    else:
-        raise
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
deleted file mode 100644
index 0b632c1bd..000000000
--- a/src/CMakeLists.txt
+++ /dev/null
@@ -1,94 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Define common build flags for oneMKL libraries
-set(ONEMKL_BUILD_COPT "")
-if(WIN32 AND BUILD_SHARED_LIBS)
-  list(APPEND ONEMKL_BUILD_COPT "-Donemkl_EXPORTS")
-endif()
-
-# portBLAS backend variables must be accessible here to correctly
-# generate the config file.
-set(ENABLE_PORTBLAS_BACKEND_INTEL_CPU OFF CACHE INTERNAL "")
-set(ENABLE_PORTBLAS_BACKEND_INTEL_GPU OFF CACHE INTERNAL "")
-set(ENABLE_PORTBLAS_BACKEND_AMD_GPU OFF CACHE INTERNAL "")
-set(ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU OFF CACHE INTERNAL "")
-# store path to CMAKE_CURRENT_BINARY_DIR to use it later (makes FetchContent_Declare workable)
-set(ONEMKL_GENERATED_INCLUDE_PATH ${CMAKE_CURRENT_BINARY_DIR})
-
-
-set(ONEMKL_INTERFACE_INCLUDE_DIRS
-  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
-  $<INSTALL_INTERFACE:include>
-)
-
-# Build loader and backends for each domain
-add_custom_target(onemkl_backend_libs)
-foreach(domain ${TARGET_DOMAINS})
-  add_subdirectory(${domain})
-endforeach()
-
-# Generate header with enabled backends for testing
-configure_file(config.hpp.in "${CMAKE_CURRENT_BINARY_DIR}/oneapi/mkl/config.hpp.configured")
-file(GENERATE
-  OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/oneapi/mkl/detail/config.hpp"
-  INPUT "${CMAKE_CURRENT_BINARY_DIR}/oneapi/mkl/config.hpp.configured"
-)
-install(FILES "${CMAKE_CURRENT_BINARY_DIR}/oneapi/mkl/detail/config.hpp"
-  DESTINATION include/oneapi/mkl/detail
-  COMPONENT Devel
-)
-
-# Add recipe for onemkl loader library
-if(BUILD_SHARED_LIBS)
-  add_library(onemkl SHARED)
-
-  # The loader library depends on all the backend libraries as it uses
-  # dlopen to load them at runtime.
-  # Use add_dependencies to ensure that all the backend libraries are
-  # (re-)built when compiling the loader or runtime binaries.
-  add_dependencies(onemkl onemkl_backend_libs)
-
-  target_include_directories(onemkl
-    PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-  )
-  set_target_properties(onemkl PROPERTIES
-    SOVERSION ${PROJECT_VERSION_MAJOR}
-  )
-  # w/a for setting oneMKL Interfaces installed headers as -I instead of -isystem for cmake >= 3.25 for workable find_package(MKL) combination
-  if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.25.0")
-    set_target_properties(onemkl PROPERTIES EXPORT_NO_SYSTEM true)
-  endif()
-
-  # Build dispatcher library
-  set (ONEMKL_LIBS ${TARGET_DOMAINS})
-  list(TRANSFORM ONEMKL_LIBS PREPEND onemkl_)
-  target_link_libraries(onemkl PUBLIC ${ONEMKL_LIBS})
-
-  # Add the library to install package
-  foreach(domain_lib ${ONEMKL_LIBS})
-    install(TARGETS ${domain_lib} EXPORT oneMKLTargets)
-  endforeach()
-  install(TARGETS onemkl EXPORT oneMKLTargets
-    RUNTIME DESTINATION bin
-    ARCHIVE DESTINATION lib
-    LIBRARY DESTINATION lib
-  )
-endif()
diff --git a/src/blas/CMakeLists.txt b/src/blas/CMakeLists.txt
deleted file mode 100644
index 1edf2e445..000000000
--- a/src/blas/CMakeLists.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build backends
-add_subdirectory(backends)
-
-# Recipe for BLAS loader object
-if(BUILD_SHARED_LIBS)
-add_library(onemkl_blas OBJECT)
-target_sources(onemkl_blas PRIVATE blas_loader.cpp)
-target_include_directories(onemkl_blas
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-          $<TARGET_FILE_DIR:onemkl>
-)
-
-target_compile_options(onemkl_blas PRIVATE ${ONEMKL_BUILD_COPT})
-
-set_target_properties(onemkl_blas PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET onemkl_blas SOURCES blas_loader.cpp)
-else()
-  target_link_libraries(onemkl_blas PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
-endif()
diff --git a/src/blas/backends/CMakeLists.txt b/src/blas/backends/CMakeLists.txt
deleted file mode 100644
index 351f4b0e5..000000000
--- a/src/blas/backends/CMakeLists.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_custom_target(onemkl_backend_libs_blas)
-add_dependencies(onemkl_backend_libs onemkl_backend_libs_blas)
-
-if(ENABLE_MKLCPU_BACKEND)
-  add_subdirectory(mklcpu)
-endif()
-
-if(ENABLE_NETLIB_BACKEND)
-  add_subdirectory(netlib)
-endif()
-
-if(ENABLE_MKLGPU_BACKEND)
-  add_subdirectory(mklgpu)
-endif()
-
-if(ENABLE_CUBLAS_BACKEND AND UNIX)
-  add_subdirectory(cublas)
-endif()
-
-if(ENABLE_ROCBLAS_BACKEND AND UNIX)
-  add_subdirectory(rocblas)
-endif()
-
-if(ENABLE_PORTBLAS_BACKEND AND UNIX)
-  add_subdirectory(portblas)
-endif()
diff --git a/src/blas/backends/backend_wrappers.cxx b/src/blas/backends/backend_wrappers.cxx
deleted file mode 100644
index 62f6ced13..000000000
--- a/src/blas/backends/backend_wrappers.cxx
+++ /dev/null
@@ -1,511 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// clang-format off
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dot,
-oneapi::mkl::blas::BACKEND::MAJOR::dot,
-oneapi::mkl::blas::BACKEND::MAJOR::dot,
-oneapi::mkl::blas::BACKEND::MAJOR::dotc,
-oneapi::mkl::blas::BACKEND::MAJOR::dotc,
-oneapi::mkl::blas::BACKEND::MAJOR::dotu,
-oneapi::mkl::blas::BACKEND::MAJOR::dotu,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotm,
-oneapi::mkl::blas::BACKEND::MAJOR::rotm,
-oneapi::mkl::blas::BACKEND::MAJOR::rotmg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotmg,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::sdsdot,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::ger,
-oneapi::mkl::blas::BACKEND::MAJOR::ger,
-oneapi::mkl::blas::BACKEND::MAJOR::gerc,
-oneapi::mkl::blas::BACKEND::MAJOR::gerc,
-oneapi::mkl::blas::BACKEND::MAJOR::geru,
-oneapi::mkl::blas::BACKEND::MAJOR::geru,
-oneapi::mkl::blas::BACKEND::MAJOR::hbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hemv,
-oneapi::mkl::blas::BACKEND::MAJOR::hemv,
-oneapi::mkl::blas::BACKEND::MAJOR::her,
-oneapi::mkl::blas::BACKEND::MAJOR::her,
-oneapi::mkl::blas::BACKEND::MAJOR::her2,
-oneapi::mkl::blas::BACKEND::MAJOR::her2,
-oneapi::mkl::blas::BACKEND::MAJOR::hpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr2,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr2,
-oneapi::mkl::blas::BACKEND::MAJOR::sbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::sbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::spmv,
-oneapi::mkl::blas::BACKEND::MAJOR::spmv,
-oneapi::mkl::blas::BACKEND::MAJOR::spr,
-oneapi::mkl::blas::BACKEND::MAJOR::spr,
-oneapi::mkl::blas::BACKEND::MAJOR::spr2,
-oneapi::mkl::blas::BACKEND::MAJOR::spr2,
-oneapi::mkl::blas::BACKEND::MAJOR::symv,
-oneapi::mkl::blas::BACKEND::MAJOR::symv,
-oneapi::mkl::blas::BACKEND::MAJOR::syr,
-oneapi::mkl::blas::BACKEND::MAJOR::syr,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::hemm,
-oneapi::mkl::blas::BACKEND::MAJOR::hemm,
-oneapi::mkl::blas::BACKEND::MAJOR::herk,
-oneapi::mkl::blas::BACKEND::MAJOR::herk,
-oneapi::mkl::blas::BACKEND::MAJOR::her2k,
-oneapi::mkl::blas::BACKEND::MAJOR::her2k,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,    
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,    
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::asum,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::axpby,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::copy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dot,
-oneapi::mkl::blas::BACKEND::MAJOR::dot,
-oneapi::mkl::blas::BACKEND::MAJOR::dot,
-oneapi::mkl::blas::BACKEND::MAJOR::dotc,
-oneapi::mkl::blas::BACKEND::MAJOR::dotc,
-oneapi::mkl::blas::BACKEND::MAJOR::dotu,
-oneapi::mkl::blas::BACKEND::MAJOR::dotu,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamin,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::iamax,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::nrm2,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rot,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotm,
-oneapi::mkl::blas::BACKEND::MAJOR::rotm,
-oneapi::mkl::blas::BACKEND::MAJOR::rotmg,
-oneapi::mkl::blas::BACKEND::MAJOR::rotmg,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::scal,
-oneapi::mkl::blas::BACKEND::MAJOR::sdsdot,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::swap,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemv_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::dgmm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::ger,
-oneapi::mkl::blas::BACKEND::MAJOR::ger,
-oneapi::mkl::blas::BACKEND::MAJOR::gerc,
-oneapi::mkl::blas::BACKEND::MAJOR::gerc,
-oneapi::mkl::blas::BACKEND::MAJOR::geru,
-oneapi::mkl::blas::BACKEND::MAJOR::geru,
-oneapi::mkl::blas::BACKEND::MAJOR::hbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hemv,
-oneapi::mkl::blas::BACKEND::MAJOR::hemv,
-oneapi::mkl::blas::BACKEND::MAJOR::her,
-oneapi::mkl::blas::BACKEND::MAJOR::her,
-oneapi::mkl::blas::BACKEND::MAJOR::her2,
-oneapi::mkl::blas::BACKEND::MAJOR::her2,
-oneapi::mkl::blas::BACKEND::MAJOR::hpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr2,
-oneapi::mkl::blas::BACKEND::MAJOR::hpr2,
-oneapi::mkl::blas::BACKEND::MAJOR::sbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::sbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::spmv,
-oneapi::mkl::blas::BACKEND::MAJOR::spmv,
-oneapi::mkl::blas::BACKEND::MAJOR::spr,
-oneapi::mkl::blas::BACKEND::MAJOR::spr,
-oneapi::mkl::blas::BACKEND::MAJOR::spr2,
-oneapi::mkl::blas::BACKEND::MAJOR::spr2,
-oneapi::mkl::blas::BACKEND::MAJOR::symv,
-oneapi::mkl::blas::BACKEND::MAJOR::symv,
-oneapi::mkl::blas::BACKEND::MAJOR::syr,
-oneapi::mkl::blas::BACKEND::MAJOR::syr,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tbsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpmv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::tpsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trmv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::trsv,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm,
-oneapi::mkl::blas::BACKEND::MAJOR::hemm,
-oneapi::mkl::blas::BACKEND::MAJOR::hemm,
-oneapi::mkl::blas::BACKEND::MAJOR::herk,
-oneapi::mkl::blas::BACKEND::MAJOR::herk,
-oneapi::mkl::blas::BACKEND::MAJOR::her2k,
-oneapi::mkl::blas::BACKEND::MAJOR::her2k,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::symm,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syrk_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::syr2k,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trmm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::trsm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemmt,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::gemm_bias,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy2,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,
-oneapi::mkl::blas::BACKEND::MAJOR::omatadd,    
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::omatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-oneapi::mkl::blas::BACKEND::MAJOR::imatcopy_batch,
-    // clang-format on
diff --git a/src/blas/backends/cublas/CMakeLists.txt b/src/blas/backends/cublas/CMakeLists.txt
deleted file mode 100644
index b64e7c37d..000000000
--- a/src/blas/backends/cublas/CMakeLists.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-#==========================================================================
-#  Copyright (C) Codeplay Software Limited
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  For your convenience, a copy of the License has been included in this
-#  repository.
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-#=========================================================================
-
-set(LIB_NAME onemkl_blas_cublas)
-set(LIB_OBJ ${LIB_NAME}_obj)
-find_package(cuBLAS REQUIRED)
-set(SOURCES cublas_level1.cpp 
-                cublas_level2.cpp 
-                cublas_level3.cpp 
-                cublas_batch.cpp 
-                cublas_extensions.cpp 
-                $<$<STREQUAL:${ONEMKL_SYCL_IMPLEMENTATION},dpc++>:cublas_scope_handle.cpp >
-                $<$<STREQUAL:${ONEMKL_SYCL_IMPLEMENTATION},hipsycl>:cublas_scope_handle_hipsycl.cpp >
-                $<$<BOOL:${BUILD_SHARED_LIBS}>: cublas_wrappers.cpp>)
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_blas ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(NOT ${ONEMKL_SYCL_IMPLEMENTATION} STREQUAL "hipsycl")
-    target_compile_options(ONEMKL::SYCL::SYCL INTERFACE
-          -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda)
-    target_link_options(ONEMKL::SYCL::SYCL INTERFACE
-          -fsycl-targets=nvptx64-nvidia-cuda)
-endif()
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL ONEMKL::cuBLAS::cuBLAS)
-target_compile_features(${LIB_OBJ} PUBLIC cxx_std_11)
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON)
-
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp
deleted file mode 100644
index 009bb9541..000000000
--- a/src/blas/backends/cublas/cublas_batch.cpp
+++ /dev/null
@@ -1,1844 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_helper.hpp"
-#include "cublas_task.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-namespace column_major {
-
-// Buffer APIs
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stride_x, float beta, sycl::buffer<float, 1> &y, int64_t incy,
-                int64_t stride_y, int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x, double beta,
-                sycl::buffer<double, 1> &y, int64_t incy, int64_t stride_y, int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-                int64_t stride_x, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stride_y, int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-                int64_t stride_x, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stride_y,
-                int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stride_x, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                            int64_t n, int64_t k, Ts alpha, sycl::buffer<Ta, 1> &a, int64_t lda,
-                            int64_t stride_a, sycl::buffer<Tb, 1> &b, int64_t ldb, int64_t stride_b,
-                            Ts beta, sycl::buffer<Tc, 1> &c, int64_t ldc, int64_t stride_c,
-                            int64_t batch_size) {
-    using cuTypeA = typename CudaEquivalentType<Ta>::Type;
-    using cuTypeB = typename CudaEquivalentType<Tb>::Type;
-    using cuTypeC = typename CudaEquivalentType<Tc>::Type;
-    using cuTypeS = typename CudaEquivalentType<Ts>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, batch_size);
-
-    cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT;
-    queue.submit([&](sycl::handler &cgh) {
-        if (!verify_support<sycl::half, Ta, Tb, Tc, Ts>(queue, sycl::aspect::fp16)) {
-            throw oneapi::mkl::unimplemented(
-                "blas", "sycl::half", "half is not supported by the device or the sycl compiler");
-        }
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuTypeA *>(a_acc);
-            auto b_ = sc.get_mem<cuTypeB *>(b_acc);
-            auto c_ = sc.get_mem<cuTypeC *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(
-                "cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, err, handle,
-                get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, &alpha, a_,
-                get_cublas_datatype<cuTypeA>(), lda, stride_a, b_, get_cublas_datatype<cuTypeB>(),
-                ldb, stride_b, &beta, c_, get_cublas_datatype<cuTypeC>(), ldc, stride_c, batch_size,
-                get_cublas_datatype<cuTypeS>(), cublas_gemm_algo);
-        });
-    });
-}
-
-#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                               \
-    void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                    int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,             \
-                    int64_t stride_a, sycl::buffer<TYPE_B, 1> &b, int64_t ldb, int64_t stride_b,  \
-                    TYPE_S beta, sycl::buffer<TYPE_C, 1> &c, int64_t ldc, int64_t stride_c,       \
-                    int64_t batch_size) {                                                         \
-        gemm_batch_impl<TYPE_A, TYPE_B, TYPE_C, TYPE_S>(queue, transa, transb, m, n, k, alpha, a, \
-                                                        lda, stride_a, b, ldb, stride_b, beta, c, \
-                                                        ldc, stride_c, batch_size);               \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<float>, std::complex<float>, std::complex<float>,
-                            std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<double>, std::complex<double>, std::complex<double>,
-                            std::complex<double>)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-
-#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                               \
-    void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                    int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,             \
-                    int64_t stride_a, sycl::buffer<TYPE_B, 1> &b, int64_t ldb, int64_t stride_b,  \
-                    TYPE_S beta, sycl::buffer<TYPE_C, 1> &c, int64_t ldc, int64_t stride_c,       \
-                    int64_t batch_size) {                                                         \
-        throw unimplemented("blas", "gemm_batch",                                                 \
-                            std::string("for dtype unimplemented dtype combination <") +          \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +     \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");     \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<double, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, float beta,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                double beta, sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                    int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                    int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                    int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                   float beta, sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                   double beta, sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                   int64_t stride_a, std::complex<float> beta,
-                   sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                   int64_t lda, int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-// USM APIs
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<float> **x, int64_t *incx,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx,
-                       std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx,
-                       std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                       std::int64_t stridex, std::complex<float> *y, int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                       std::int64_t stridex, std::complex<double> *y, int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x,
-                       int64_t *incx, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                       int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                       int64_t stridex, double *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *x, int64_t incx, int64_t stridex,
-                       std::complex<double> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha,
-                       const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx,
-                       int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha,
-                       const double *a, int64_t lda, int64_t stride_a, const double *x,
-                       int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy,
-                       int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, const std::complex<float> *x, int64_t incx,
-                       int64_t stride_x, std::complex<float> beta, std::complex<float> *y,
-                       int64_t incy, int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, const std::complex<double> *x, int64_t incx,
-                       int64_t stride_x, std::complex<double> beta, std::complex<double> *y,
-                       int64_t incy, int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx,
-                       double *beta, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> *beta,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **a, int64_t *lda,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> *beta,
-                       std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a,
-                       int64_t lda, int64_t stride_a, const float *x, int64_t incx,
-                       int64_t stride_x, float *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a,
-                       int64_t lda, int64_t stride_a, const double *x, int64_t incx,
-                       int64_t stride_x, double *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       const std::complex<float> *x, int64_t incx, int64_t stride_x,
-                       std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       const std::complex<double> *x, int64_t incx, int64_t stride_x,
-                       std::complex<double> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx, double **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<float> **a, int64_t *lda, const std::complex<float> **x,
-                       int64_t *incx, std::complex<float> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<double> **a, int64_t *lda, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-}
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose transa,
-                                               transpose transb, int64_t m, int64_t n, int64_t k,
-                                               Ts alpha, const Ta *a, int64_t lda, int64_t stride_a,
-                                               const Tb *b, int64_t ldb, int64_t stride_b, Ts beta,
-                                               Tc *c, int64_t ldc, int64_t stride_c,
-                                               int64_t batch_size,
-                                               const std::vector<sycl::event> &dependencies) {
-    using cuTypeA = typename CudaEquivalentType<Ta>::Type;
-    using cuTypeB = typename CudaEquivalentType<Tb>::Type;
-    using cuTypeC = typename CudaEquivalentType<Tc>::Type;
-    using cuTypeS = typename CudaEquivalentType<Ts>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, batch_size);
-
-    cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT;
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        if (!verify_support<sycl::half, Ta, Tb, Tc, Ts>(queue, sycl::aspect::fp16)) {
-            throw oneapi::mkl::unimplemented(
-                "blas", "sycl::half", "half is not supported by the device or the sycl compiler");
-        }
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(
-                "cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, err, handle,
-                get_cublas_operation(transa), get_cublas_operation(transb), m, n, k, &alpha, a,
-                get_cublas_datatype<cuTypeA>(), lda, stride_a, b, get_cublas_datatype<cuTypeB>(),
-                ldb, stride_b, &beta, c, get_cublas_datatype<cuTypeC>(), ldc, stride_c, batch_size,
-                get_cublas_datatype<cuTypeS>(), cublas_gemm_algo);
-        });
-    });
-    return done;
-}
-
-#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                        \
-    sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,  \
-                           int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda,   \
-                           int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b,   \
-                           TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c,              \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) { \
-        return gemm_batch_strided_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda,      \
-                                           stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, \
-                                           batch_size, dependencies);                          \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                                std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                                std::complex<double>)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER_USM
-
-#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                        \
-    sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,  \
-                           int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda,   \
-                           int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b,   \
-                           TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c,              \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) { \
-        throw unimplemented("blas", "gemm_batch",                                              \
-                            std::string("for dtype unimplemented dtype combination <") +       \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +  \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");  \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, transpose *transb,
-                                       int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a,
-                                       int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c,
-                                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                                       const std::vector<sycl::event> &dependencies) {
-    using cuTypeA = typename CudaEquivalentType<Ta>::Type;
-    using cuTypeB = typename CudaEquivalentType<Tb>::Type;
-    using cuTypeC = typename CudaEquivalentType<Tc>::Type;
-    using cuTypeS = typename CudaEquivalentType<Ts>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(m[i], n[i], k[i], lda[i], ldb[i], ldc[i], group_size[i]);
-    }
-
-    cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT;
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        if (!verify_support<sycl::half, Ta, Tb, Tc, Ts>(queue, sycl::aspect::fp16)) {
-            throw oneapi::mkl::unimplemented(
-                "blas", "sycl::half", "half is not supported by the device or the sycl compiler");
-        }
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            cublasStatus_t err;
-            for (int64_t i = 0; i < group_count; i++) {
-                CUBLAS_ERROR_FUNC_T_SYNC(
-                    "cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle,
-                    get_cublas_operation(transa[i]), get_cublas_operation(transb[i]), (int)m[i],
-                    (int)n[i], (int)k[i], &alpha[i], (const void *const *)(a + offset),
-                    get_cublas_datatype<cuTypeA>(), (int)lda[i], (const void *const *)(b + offset),
-                    get_cublas_datatype<cuTypeB>(), (int)ldb[i], &beta[i],
-                    (void *const *)(c + offset), get_cublas_datatype<cuTypeC>(), (int)ldc[i],
-                    (int)group_size[i], get_cublas_datatype<cuTypeS>(), cublas_gemm_algo);
-                offset += group_size[i];
-            }
-        });
-    });
-    return done;
-}
-
-#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                                    \
-    sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,   \
-                           int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda,  \
-                           const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \
-                                   ldc, group_count, group_size, dependencies);                    \
-    }
-
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-GEMM_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                        std::complex<float>)
-GEMM_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                        std::complex<double>)
-
-#undef GEMM_BATCH_LAUNCHER_USM
-
-#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                                    \
-    sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,   \
-                           int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda,  \
-                           const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        throw unimplemented("blas", "gemm_batch",                                                  \
-                            std::string("for dtype unimplemented dtype combination <") +           \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +      \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");      \
-    }
-
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_BATCH_LAUNCHER_USM
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, float alpha, const float *a,
-                       int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, double alpha, const double *a,
-                       int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       std::complex<float> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       std::complex<double> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event trsm_batch(const char *func_name, Func func, sycl::queue &queue,
-                              side *left_right, uplo *upper_lower, transpose *trans,
-                              diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a,
-                              int64_t *lda, T **b, int64_t *ldb, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]);
-    }
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            cublasStatus_t err;
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<const cuDataType **>(a);
-                auto **b_ = reinterpret_cast<cuDataType **>(b);
-                CUBLAS_ERROR_FUNC_T_SYNC(
-                    func_name, func, err, handle, get_cublas_side_mode(left_right[i]),
-                    get_cublas_fill_mode(upper_lower[i]), get_cublas_operation(trans[i]),
-                    get_cublas_diag_type(unit_diag[i]), (int)m[i], (int)n[i],
-                    (cuDataType *)&alpha[i], a_ + offset, (int)lda[i], b_ + offset, (int)ldb[i],
-                    (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-    return done;
-}
-
-#define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                              \
-    sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,                \
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \
-                           const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb,                   \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans,  \
-                          unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size,         \
-                          dependencies);                                                           \
-    }
-
-TRSM_BATCH_LAUNCHER_USM(float, cublasStrsmBatched)
-TRSM_BATCH_LAUNCHER_USM(double, cublasDtrsmBatched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<float>, cublasCtrsmBatched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<double>, cublasZtrsmBatched)
-
-#undef TRSM_BATCH_LAUNCHER_USM
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta,
-                       float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta,
-                       double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-                       int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
-                       int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       float alpha, const float *a, int64_t lda, int64_t stride_a, float beta,
-                       float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       double alpha, const double *a, int64_t lda, int64_t stride_a, double beta,
-                       double *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, std::complex<float> beta, std::complex<float> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, std::complex<double> beta, std::complex<double> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                          float beta, const float *b, int64_t ldb, int64_t stride_b, float *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                          double beta, const double *b, int64_t ldb, int64_t stride_b, double *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                          int64_t lda, int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                          int64_t lda, int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                          int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                           std::complex<float> **b, int64_t *ldb, int64_t group_count,
-                           int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           int64_t *lda, std::complex<double> **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           float *alpha, float **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           double *alpha, double **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, std::complex<float> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, std::complex<double> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-} // namespace column_major
-namespace row_major {
-
-// Buffer APIs
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stride_x, float beta, sycl::buffer<float, 1> &y, int64_t incy,
-                int64_t stride_y, int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x, double beta,
-                sycl::buffer<double, 1> &y, int64_t incy, int64_t stride_y, int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-                int64_t stride_x, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stride_y, int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-                int64_t stride_x, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stride_y,
-                int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stride_x, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                               \
-    void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                    int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,             \
-                    int64_t stride_a, sycl::buffer<TYPE_B, 1> &b, int64_t ldb, int64_t stride_b,  \
-                    TYPE_S beta, sycl::buffer<TYPE_C, 1> &c, int64_t ldc, int64_t stride_c,       \
-                    int64_t batch_size) {                                                         \
-        throw unimplemented("blas", "gemm_batch", "for row_major layout");                        \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float)
-GEMM_STRIDED_BATCH_LAUNCHER(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<float>, std::complex<float>, std::complex<float>,
-                            std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<double>, std::complex<double>, std::complex<double>,
-                            std::complex<double>)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<double, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, float beta,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                double beta, sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                    int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                    int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                    int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                   float beta, sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                   double beta, sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                   int64_t stride_a, std::complex<float> beta,
-                   sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                   int64_t lda, int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-// USM APIs
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<float> **x, int64_t *incx,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx,
-                       std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx,
-                       std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                       std::int64_t stridex, std::complex<float> *y, int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                       std::int64_t stridex, std::complex<double> *y, int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x,
-                       int64_t *incx, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                       int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                       int64_t stridex, double *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *x, int64_t incx, int64_t stridex,
-                       std::complex<double> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha,
-                       const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx,
-                       int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha,
-                       const double *a, int64_t lda, int64_t stride_a, const double *x,
-                       int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy,
-                       int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, const std::complex<float> *x, int64_t incx,
-                       int64_t stride_x, std::complex<float> beta, std::complex<float> *y,
-                       int64_t incy, int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, const std::complex<double> *x, int64_t incx,
-                       int64_t stride_x, std::complex<double> beta, std::complex<double> *y,
-                       int64_t incy, int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx,
-                       double *beta, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> *beta,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **a, int64_t *lda,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> *beta,
-                       std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a,
-                       int64_t lda, int64_t stride_a, const float *x, int64_t incx,
-                       int64_t stride_x, float *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a,
-                       int64_t lda, int64_t stride_a, const double *x, int64_t incx,
-                       int64_t stride_x, double *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       const std::complex<float> *x, int64_t incx, int64_t stride_x,
-                       std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       const std::complex<double> *x, int64_t incx, int64_t stride_x,
-                       std::complex<double> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx, double **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<float> **a, int64_t *lda, const std::complex<float> **x,
-                       int64_t *incx, std::complex<float> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<double> **a, int64_t *lda, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-}
-
-#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                        \
-    sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,  \
-                           int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda,   \
-                           int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b,   \
-                           TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c,              \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) { \
-        throw unimplemented("blas", "gemm_batch", "for row_major layout");                     \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                                std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                                std::complex<double>)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER_USM
-
-#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                                    \
-    sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,   \
-                           int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda,  \
-                           const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        throw unimplemented("blas", "gemm_batch", "for row_major layout");                         \
-    }
-
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-GEMM_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                        std::complex<float>)
-GEMM_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                        std::complex<double>)
-
-#undef GEMM_BATCH_LAUNCHER_USM
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, float alpha, const float *a,
-                       int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, double alpha, const double *a,
-                       int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       std::complex<float> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       std::complex<double> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event trsm_batch(const char *func_name, Func func, sycl::queue &queue,
-                              side *left_right, uplo *upper_lower, transpose *trans,
-                              diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a,
-                              int64_t *lda, T **b, int64_t *ldb, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-}
-
-#define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                              \
-    sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,                \
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \
-                           const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb,                   \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans,  \
-                          unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size,         \
-                          dependencies);                                                           \
-    }
-
-TRSM_BATCH_LAUNCHER_USM(float, cublasStrsmBatched)
-TRSM_BATCH_LAUNCHER_USM(double, cublasDtrsmBatched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<float>, cublasCtrsmBatched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<double>, cublasZtrsmBatched)
-
-#undef TRSM_BATCH_LAUNCHER_USM
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta,
-                       float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta,
-                       double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-                       int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
-                       int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       float alpha, const float *a, int64_t lda, int64_t stride_a, float beta,
-                       float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       double alpha, const double *a, int64_t lda, int64_t stride_a, double beta,
-                       double *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, std::complex<float> beta, std::complex<float> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, std::complex<double> beta, std::complex<double> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                          float beta, const float *b, int64_t ldb, int64_t stride_b, float *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                          double beta, const double *b, int64_t ldb, int64_t stride_b, double *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                          int64_t lda, int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                          int64_t lda, int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                          int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                           std::complex<float> **b, int64_t *ldb, int64_t group_count,
-                           int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           int64_t *lda, std::complex<double> **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           float *alpha, float **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           double *alpha, double **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, std::complex<float> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, std::complex<double> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-} // namespace row_major
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/cublas/cublas_extensions.cpp b/src/blas/backends/cublas/cublas_extensions.cpp
deleted file mode 100644
index cc80b483d..000000000
--- a/src/blas/backends/cublas/cublas_extensions.cpp
+++ /dev/null
@@ -1,756 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_helper.hpp"
-#include "cublas_task.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-namespace column_major {
-
-// Buffer APIs
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-template <typename Func, typename T>
-void omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-              int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-              int64_t ldb) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? m : n);
-        const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? n : m);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
-                                     get_cublas_operation(trans), logical_m, logical_n,
-                                     (cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb);
-        });
-    });
-}
-
-#define OMATCOPY_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                   \
-    void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,          \
-                  sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) { \
-        omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb);     \
-    }
-
-OMATCOPY_LAUNCHER(float, cublasSgeam)
-OMATCOPY_LAUNCHER(double, cublasDgeam)
-OMATCOPY_LAUNCHER(std::complex<float>, cublasCgeam)
-OMATCOPY_LAUNCHER(std::complex<double>, cublasZgeam)
-
-#undef OMATCOPY_LAUNCHER
-
-template <typename Func, typename T>
-void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-               int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<T, 1> &b, int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                  \
-    void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,         \
-                   sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                        \
-                   sycl::buffer<TYPE, 1> &b, int64_t ldb, int64_t strideb) {                      \
-        omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, b, \
-                  ldb, strideb);                                                                  \
-    }
-
-OMATCOPY2_LAUNCHER(float, "unimplemented")
-OMATCOPY2_LAUNCHER(double, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<double>, "unimplemented")
-#undef OMATCOPY2_LAUNCHER
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-template <typename Func, typename T>
-void omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-             transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-             T beta, sycl::buffer<T, 1> &b, int64_t ldb, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
-                                     get_cublas_operation(transb), m, n, (cuDataType *)&alpha, a_,
-                                     lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
-        });
-    });
-}
-
-#define OMATADD_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,     \
-                 TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta,                     \
-                 sycl::buffer<TYPE, 1> &b, int64_t ldb, sycl::buffer<TYPE, 1> &c, int64_t ldc) {   \
-        omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, \
-                b, ldb, c, ldc);                                                                   \
-    }
-
-OMATADD_LAUNCHER(float, cublasSgeam)
-OMATADD_LAUNCHER(double, cublasDgeam)
-OMATADD_LAUNCHER(std::complex<float>, cublasCgeam)
-OMATADD_LAUNCHER(std::complex<double>, cublasZgeam)
-
-#undef OMATADD_LAUNCHER
-
-// USM APIs
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
-                  int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                  int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-template <typename Func, typename T>
-sycl::event omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                     int64_t m, int64_t n, T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? m : n);
-        const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? n : m);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
-                                     get_cublas_operation(trans), logical_m, logical_n,
-                                     (cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb);
-        });
-    });
-    return done;
-}
-
-#define OMATCOPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
-    sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
-                         const TYPE *a, int64_t lda, TYPE *b, int64_t ldb,                      \
-                         const std::vector<sycl::event> &dependencies) {                        \
-        return omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b,  \
-                        ldb, dependencies);                                                     \
-    }
-
-OMATCOPY_LAUNCHER_USM(float, cublasSgeam)
-OMATCOPY_LAUNCHER_USM(double, cublasDgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<float>, cublasCgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<double>, cublasZgeam)
-
-#undef OMATCOPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                      int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b,
-                      int64_t ldb, int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
-    sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
-                          const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb,     \
-                          int64_t strideb, const std::vector<sycl::event> &dependencies) {       \
-        return omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \
-                         lda, b, ldb, strideb, dependencies);                                    \
-    }
-
-OMATCOPY2_LAUNCHER_USM(float, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(double, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<double>, "unimplemented")
-#undef OMATCOPY2_LAUNCHER_USM
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-                           transpose transb, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
-                           T beta, const T *b, int64_t ldb, T *c, int64_t ldc,
-                           const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<const cuDataType *>(b);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
-                                     get_cublas_operation(transb), m, n, (cuDataType *)&alpha, a_,
-                                     lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define OMATADD_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
-    sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m,     \
-                        int64_t n, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta,          \
-                        const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc,                      \
-                        const std::vector<sycl::event> &dependencies) {                        \
-        return omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, \
-                       lda, beta, b, ldb, c, ldc, dependencies);                               \
-    }
-
-OMATADD_LAUNCHER_USM(float, cublasSgeam)
-OMATADD_LAUNCHER_USM(double, cublasDgeam)
-OMATADD_LAUNCHER_USM(std::complex<float>, cublasCgeam)
-OMATADD_LAUNCHER_USM(std::complex<double>, cublasZgeam)
-
-#undef OMATADD_LAUNCHER_USM
-
-} // namespace column_major
-
-namespace row_major {
-
-// Buffer APIs
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-template <typename Func, typename T>
-void omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-              int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-              int64_t ldb) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m);
-        const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
-                                     get_cublas_operation(trans), logical_m, logical_n,
-                                     (cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb);
-        });
-    });
-}
-
-#define OMATCOPY_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                   \
-    void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,          \
-                  sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) { \
-        omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb);     \
-    }
-
-OMATCOPY_LAUNCHER(float, cublasSgeam)
-OMATCOPY_LAUNCHER(double, cublasDgeam)
-OMATCOPY_LAUNCHER(std::complex<float>, cublasCgeam)
-OMATCOPY_LAUNCHER(std::complex<double>, cublasZgeam)
-
-#undef OMATCOPY_LAUNCHER
-
-template <typename Func, typename T>
-void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-               int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<T, 1> &b, int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                  \
-    void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,         \
-                   sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                        \
-                   sycl::buffer<TYPE, 1> &b, int64_t ldb, int64_t strideb) {                      \
-        omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, b, \
-                  ldb, strideb);                                                                  \
-    }
-
-OMATCOPY2_LAUNCHER(float, "unimplemented")
-OMATCOPY2_LAUNCHER(double, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<double>, "unimplemented")
-#undef OMATCOPY2_LAUNCHER
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-template <typename Func, typename T>
-void omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-             transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-             T beta, sycl::buffer<T, 1> &b, int64_t ldb, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
-                                     get_cublas_operation(transb), n, m, (cuDataType *)&alpha, a_,
-                                     lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
-        });
-    });
-}
-
-#define OMATADD_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,     \
-                 TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta,                     \
-                 sycl::buffer<TYPE, 1> &b, int64_t ldb, sycl::buffer<TYPE, 1> &c, int64_t ldc) {   \
-        omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, \
-                b, ldb, c, ldc);                                                                   \
-    }
-
-OMATADD_LAUNCHER(float, cublasSgeam)
-OMATADD_LAUNCHER(double, cublasDgeam)
-OMATADD_LAUNCHER(std::complex<float>, cublasCgeam)
-OMATADD_LAUNCHER(std::complex<double>, cublasZgeam)
-
-#undef OMATADD_LAUNCHER
-
-// USM APIs
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
-                  int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                  int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-template <typename Func, typename T>
-sycl::event omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                     int64_t m, int64_t n, T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m);
-        const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans),
-                                     get_cublas_operation(trans), logical_m, logical_n,
-                                     (cuDataType *)&alpha, a_, lda, nullptr, nullptr, ldb, b_, ldb);
-        });
-    });
-    return done;
-}
-
-#define OMATCOPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
-    sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
-                         const TYPE *a, int64_t lda, TYPE *b, int64_t ldb,                      \
-                         const std::vector<sycl::event> &dependencies) {                        \
-        return omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b,  \
-                        ldb, dependencies);                                                     \
-    }
-
-OMATCOPY_LAUNCHER_USM(float, cublasSgeam)
-OMATCOPY_LAUNCHER_USM(double, cublasDgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<float>, cublasCgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<double>, cublasZgeam)
-
-#undef OMATCOPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                      int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b,
-                      int64_t ldb, int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
-    sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
-                          const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb,     \
-                          int64_t strideb, const std::vector<sycl::event> &dependencies) {       \
-        return omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \
-                         lda, b, ldb, strideb, dependencies);                                    \
-    }
-
-OMATCOPY2_LAUNCHER_USM(float, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(double, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<double>, "unimplemented")
-#undef OMATCOPY2_LAUNCHER_USM
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-                           transpose transb, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
-                           T beta, const T *b, int64_t ldb, T *c, int64_t ldc,
-                           const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<const cuDataType *>(b);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
-                                     get_cublas_operation(transb), n, m, (cuDataType *)&alpha, a_,
-                                     lda, (cuDataType *)&beta, b_, ldb, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define OMATADD_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                             \
-    sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m,     \
-                        int64_t n, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta,          \
-                        const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc,                      \
-                        const std::vector<sycl::event> &dependencies) {                        \
-        return omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, \
-                       lda, beta, b, ldb, c, ldc, dependencies);                               \
-    }
-
-OMATADD_LAUNCHER_USM(float, cublasSgeam)
-OMATADD_LAUNCHER_USM(double, cublasDgeam)
-OMATADD_LAUNCHER_USM(std::complex<float>, cublasCgeam)
-OMATADD_LAUNCHER_USM(std::complex<double>, cublasZgeam)
-
-#undef OMATADD_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/cublas/cublas_handle.hpp b/src/blas/backends/cublas/cublas_handle.hpp
deleted file mode 100644
index db9df5584..000000000
--- a/src/blas/backends/cublas/cublas_handle.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef CUBLAS_HANDLE_HPP
-#define CUBLAS_HANDLE_HPP
-#include <atomic>
-#include <unordered_map>
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-
-template <typename T>
-struct cublas_handle {
-    using handle_container_t = std::unordered_map<T, std::atomic<cublasHandle_t> *>;
-    handle_container_t cublas_handle_mapper_{};
-    ~cublas_handle() noexcept(false) {
-        for (auto &handle_pair : cublas_handle_mapper_) {
-            cublasStatus_t err;
-            if (handle_pair.second != nullptr) {
-                auto handle = handle_pair.second->exchange(nullptr);
-                if (handle != nullptr) {
-                    CUBLAS_ERROR_FUNC(cublasDestroy, err, handle);
-                    handle = nullptr;
-                }
-                else {
-                    // if the handle is nullptr it means the handle was already
-                    // destroyed by the ContextCallback and we're free to delete the
-                    // atomic object.
-                    delete handle_pair.second;
-                }
-
-                handle_pair.second = nullptr;
-            }
-        }
-        cublas_handle_mapper_.clear();
-    }
-};
-
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-
-#endif // CUBLAS_HANDLE_HPP
diff --git a/src/blas/backends/cublas/cublas_helper.hpp b/src/blas/backends/cublas/cublas_helper.hpp
deleted file mode 100644
index 0fe7e7c5a..000000000
--- a/src/blas/backends/cublas/cublas_helper.hpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-/**
- * @file cublas_*.cpp : contain the implementation of all the routines
- * for CUDA backend
- */
-#ifndef _CUBLAS_HELPER_HPP_
-#define _CUBLAS_HELPER_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <complex>
-
-#include "oneapi/mkl/types.hpp"
-#include "runtime_support_helper.hpp"
-#include "dtype_string.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-
-// The static assert to make sure that all index types used in
-// src/oneMKL/backend/cublas/blas.hpp interface are int64_t
-template <typename... Next>
-struct is_int64 : std::false_type {};
-
-template <typename First>
-struct is_int64<First> : std::is_same<std::int64_t, First> {};
-
-template <typename First, typename... Next>
-struct is_int64<First, Next...>
-        : std::integral_constant<bool, std::is_same<std::int64_t, First>::value &&
-                                           is_int64<Next...>::value> {};
-
-template <typename... T>
-struct Overflow {
-    static void inline check(T...) {}
-};
-
-template <typename Index, typename... T>
-struct Overflow<Index, T...> {
-    static void inline check(Index index, T... next) {
-        if (std::abs(index) >= (1LL << 31)) {
-            throw std::runtime_error(
-                "Cublas index overflow. cublas does not support 64 bit integer as "
-                "data size. Thus, the data size should not be greater that maximum "
-                "supported size by 32 bit integer.");
-        }
-        Overflow<T...>::check(next...);
-    }
-};
-
-template <typename Index, typename... Next>
-void overflow_check(Index index, Next... indices) {
-    static_assert(is_int64<Index, Next...>::value, "oneMKL index type must be 64 bit integer.");
-    Overflow<Index, Next...>::check(index, indices...);
-}
-
-class cublas_error : virtual public std::runtime_error {
-protected:
-    inline const char *cublas_error_map(cublasStatus_t error) {
-        switch (error) {
-            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-
-            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-
-            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-
-            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-
-            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-
-            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
-
-            case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
-
-            default: return "<unknown>";
-        }
-    }
-
-    int error_number; ///< Error number
-public:
-    /** Constructor (C++ STL string, cublasStatus_t).
-   *  @param msg The error message
-   *  @param err_num error number
-   */
-    explicit cublas_error(std::string message, cublasStatus_t result)
-            : std::runtime_error((message + std::string(cublas_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~cublas_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-class cuda_error : virtual public std::runtime_error {
-protected:
-    inline const char *cuda_error_map(CUresult result) {
-        switch (result) {
-            case CUDA_SUCCESS: return "CUDA_SUCCESS";
-            case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED";
-            case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
-            case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
-            case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
-            case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
-            case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-            default: return "<unknown>";
-        }
-    }
-    int error_number; ///< error number
-public:
-    /** Constructor (C++ STL string, CUresult).
-   *  @param msg The error message
-   *  @param err_num Error number
-   */
-    explicit cuda_error(std::string message, CUresult result)
-            : std::runtime_error((message + std::string(cuda_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~cuda_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-#define CUDA_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                            \
-    if (err != CUDA_SUCCESS) {                                          \
-        throw cuda_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define CUBLAS_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                              \
-    if (err != CUBLAS_STATUS_SUCCESS) {                                   \
-        throw cublas_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define CUBLAS_ERROR_FUNC_SYNC(name, err, handle, ...)                    \
-    err = name(handle, __VA_ARGS__);                                      \
-    if (err != CUBLAS_STATUS_SUCCESS) {                                   \
-        throw cublas_error(std::string(#name) + std::string(" : "), err); \
-    }                                                                     \
-    cudaStream_t currentStreamId;                                         \
-    CUBLAS_ERROR_FUNC(cublasGetStream, err, handle, &currentStreamId);    \
-    cuStreamSynchronize(currentStreamId);
-
-#define CUBLAS_ERROR_FUNC_T_SYNC(name, func, err, handle, ...)           \
-    err = func(handle, __VA_ARGS__);                                     \
-    if (err != CUBLAS_STATUS_SUCCESS) {                                  \
-        throw cublas_error(std::string(name) + std::string(" : "), err); \
-    }                                                                    \
-    cudaStream_t currentStreamId;                                        \
-    CUBLAS_ERROR_FUNC(cublasGetStream, err, handle, &currentStreamId);   \
-    cuStreamSynchronize(currentStreamId);
-
-inline cublasOperation_t get_cublas_operation(oneapi::mkl::transpose trn) {
-    switch (trn) {
-        case oneapi::mkl::transpose::nontrans: return CUBLAS_OP_N;
-        case oneapi::mkl::transpose::trans: return CUBLAS_OP_T;
-        case oneapi::mkl::transpose::conjtrans: return CUBLAS_OP_C;
-        default: throw "Wrong transpose Operation.";
-    }
-}
-
-inline cublasFillMode_t get_cublas_fill_mode(oneapi::mkl::uplo ul) {
-    switch (ul) {
-        case oneapi::mkl::uplo::upper: return CUBLAS_FILL_MODE_UPPER;
-        case oneapi::mkl::uplo::lower: return CUBLAS_FILL_MODE_LOWER;
-        default: throw "Wrong fill mode.";
-    }
-}
-
-inline cublasDiagType_t get_cublas_diag_type(oneapi::mkl::diag un) {
-    switch (un) {
-        case oneapi::mkl::diag::unit: return CUBLAS_DIAG_UNIT;
-        case oneapi::mkl::diag::nonunit: return CUBLAS_DIAG_NON_UNIT;
-        default: throw "Wrong diag type.";
-    }
-}
-
-inline cublasSideMode_t get_cublas_side_mode(oneapi::mkl::side lr) {
-    switch (lr) {
-        case oneapi::mkl::side::left: return CUBLAS_SIDE_LEFT;
-        case oneapi::mkl::side::right: return CUBLAS_SIDE_RIGHT;
-        default: throw "Wrong side mode.";
-    }
-}
-
-template <typename T>
-inline cudaDataType_t get_cublas_datatype() {
-    static_assert(false);
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<__half>() {
-    return CUDA_R_16F;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<float>() {
-    return CUDA_R_32F;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<double>() {
-    return CUDA_R_64F;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<cuComplex>() {
-    return CUDA_C_32F;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<cuDoubleComplex>() {
-    return CUDA_C_64F;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<std::int8_t>() {
-    return CUDA_R_8I;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<std::uint8_t>() {
-    return CUDA_R_8U;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<std::int32_t>() {
-    return CUDA_R_32I;
-}
-
-template <>
-inline cudaDataType_t get_cublas_datatype<std::uint32_t>() {
-    return CUDA_R_32U;
-}
-
-/*converting std::complex<T> to cu<T>Complex*/
-/*converting sycl::half to __half*/
-template <typename T>
-struct CudaEquivalentType {
-    using Type = T;
-};
-template <>
-struct CudaEquivalentType<sycl::half> {
-    using Type = __half;
-};
-template <>
-struct CudaEquivalentType<std::complex<float>> {
-    using Type = cuComplex;
-};
-template <>
-struct CudaEquivalentType<std::complex<double>> {
-    using Type = cuDoubleComplex;
-};
-
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif // _CUBLAS_HELPER_HPP_
diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp
deleted file mode 100644
index 5f7087727..000000000
--- a/src/blas/backends/cublas/cublas_level1.cpp
+++ /dev/null
@@ -1,1853 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_helper.hpp"
-#include "cublas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-namespace column_major {
-
-// Buffer APIs
-
-// Level 1
-template <typename Func, typename T1, typename T2>
-inline void asum(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T1, 1> &x, const int64_t incx, sycl::buffer<T2, 1> &result) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<cuDataType1 *>(x_acc);
-            auto res_ = sc.get_mem<cuDataType2 *>(res_acc);
-            cublasStatus_t err;
-            // ASUM does not support negative index
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-}
-
-#define ASUM_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                         \
-    void asum(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                   \
-    }
-ASUM_LAUNCHER(float, float, cublasSasum)
-ASUM_LAUNCHER(double, double, cublasDasum)
-ASUM_LAUNCHER(std::complex<float>, float, cublasScasum)
-ASUM_LAUNCHER(std::complex<double>, double, cublasDzasum)
-#undef ASUM_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a,
-                 sycl::buffer<T2, 1> &x, int64_t incx) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = sc.get_mem<cuDataType2 *>(x_acc);
-            cublasStatus_t err;
-            // SCAL does not support negative incx
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType1 *)&a, x_,
-                                     std::abs(incx));
-        });
-    });
-}
-
-#define SCAL_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                              \
-    void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer<TYPE2, 1> &x, int64_t incx) { \
-        scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx);                             \
-    }
-SCAL_LAUNCHER(float, float, cublasSscal)
-SCAL_LAUNCHER(double, double, cublasDscal)
-SCAL_LAUNCHER(std::complex<float>, std::complex<float>, cublasCscal)
-SCAL_LAUNCHER(std::complex<double>, std::complex<double>, cublasZscal)
-SCAL_LAUNCHER(float, std::complex<float>, cublasCsscal)
-SCAL_LAUNCHER(double, std::complex<double>, cublasZdscal)
-#undef SCAL_LAUNCHER
-
-template <typename Func, typename T>
-inline void axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy);
-        });
-    });
-}
-
-#define AXPY_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy);                \
-    }
-
-AXPY_LAUNCHER(float, cublasSaxpy)
-AXPY_LAUNCHER(double, cublasDaxpy)
-AXPY_LAUNCHER(std::complex<float>, cublasCaxpy)
-AXPY_LAUNCHER(std::complex<double>, cublasZaxpy)
-#undef AXPY_LAUNCHER
-
-void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-           float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-           double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline void rotg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer<T1, 1> &a,
-                 sycl::buffer<T1, 1> &b, sycl::buffer<T2, 1> &c, sycl::buffer<T1, 1> &s) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        auto s_acc = s.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto a_ = sc.get_mem<cuDataType1 *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType1 *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType2 *>(c_acc);
-            auto s_ = sc.get_mem<cuDataType1 *>(s_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, a_, b_, c_, s_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-}
-
-#define ROTG_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                     \
-    void rotg(sycl::queue &queue, sycl::buffer<TYPE1, 1> &a, sycl::buffer<TYPE1, 1> &b, \
-              sycl::buffer<TYPE2, 1> &c, sycl::buffer<TYPE1, 1> &s) {                   \
-        rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s);                       \
-    }
-
-ROTG_LAUNCHER(float, float, cublasSrotg)
-ROTG_LAUNCHER(double, double, cublasDrotg)
-ROTG_LAUNCHER(std::complex<float>, float, cublasCrotg)
-ROTG_LAUNCHER(std::complex<double>, double, cublasZrotg)
-#undef ROTG_LAUNCHER
-
-template <typename Func, typename T>
-inline void rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &param) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        auto param_acc = param.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            auto param_ = sc.get_mem<cuDataType *>(param_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, param_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-}
-
-#define ROTM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                           \
-    void rotm(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx,  \
-              sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &param) { \
-        rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param);     \
-    }
-
-ROTM_LAUNCHER(float, cublasSrotm)
-ROTM_LAUNCHER(double, cublasDrotm)
-#undef ROTM_LAUNCHER
-
-template <typename Func, typename T>
-inline void copy(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-}
-
-#define COPY_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                          \
-    void copy(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy);           \
-    }
-
-COPY_LAUNCHER(float, cublasScopy)
-COPY_LAUNCHER(double, cublasDcopy)
-COPY_LAUNCHER(std::complex<float>, cublasCcopy)
-COPY_LAUNCHER(std::complex<double>, cublasZcopy)
-#undef COPY_LAUNCHER
-
-template <typename Func, typename T>
-inline void dot(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                sycl::buffer<T, 1> &x, const int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                sycl::buffer<T, 1> &result) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            auto res_ = sc.get_mem<cuDataType *>(res_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, res_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-}
-
-#define DOT_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE)                                                  \
-    void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx,   \
-                  sycl::buffer<TYPE, 1> &y, const int64_t incy, sycl::buffer<TYPE, 1> &result) { \
-        dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result);                \
-    }
-DOT_LAUNCHER(, float, cublasSdot)
-DOT_LAUNCHER(, double, cublasDdot)
-DOT_LAUNCHER(c, std::complex<float>, cublasCdotc)
-DOT_LAUNCHER(c, std::complex<double>, cublasZdotc)
-DOT_LAUNCHER(u, std::complex<float>, cublasCdotu)
-DOT_LAUNCHER(u, std::complex<double>, cublasZdotu)
-#undef DOT_LAUNCHER
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline void rot(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                sycl::buffer<T1, 1> &x, const int64_t incx, sycl::buffer<T1, 1> &y, int64_t incy,
-                T2 c, T3 s) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    using cuDataType3 = typename CudaEquivalentType<T3>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            // cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<cuDataType1 *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType1 *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy,
-                                     (cuDataType2 *)&c, (cuDataType3 *)&s);
-        });
-    });
-}
-
-#define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE)                                  \
-    void rot(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-             sycl::buffer<TYPE1, 1> &y, int64_t incy, TYPE2 c, TYPE3 s) {                  \
-        rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s);            \
-    }
-
-ROT_LAUNCHER(float, float, float, cublasSrot)
-ROT_LAUNCHER(double, double, double, cublasDrot)
-ROT_LAUNCHER(std::complex<float>, float, float, cublasCsrot)
-ROT_LAUNCHER(std::complex<double>, double, double, cublasZdrot)
-#undef ROT_LAUNCHER
-
-void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer<float, 1> &x, int64_t incx,
-            sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result) {
-    overflow_check(n, incx, incy);
-    // cuBLAS does not support sdot so we need to mimic sdot.
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.get_access<sycl::access::mode::write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<float *>(x_acc);
-            auto y_ = sc.get_mem<float *>(y_acc);
-            auto res_ = sc.get_mem<float *>(res_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_SYNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-    // Since SB is a host pointer we need to bring the result back to the host and
-    // add sb to it.
-    result.get_host_access(sycl::read_write)[0] += sb;
-}
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    throw unimplemented("blas", "dot", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline void rotmg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer<T, 1> &d1,
-                  sycl::buffer<T, 1> &d2, sycl::buffer<T, 1> &x1, T y1, sycl::buffer<T, 1> &param) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    sycl::buffer<T, 1> y1_buff(&y1, sycl::range<1>(1));
-    queue.submit([&](sycl::handler &cgh) {
-        auto d1_acc = d1.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d2_acc = d2.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x1_acc = x1.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y1_acc = y1_buff.template get_access<sycl::access::mode::read>(cgh);
-        auto param_acc = param.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto d1_ = sc.get_mem<cuDataType *>(d1_acc);
-            auto d2_ = sc.get_mem<cuDataType *>(d2_acc);
-            auto x1_ = sc.get_mem<cuDataType *>(x1_acc);
-            auto y1_ = sc.get_mem<cuDataType *>(y1_acc);
-            auto param_ = sc.get_mem<cuDataType *>(param_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_, param_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-}
-
-#define ROTMG_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                             \
-    void rotmg(sycl::queue &queue, sycl::buffer<TYPE, 1> &d1, sycl::buffer<TYPE, 1> &d2, \
-               sycl::buffer<TYPE, 1> &x1, TYPE y1, sycl::buffer<TYPE, 1> &param) {       \
-        rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param);            \
-    }
-
-ROTMG_LAUNCHER(float, cublasSrotmg)
-ROTMG_LAUNCHER(double, cublasDrotmg)
-#undef ROTMG_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                  sycl::buffer<T, 1> &x, const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    // cuBLAS does not support int64_t as return type for the data. So we need to
-    // mimic iamax. We are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size. Alternatively either we need to write a sycl kernel
-    // to elementwise copy the data between two buffer, or allow reinterpret cast
-    // to convert to different type with different typesize size.
-    sycl::buffer<int, 1> int_res_buff{ sycl::range<1>(1) };
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto int_res_ = sc.get_mem<int *>(int_res_acc);
-            cublasStatus_t err;
-            // For negative incx, iamax returns 0. This behaviour is similar to that of
-            // reference netlib BLAS.
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::read>(cgh);
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task(
-            [=]() { result_acc[0] = std::max((int64_t)int_res_acc[0] - 1, (int64_t)0); });
-    });
-}
-
-#define IAMAX_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                \
-    void iamax(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                  \
-    }
-IAMAX_LAUNCHER(float, cublasIsamax)
-IAMAX_LAUNCHER(double, cublasIdamax)
-IAMAX_LAUNCHER(std::complex<float>, cublasIcamax)
-IAMAX_LAUNCHER(std::complex<double>, cublasIzamax)
-#undef IAMAX_LAUNCHER
-
-template <typename Func, typename T>
-inline void swap(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-}
-
-#define SWAP_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                          \
-    void swap(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy);           \
-    }
-
-SWAP_LAUNCHER(float, cublasSswap)
-SWAP_LAUNCHER(double, cublasDswap)
-SWAP_LAUNCHER(std::complex<float>, cublasCswap)
-SWAP_LAUNCHER(std::complex<double>, cublasZswap)
-#undef SWAP_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                  sycl::buffer<T, 1> &x, const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    // cuBLAS does not support int64_t as return type for the data. So we need to
-    // mimic iamin we are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size. Alternatively, either we need to write a sycl kernel
-    // to elementwise copy the data between two buffer, or allow reinterpret cast
-    // to convert to different type with different typesize size.
-    sycl::buffer<int, 1> int_res_buff{ sycl::range<1>(1) };
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto int_res_ = sc.get_mem<int *>(int_res_acc);
-            cublasStatus_t err;
-            // For negative incx, iamin returns 0. This behaviour is similar to that of
-            // implemented as a reference IAMIN.
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::read>(cgh);
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task(
-            [=]() { result_acc[0] = std::max((int64_t)int_res_acc[0] - 1, (int64_t)0); });
-    });
-}
-
-#define IAMIN_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                \
-    void iamin(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                  \
-    }
-IAMIN_LAUNCHER(float, cublasIsamin)
-IAMIN_LAUNCHER(double, cublasIdamin)
-IAMIN_LAUNCHER(std::complex<float>, cublasIcamin)
-IAMIN_LAUNCHER(std::complex<double>, cublasIzamin)
-#undef IAMIN_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T1, 1> &x, const int64_t incx, sycl::buffer<T2, 1> &result) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST
-            // when the data is on buffer, it must be set to
-            // CUBLAS_POINTER_MODE_DEVICE mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            auto x_ = sc.get_mem<cuDataType1 *>(x_acc);
-            auto res_ = sc.get_mem<cuDataType2 *>(res_acc);
-            cublasStatus_t err;
-            // NRM2 does not support negative index
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
-            // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid CUDA_ERROR_ILLEGAL_ADRESS errors
-            cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-        });
-    });
-}
-
-#define NRM2_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                         \
-    void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                   \
-    }
-NRM2_LAUNCHER(float, float, cublasSnrm2)
-NRM2_LAUNCHER(double, double, cublasDnrm2)
-NRM2_LAUNCHER(std::complex<float>, float, cublasScnrm2)
-NRM2_LAUNCHER(std::complex<double>, double, cublasDznrm2)
-#undef NRM2_LAUNCHER
-
-// USM APIs
-
-// Level 1
-template <typename Func, typename T1, typename T2>
-inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                        const T1 *x, const int64_t incx, T2 *result,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    bool result_on_device =
-        sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const cuDataType1 *>(x);
-            auto res_ = reinterpret_cast<cuDataType2 *>(result);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            }
-            cublasStatus_t err;
-            // ASUM does not support negative index
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-        });
-    });
-    return done;
-}
-
-#define ASUM_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                        \
-    sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx,        \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {            \
-        return asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-ASUM_LAUNCHER_USM(float, float, cublasSasum)
-ASUM_LAUNCHER_USM(double, double, cublasDasum)
-ASUM_LAUNCHER_USM(std::complex<float>, float, cublasScasum)
-ASUM_LAUNCHER_USM(std::complex<double>, double, cublasDzasum)
-#undef ASUM_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a,
-                        T2 *x, int64_t incx, const std::vector<sycl::event> &dependencies) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<cuDataType2 *>(x);
-            cublasStatus_t err;
-            // SCAL does not support negative incx
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType1 *)&a, x_,
-                                     std::abs(incx));
-        });
-    });
-    return done;
-}
-
-#define SCAL_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                   \
-    sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx,      \
-                     const std::vector<sycl::event> &dependencies) {                      \
-        return scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx, dependencies); \
-    }
-SCAL_LAUNCHER_USM(float, float, cublasSscal)
-SCAL_LAUNCHER_USM(double, double, cublasDscal)
-SCAL_LAUNCHER_USM(std::complex<float>, std::complex<float>, cublasCscal)
-SCAL_LAUNCHER_USM(std::complex<double>, std::complex<double>, cublasZscal)
-SCAL_LAUNCHER_USM(float, std::complex<float>, cublasCsscal)
-SCAL_LAUNCHER_USM(double, std::complex<double>, cublasZdscal)
-#undef SCAL_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha,
-                        const T *x, int64_t incx, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define AXPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                              \
-    sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \
-                     TYPE *y, int64_t incy, const std::vector<sycl::event> &dependencies) {  \
-        return axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy,      \
-                    dependencies);                                                           \
-    }
-
-AXPY_LAUNCHER_USM(float, cublasSaxpy)
-AXPY_LAUNCHER_USM(double, cublasDaxpy)
-AXPY_LAUNCHER_USM(std::complex<float>, cublasCaxpy)
-AXPY_LAUNCHER_USM(std::complex<double>, cublasZaxpy)
-#undef AXPY_LAUNCHER_USM
-
-sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                  float beta, float *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                  double beta, double *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                  std::complex<float> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                  const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                  std::complex<double> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c,
-                        T1 *s, const std::vector<sycl::event> &dependencies) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    auto ctx = queue.get_context();
-    bool results_on_device = (sycl::get_pointer_type(a, ctx) == sycl::usm::alloc::device ||
-                              sycl::get_pointer_type(b, ctx) == sycl::usm::alloc::device ||
-                              sycl::get_pointer_type(c, ctx) == sycl::usm::alloc::device ||
-                              sycl::get_pointer_type(s, ctx) == sycl::usm::alloc::device);
-    if (results_on_device) {
-        if (sycl::get_pointer_type(a, ctx) == sycl::usm::alloc::unknown ||
-            sycl::get_pointer_type(b, ctx) == sycl::usm::alloc::unknown ||
-            sycl::get_pointer_type(c, ctx) == sycl::usm::alloc::unknown ||
-            sycl::get_pointer_type(s, ctx) == sycl::usm::alloc::unknown) {
-            throw oneapi::mkl::exception(
-                "blas", "rotg",
-                "If any pointer is only device accessible, all must be device accessible");
-        }
-    }
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType1 *>(a);
-            auto b_ = reinterpret_cast<cuDataType1 *>(b);
-            auto c_ = reinterpret_cast<cuDataType2 *>(c);
-            auto s_ = reinterpret_cast<cuDataType1 *>(s);
-            if (results_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            }
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, a_, b_, c_, s_);
-            if (results_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-        });
-    });
-    return done;
-}
-
-#define ROTG_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                \
-    sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s,       \
-                     const std::vector<sycl::event> &dependencies) {                   \
-        return rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s, dependencies); \
-    }
-
-ROTG_LAUNCHER_USM(float, float, cublasSrotg)
-ROTG_LAUNCHER_USM(double, double, cublasDrotg)
-ROTG_LAUNCHER_USM(std::complex<float>, float, cublasCrotg)
-ROTG_LAUNCHER_USM(std::complex<double>, double, cublasZrotg)
-#undef ROTG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n, T *x,
-                        int64_t incx, T *y, int64_t incy, T *param,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            auto param_ = reinterpret_cast<cuDataType *>(param);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, param_);
-        });
-    });
-    return done;
-}
-
-#define ROTM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     TYPE *param, const std::vector<sycl::event> &dependencies) {                 \
-        return rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param,           \
-                    dependencies);                                                                \
-    }
-
-ROTM_LAUNCHER_USM(float, cublasSrotm)
-ROTM_LAUNCHER_USM(double, cublasDrotm)
-#undef ROTM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event copy(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x,
-                        int64_t incx, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define COPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                 \
-    sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y,       \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {              \
-        return copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \
-    }
-
-COPY_LAUNCHER_USM(float, cublasScopy)
-COPY_LAUNCHER_USM(double, cublasDcopy)
-COPY_LAUNCHER_USM(std::complex<float>, cublasCcopy)
-COPY_LAUNCHER_USM(std::complex<double>, cublasZcopy)
-#undef COPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x,
-                       const int64_t incx, const T *y, int64_t incy, T *result,
-                       const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    bool result_on_device =
-        sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<const cuDataType *>(y);
-            auto res_ = reinterpret_cast<cuDataType *>(result);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            }
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, res_);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-        });
-    });
-    return done;
-}
-
-#define DOT_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE)                                        \
-    sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                         const TYPE *y, const int64_t incy, TYPE *result,                  \
-                         const std::vector<sycl::event> &dependencies) {                   \
-        return dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result,    \
-                   dependencies);                                                          \
-    }
-DOT_LAUNCHER_USM(, float, cublasSdot)
-DOT_LAUNCHER_USM(, double, cublasDdot)
-DOT_LAUNCHER_USM(c, std::complex<float>, cublasCdotc)
-DOT_LAUNCHER_USM(c, std::complex<double>, cublasZdotc)
-DOT_LAUNCHER_USM(u, std::complex<float>, cublasCdotu)
-DOT_LAUNCHER_USM(u, std::complex<double>, cublasZdotu)
-#undef DOT_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline sycl::event rot(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 *x,
-                       const int64_t incx, T1 *y, int64_t incy, T2 c, T3 s,
-                       const std::vector<sycl::event> &dependencies) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    using cuDataType3 = typename CudaEquivalentType<T3>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<cuDataType1 *>(x);
-            auto y_ = reinterpret_cast<cuDataType1 *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy,
-                                     (cuDataType2 *)&c, (cuDataType3 *)&s);
-        });
-    });
-    return done;
-}
-
-#define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE)                              \
-    sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \
-                    int64_t incy, TYPE2 c, TYPE3 s,                                        \
-                    const std::vector<sycl::event> &dependencies) {                        \
-        return rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s,      \
-                   dependencies);                                                          \
-    }
-
-ROT_LAUNCHER_USM(float, float, float, cublasSrot)
-ROT_LAUNCHER_USM(double, double, double, cublasDrot)
-ROT_LAUNCHER_USM(std::complex<float>, float, float, cublasCsrot)
-ROT_LAUNCHER_USM(std::complex<double>, double, double, cublasZdrot)
-#undef ROT_LAUNCHER_USM
-
-sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
-                   const float *y, int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies) {
-    overflow_check(n, incx, incy);
-    bool result_on_device =
-        sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
-    // cuBLAS does not support sdsdot so we need to mimic sdot.
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const float *>(x);
-            auto y_ = reinterpret_cast<const float *>(y);
-            auto res_ = reinterpret_cast<float *>(result);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            }
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_SYNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-        });
-    });
-    done.wait();
-    if (result_on_device) {
-        // The following does copy device to host and then host to device
-        // just to adjust with sb constant. This is pretty inefficient, and
-        // should maybe be replaced with a sycl GPU kernel, but it duplicated what
-        // is done in the buffer API
-        float host_result;
-        queue.memcpy(&host_result, result, sizeof(float)).wait();
-        host_result += sb;
-        auto last_ev = queue.memcpy(result, &host_result, sizeof(float));
-        return last_ev;
-    }
-    else {
-        result[0] = result[0] + sb;
-        return done;
-    }
-}
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dot", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T *d1, T *d2, T *x1,
-                         T y1, T *param, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    auto ctx = queue.get_context();
-    bool results_on_device = (sycl::get_pointer_type(d1, ctx) == sycl::usm::alloc::device ||
-                              sycl::get_pointer_type(d2, ctx) == sycl::usm::alloc::device ||
-                              sycl::get_pointer_type(x1, ctx) == sycl::usm::alloc::device);
-    if (results_on_device) {
-        if (sycl::get_pointer_type(d1, ctx) == sycl::usm::alloc::unknown ||
-            sycl::get_pointer_type(d2, ctx) == sycl::usm::alloc::unknown ||
-            sycl::get_pointer_type(x1, ctx) == sycl::usm::alloc::unknown) {
-            throw oneapi::mkl::exception(
-                "blas", "rotmg",
-                "If any pointer is only device accessible, all must be device accessible");
-        }
-    }
-    cuDataType *y1_;
-    if (results_on_device) {
-        y1_ = sycl::malloc_device<cuDataType>(1, queue);
-        queue.memcpy(y1_, &y1, sizeof(cuDataType)).wait();
-    }
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto d1_ = reinterpret_cast<cuDataType *>(d1);
-            auto d2_ = reinterpret_cast<cuDataType *>(d2);
-            auto x1_ = reinterpret_cast<cuDataType *>(x1);
-            auto param_ = reinterpret_cast<cuDataType *>(param);
-            cublasStatus_t err;
-            if (results_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-                CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_, param_);
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-            else {
-                auto y1_c = reinterpret_cast<const cuDataType *>(&y1);
-                CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_c, param_);
-            }
-        });
-    });
-    if (results_on_device) {
-        done.wait();
-        queue.memcpy(&y1, y1_, sizeof(cuDataType)).wait();
-        sycl::free(y1_, queue);
-    }
-    return done;
-}
-
-#define ROTMG_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param,      \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies); \
-    }
-
-ROTMG_LAUNCHER_USM(float, cublasSrotmg)
-ROTMG_LAUNCHER_USM(double, cublasDrotmg)
-#undef ROTMG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                         const T *x, const int64_t incx, int64_t *result,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    // cuBLAS does not support int64_t as return type for the data. So we need to
-    // mimic iamax. We are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size.
-    int int_res = 0;
-    int *int_res_p = nullptr;
-    bool result_on_device =
-        sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
-    if (result_on_device) {
-        int_res_p = sycl::malloc_device<int>(1, queue);
-    }
-    else {
-        int_res_p = &int_res;
-    }
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            }
-            cublasStatus_t err;
-            // For negative incx, iamax returns 0. This behaviour is similar to that of
-            // reference iamax.
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-        });
-    });
-    done.wait();
-    if (result_on_device) {
-        auto last_ev = queue.submit([&](sycl::handler &cgh) {
-            cgh.single_task([=]() { *result = std::max((int64_t)*int_res_p - 1, (int64_t)0); });
-        });
-        last_ev.wait();
-        sycl::free(int_res_p, queue);
-        return last_ev;
-    }
-    else {
-        result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
-        return done;
-    }
-}
-
-#define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
-    sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx,         \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {          \
-        return iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-IAMAX_LAUNCHER_USM(float, cublasIsamax)
-IAMAX_LAUNCHER_USM(double, cublasIdamax)
-IAMAX_LAUNCHER_USM(std::complex<float>, cublasIcamax)
-IAMAX_LAUNCHER_USM(std::complex<double>, cublasIzamax)
-#undef IAMAX_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event swap(const char *func_name, Func func, sycl::queue &queue, int64_t n, T *x,
-                        int64_t incx, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define SWAP_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);   \
-    }
-
-SWAP_LAUNCHER_USM(float, cublasSswap)
-SWAP_LAUNCHER_USM(double, cublasDswap)
-SWAP_LAUNCHER_USM(std::complex<float>, cublasCswap)
-SWAP_LAUNCHER_USM(std::complex<double>, cublasZswap)
-#undef SWAP_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                         const T *x, const int64_t incx, int64_t *result,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    // cuBLAS does not support int64_t as return type for the data. So we need to
-    // mimic iamin. We are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size.
-    int int_res = 0;
-    int *int_res_p = nullptr;
-    bool result_on_device =
-        sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
-    if (result_on_device) {
-        int_res_p = sycl::malloc_device<int>(1, queue);
-    }
-    else {
-        int_res_p = &int_res;
-    }
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            }
-            cublasStatus_t err;
-            // For negative incx, iamin returns 0. This behaviour is similar to that of
-            // implemented iamin.
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-        });
-    });
-    done.wait();
-    if (result_on_device) {
-        auto last_ev = queue.submit([&](sycl::handler &cgh) {
-            cgh.single_task([=]() { *result = std::max((int64_t)*int_res_p - 1, (int64_t)0); });
-        });
-        last_ev.wait();
-        sycl::free(int_res_p, queue);
-        return last_ev;
-    }
-    else {
-        result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
-        return done;
-    }
-}
-
-#define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
-    sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx,         \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {          \
-        return iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-IAMIN_LAUNCHER_USM(float, cublasIsamin)
-IAMIN_LAUNCHER_USM(double, cublasIdamin)
-IAMIN_LAUNCHER_USM(std::complex<float>, cublasIcamin)
-IAMIN_LAUNCHER_USM(std::complex<double>, cublasIzamin)
-#undef IAMIN_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                        const T1 *x, const int64_t incx, T2 *result,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType1 = typename CudaEquivalentType<T1>::Type;
-    using cuDataType2 = typename CudaEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    bool result_on_device =
-        sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = reinterpret_cast<const cuDataType1 *>(x);
-            auto res_ = reinterpret_cast<cuDataType2 *>(result);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
-            }
-            cublasStatus_t err;
-            // NRM2 does not support negative index
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
-            if (result_on_device) {
-                cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-            }
-        });
-    });
-    return done;
-}
-
-#define NRM2_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                        \
-    sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx,        \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {            \
-        return nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-NRM2_LAUNCHER_USM(float, float, cublasSnrm2)
-NRM2_LAUNCHER_USM(double, double, cublasDnrm2)
-NRM2_LAUNCHER_USM(std::complex<float>, float, cublasScnrm2)
-NRM2_LAUNCHER_USM(std::complex<double>, double, cublasDznrm2)
-#undef NRM2_LAUNCHER_USM
-
-} // namespace column_major
-namespace row_major {
-
-// Buffer APIs
-
-// Level 1
-template <typename Func, typename T1, typename T2>
-inline void asum(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T1, 1> &x, const int64_t incx, sycl::buffer<T2, 1> &result) {
-    throw unimplemented("blas", "asum", "for row_major layout");
-}
-
-#define ASUM_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                         \
-    void asum(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                   \
-    }
-ASUM_LAUNCHER(float, float, cublasSasum)
-ASUM_LAUNCHER(double, double, cublasDasum)
-ASUM_LAUNCHER(std::complex<float>, float, cublasScasum)
-ASUM_LAUNCHER(std::complex<double>, double, cublasDzasum)
-#undef ASUM_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a,
-                 sycl::buffer<T2, 1> &x, int64_t incx) {
-    throw unimplemented("blas", "scal", "for row_major layout");
-}
-
-#define SCAL_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                              \
-    void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer<TYPE2, 1> &x, int64_t incx) { \
-        scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx);                             \
-    }
-SCAL_LAUNCHER(float, float, cublasSscal)
-SCAL_LAUNCHER(double, double, cublasDscal)
-SCAL_LAUNCHER(std::complex<float>, std::complex<float>, cublasCscal)
-SCAL_LAUNCHER(std::complex<double>, std::complex<double>, cublasZscal)
-SCAL_LAUNCHER(float, std::complex<float>, cublasCsscal)
-SCAL_LAUNCHER(double, std::complex<double>, cublasZdscal)
-#undef SCAL_LAUNCHER
-
-template <typename Func, typename T>
-inline void axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpy", "for row_major layout");
-}
-
-#define AXPY_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy);                \
-    }
-
-AXPY_LAUNCHER(float, cublasSaxpy)
-AXPY_LAUNCHER(double, cublasDaxpy)
-AXPY_LAUNCHER(std::complex<float>, cublasCaxpy)
-AXPY_LAUNCHER(std::complex<double>, cublasZaxpy)
-#undef AXPY_LAUNCHER
-
-void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-           float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-           double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline void rotg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer<T1, 1> &a,
-                 sycl::buffer<T1, 1> &b, sycl::buffer<T2, 1> &c, sycl::buffer<T1, 1> &s) {
-    throw unimplemented("blas", "rotg", "for row_major layout");
-}
-
-#define ROTG_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                     \
-    void rotg(sycl::queue &queue, sycl::buffer<TYPE1, 1> &a, sycl::buffer<TYPE1, 1> &b, \
-              sycl::buffer<TYPE2, 1> &c, sycl::buffer<TYPE1, 1> &s) {                   \
-        rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s);                       \
-    }
-
-ROTG_LAUNCHER(float, float, cublasSrotg)
-ROTG_LAUNCHER(double, double, cublasDrotg)
-ROTG_LAUNCHER(std::complex<float>, float, cublasCrotg)
-ROTG_LAUNCHER(std::complex<double>, double, cublasZrotg)
-#undef ROTG_LAUNCHER
-
-template <typename Func, typename T>
-inline void rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &param) {
-    throw unimplemented("blas", "rotm", "for row_major layout");
-}
-
-#define ROTM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                           \
-    void rotm(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx,  \
-              sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &param) { \
-        rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param);     \
-    }
-
-ROTM_LAUNCHER(float, cublasSrotm)
-ROTM_LAUNCHER(double, cublasDrotm)
-#undef ROTM_LAUNCHER
-
-template <typename Func, typename T>
-inline void copy(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "copy", "for row_major layout");
-}
-
-#define COPY_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                          \
-    void copy(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy);           \
-    }
-
-COPY_LAUNCHER(float, cublasScopy)
-COPY_LAUNCHER(double, cublasDcopy)
-COPY_LAUNCHER(std::complex<float>, cublasCcopy)
-COPY_LAUNCHER(std::complex<double>, cublasZcopy)
-#undef COPY_LAUNCHER
-
-template <typename Func, typename T>
-inline void dot(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                sycl::buffer<T, 1> &x, const int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                sycl::buffer<T, 1> &result) {
-    throw unimplemented("blas", "dot", "for row_major layout");
-}
-
-#define DOT_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE)                                                  \
-    void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx,   \
-                  sycl::buffer<TYPE, 1> &y, const int64_t incy, sycl::buffer<TYPE, 1> &result) { \
-        dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result);                \
-    }
-DOT_LAUNCHER(, float, cublasSdot)
-DOT_LAUNCHER(, double, cublasDdot)
-DOT_LAUNCHER(c, std::complex<float>, cublasCdotc)
-DOT_LAUNCHER(c, std::complex<double>, cublasZdotc)
-DOT_LAUNCHER(u, std::complex<float>, cublasCdotu)
-DOT_LAUNCHER(u, std::complex<double>, cublasZdotu)
-#undef DOT_LAUNCHER
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline void rot(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                sycl::buffer<T1, 1> &x, const int64_t incx, sycl::buffer<T1, 1> &y, int64_t incy,
-                T2 c, T3 s) {
-    throw unimplemented("blas", "rot", "for row_major layout");
-}
-
-#define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE)                                  \
-    void rot(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-             sycl::buffer<TYPE1, 1> &y, int64_t incy, TYPE2 c, TYPE3 s) {                  \
-        rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s);            \
-    }
-
-ROT_LAUNCHER(float, float, float, cublasSrot)
-ROT_LAUNCHER(double, double, double, cublasDrot)
-ROT_LAUNCHER(std::complex<float>, float, float, cublasCsrot)
-ROT_LAUNCHER(std::complex<double>, double, double, cublasZdrot)
-#undef ROT_LAUNCHER
-
-void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer<float, 1> &x, int64_t incx,
-            sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result) {
-    throw unimplemented("blas", "sdsdot", "for row_major layout");
-}
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    throw unimplemented("blas", "dot", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline void rotmg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer<T, 1> &d1,
-                  sycl::buffer<T, 1> &d2, sycl::buffer<T, 1> &x1, T y1, sycl::buffer<T, 1> &param) {
-    throw unimplemented("blas", "rotmg", "for row_major layout");
-}
-
-#define ROTMG_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                             \
-    void rotmg(sycl::queue &queue, sycl::buffer<TYPE, 1> &d1, sycl::buffer<TYPE, 1> &d2, \
-               sycl::buffer<TYPE, 1> &x1, TYPE y1, sycl::buffer<TYPE, 1> &param) {       \
-        rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param);            \
-    }
-
-ROTMG_LAUNCHER(float, cublasSrotmg)
-ROTMG_LAUNCHER(double, cublasDrotmg)
-#undef ROTMG_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                  sycl::buffer<T, 1> &x, const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    throw unimplemented("blas", "iamax", "for row_major layout");
-}
-
-#define IAMAX_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                \
-    void iamax(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                  \
-    }
-IAMAX_LAUNCHER(float, cublasIsamax)
-IAMAX_LAUNCHER(double, cublasIdamax)
-IAMAX_LAUNCHER(std::complex<float>, cublasIcamax)
-IAMAX_LAUNCHER(std::complex<double>, cublasIzamax)
-#undef IAMAX_LAUNCHER
-
-template <typename Func, typename T>
-inline void swap(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "swap", "for row_major layout");
-}
-
-#define SWAP_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                          \
-    void swap(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy);           \
-    }
-
-SWAP_LAUNCHER(float, cublasSswap)
-SWAP_LAUNCHER(double, cublasDswap)
-SWAP_LAUNCHER(std::complex<float>, cublasCswap)
-SWAP_LAUNCHER(std::complex<double>, cublasZswap)
-#undef SWAP_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                  sycl::buffer<T, 1> &x, const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    throw unimplemented("blas", "iamin", "for row_major layout");
-}
-
-#define IAMIN_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                \
-    void iamin(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                  \
-    }
-IAMIN_LAUNCHER(float, cublasIsamin)
-IAMIN_LAUNCHER(double, cublasIdamin)
-IAMIN_LAUNCHER(std::complex<float>, cublasIcamin)
-IAMIN_LAUNCHER(std::complex<double>, cublasIzamin)
-#undef IAMIN_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                 sycl::buffer<T1, 1> &x, const int64_t incx, sycl::buffer<T2, 1> &result) {
-    throw unimplemented("blas", "nrm2", "for row_major layout");
-}
-
-#define NRM2_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE)                                         \
-    void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result);                   \
-    }
-NRM2_LAUNCHER(float, float, cublasSnrm2)
-NRM2_LAUNCHER(double, double, cublasDnrm2)
-NRM2_LAUNCHER(std::complex<float>, float, cublasScnrm2)
-NRM2_LAUNCHER(std::complex<double>, double, cublasDznrm2)
-#undef NRM2_LAUNCHER
-
-// USM APIs
-
-// Level 1
-template <typename Func, typename T1, typename T2>
-inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                        const T1 *x, const int64_t incx, T2 *result,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "asum", "for row_major layout");
-}
-
-#define ASUM_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                        \
-    sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx,        \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {            \
-        return asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-ASUM_LAUNCHER_USM(float, float, cublasSasum)
-ASUM_LAUNCHER_USM(double, double, cublasDasum)
-ASUM_LAUNCHER_USM(std::complex<float>, float, cublasScasum)
-ASUM_LAUNCHER_USM(std::complex<double>, double, cublasDzasum)
-#undef ASUM_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a,
-                        T2 *x, int64_t incx, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "scal", "for row_major layout");
-}
-
-#define SCAL_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                   \
-    sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx,      \
-                     const std::vector<sycl::event> &dependencies) {                      \
-        return scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx, dependencies); \
-    }
-SCAL_LAUNCHER_USM(float, float, cublasSscal)
-SCAL_LAUNCHER_USM(double, double, cublasDscal)
-SCAL_LAUNCHER_USM(std::complex<float>, std::complex<float>, cublasCscal)
-SCAL_LAUNCHER_USM(std::complex<double>, std::complex<double>, cublasZscal)
-SCAL_LAUNCHER_USM(float, std::complex<float>, cublasCsscal)
-SCAL_LAUNCHER_USM(double, std::complex<double>, cublasZdscal)
-#undef SCAL_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha,
-                        const T *x, int64_t incx, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy", "for row_major layout");
-}
-
-#define AXPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                              \
-    sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \
-                     TYPE *y, int64_t incy, const std::vector<sycl::event> &dependencies) {  \
-        return axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy,      \
-                    dependencies);                                                           \
-    }
-
-AXPY_LAUNCHER_USM(float, cublasSaxpy)
-AXPY_LAUNCHER_USM(double, cublasDaxpy)
-AXPY_LAUNCHER_USM(std::complex<float>, cublasCaxpy)
-AXPY_LAUNCHER_USM(std::complex<double>, cublasZaxpy)
-#undef AXPY_LAUNCHER_USM
-
-sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                  float beta, float *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                  double beta, double *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                  std::complex<float> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                  const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                  std::complex<double> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c,
-                        T1 *s, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "rotg", "for row_major layout");
-}
-
-#define ROTG_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                \
-    sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s,       \
-                     const std::vector<sycl::event> &dependencies) {                   \
-        return rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s, dependencies); \
-    }
-
-ROTG_LAUNCHER_USM(float, float, cublasSrotg)
-ROTG_LAUNCHER_USM(double, double, cublasDrotg)
-ROTG_LAUNCHER_USM(std::complex<float>, float, cublasCrotg)
-ROTG_LAUNCHER_USM(std::complex<double>, double, cublasZrotg)
-#undef ROTG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n, T *x,
-                        int64_t incx, T *y, int64_t incy, T *param,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "rotm", "for row_major layout");
-}
-
-#define ROTM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     TYPE *param, const std::vector<sycl::event> &dependencies) {                 \
-        return rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param,           \
-                    dependencies);                                                                \
-    }
-
-ROTM_LAUNCHER_USM(float, cublasSrotm)
-ROTM_LAUNCHER_USM(double, cublasDrotm)
-#undef ROTM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event copy(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x,
-                        int64_t incx, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy", "for row_major layout");
-}
-
-#define COPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                 \
-    sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y,       \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {              \
-        return copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \
-    }
-
-COPY_LAUNCHER_USM(float, cublasScopy)
-COPY_LAUNCHER_USM(double, cublasDcopy)
-COPY_LAUNCHER_USM(std::complex<float>, cublasCcopy)
-COPY_LAUNCHER_USM(std::complex<double>, cublasZcopy)
-#undef COPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x,
-                       const int64_t incx, const T *y, int64_t incy, T *result,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dot", "for row_major layout");
-}
-
-#define DOT_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE)                                        \
-    sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                         const TYPE *y, const int64_t incy, TYPE *result,                  \
-                         const std::vector<sycl::event> &dependencies) {                   \
-        return dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result,    \
-                   dependencies);                                                          \
-    }
-DOT_LAUNCHER_USM(, float, cublasSdot)
-DOT_LAUNCHER_USM(, double, cublasDdot)
-DOT_LAUNCHER_USM(c, std::complex<float>, cublasCdotc)
-DOT_LAUNCHER_USM(c, std::complex<double>, cublasZdotc)
-DOT_LAUNCHER_USM(u, std::complex<float>, cublasCdotu)
-DOT_LAUNCHER_USM(u, std::complex<double>, cublasZdotu)
-#undef DOT_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline sycl::event rot(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 *x,
-                       const int64_t incx, T1 *y, int64_t incy, T2 c, T3 s,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "rot", "for row_major layout");
-}
-
-#define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE)                              \
-    sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \
-                    int64_t incy, TYPE2 c, TYPE3 s,                                        \
-                    const std::vector<sycl::event> &dependencies) {                        \
-        return rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s,      \
-                   dependencies);                                                          \
-    }
-
-ROT_LAUNCHER_USM(float, float, float, cublasSrot)
-ROT_LAUNCHER_USM(double, double, double, cublasDrot)
-ROT_LAUNCHER_USM(std::complex<float>, float, float, cublasCsrot)
-ROT_LAUNCHER_USM(std::complex<double>, double, double, cublasZdrot)
-#undef ROT_LAUNCHER_USM
-
-sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
-                   const float *y, int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "sdsdot", "for row_major layout");
-}
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dot", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T *d1, T *d2, T *x1,
-                         T y1, T *param, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "rotmg", "for row_major layout");
-}
-
-#define ROTMG_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param,      \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies); \
-    }
-
-ROTMG_LAUNCHER_USM(float, cublasSrotmg)
-ROTMG_LAUNCHER_USM(double, cublasDrotmg)
-#undef ROTMG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                         const T *x, const int64_t incx, int64_t *result,
-                         const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "iamax", "for row_major layout");
-}
-
-#define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
-    sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx,         \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {          \
-        return iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-IAMAX_LAUNCHER_USM(float, cublasIsamax)
-IAMAX_LAUNCHER_USM(double, cublasIdamax)
-IAMAX_LAUNCHER_USM(std::complex<float>, cublasIcamax)
-IAMAX_LAUNCHER_USM(std::complex<double>, cublasIzamax)
-#undef IAMAX_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event swap(const char *func_name, Func func, sycl::queue &queue, int64_t n, T *x,
-                        int64_t incx, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "swap", "for row_major layout");
-}
-
-#define SWAP_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);   \
-    }
-
-SWAP_LAUNCHER_USM(float, cublasSswap)
-SWAP_LAUNCHER_USM(double, cublasDswap)
-SWAP_LAUNCHER_USM(std::complex<float>, cublasCswap)
-SWAP_LAUNCHER_USM(std::complex<double>, cublasZswap)
-#undef SWAP_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                         const T *x, const int64_t incx, int64_t *result,
-                         const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "iamin", "for row_major layout");
-}
-
-#define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                \
-    sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx,         \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {          \
-        return iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-IAMIN_LAUNCHER_USM(float, cublasIsamin)
-IAMIN_LAUNCHER_USM(double, cublasIdamin)
-IAMIN_LAUNCHER_USM(std::complex<float>, cublasIcamin)
-IAMIN_LAUNCHER_USM(std::complex<double>, cublasIzamin)
-#undef IAMIN_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n,
-                        const T1 *x, const int64_t incx, T2 *result,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "nrm2", "for row_major layout");
-}
-
-#define NRM2_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE)                                        \
-    sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx,        \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {            \
-        return nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \
-    }
-NRM2_LAUNCHER_USM(float, float, cublasSnrm2)
-NRM2_LAUNCHER_USM(double, double, cublasDnrm2)
-NRM2_LAUNCHER_USM(std::complex<float>, float, cublasScnrm2)
-NRM2_LAUNCHER_USM(std::complex<double>, double, cublasDznrm2)
-#undef NRM2_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp
deleted file mode 100644
index 8f711243b..000000000
--- a/src/blas/backends/cublas/cublas_level2.cpp
+++ /dev/null
@@ -1,2702 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_helper.hpp"
-#include "cublas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-namespace column_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-                 int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m,
-                                     n, (cuDataType *)&alpha, a_, lda, x_, incx,
-                                     (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define GEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,               \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,       \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                                 \
-        gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, \
-             incy);                                                                                \
-    }
-
-GEMV_LAUNCHER(float, cublasSgemv)
-GEMV_LAUNCHER(double, cublasDgemv)
-GEMV_LAUNCHER(std::complex<float>, cublasCgemv)
-GEMV_LAUNCHER(std::complex<double>, cublasZgemv)
-#undef GEMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-                 int64_t n, int64_t kl, int64_t ku, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &x, int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, m, lda, kl, ku, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m,
-                                     n, kl, ku, (cuDataType *)&alpha, a_, lda, x_, incx,
-                                     (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define GBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,  \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,        \
-              int64_t incx, TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                  \
-        gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, \
-             beta, y, incy);                                                                      \
-    }
-
-GBMV_LAUNCHER(float, cublasSgbmv)
-GBMV_LAUNCHER(double, cublasDgbmv)
-GBMV_LAUNCHER(std::complex<float>, cublasCgbmv)
-GBMV_LAUNCHER(std::complex<double>, cublasZgbmv)
-#undef GBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha,
-                sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                sycl::buffer<T, 1> &a, int64_t lda) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_, lda);
-        });
-    });
-}
-
-#define GER_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE)                                                   \
-    void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, \
-                  int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &a, \
-                  int64_t lda) {                                                                  \
-        ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda);       \
-    }
-
-GER_LAUNCHER(, float, cublasSger)
-GER_LAUNCHER(, double, cublasDger)
-GER_LAUNCHER(u, std::complex<float>, cublasCgeru)
-GER_LAUNCHER(u, std::complex<double>, cublasZgeru)
-GER_LAUNCHER(c, std::complex<float>, cublasCgerc)
-GER_LAUNCHER(c, std::complex<double>, cublasZgerc)
-#undef GER_LAUNCHER
-
-template <typename Func, typename T>
-inline void hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha,
-                                     a_, lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define HBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \
-             beta, y, incy);                                                                    \
-    }
-
-HBMV_LAUNCHER(std::complex<float>, cublasChbmv)
-HBMV_LAUNCHER(std::complex<double>, cublasZhbmv)
-#undef HBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx,
-                 T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define HEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                         \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,       \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                                 \
-        hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \
-             y, incy);                                                                             \
-    }
-
-HEMV_LAUNCHER(std::complex<float>, cublasChemv)
-HEMV_LAUNCHER(std::complex<double>, cublasZhemv)
-#undef HEMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                ScalarType alpha, sycl::buffer<DataType, 1> &x, int64_t incx,
-                sycl::buffer<DataType, 1> &a, int64_t lda) {
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    overflow_check(n, lda, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha,
-                                     x_, incx, a_, lda);
-        });
-    });
-}
-
-#define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                                 \
-    void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,             \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a,     \
-             int64_t lda) {                                                                  \
-        her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \
-    }
-
-HER_LAUNCHER(float, std::complex<float>, cublasCher)
-HER_LAUNCHER(double, std::complex<double>, cublasZher)
-
-#undef HER_LAUNCHER
-
-template <typename Func, typename T>
-inline void her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_, lda);
-        });
-    });
-}
-
-#define HER2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,    \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                           \
-        her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \
-             lda);                                                                               \
-    }
-
-HER2_LAUNCHER(std::complex<float>, cublasCher2)
-HER2_LAUNCHER(std::complex<double>, cublasZher2)
-
-#undef HER2_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define HPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta,       \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \
-             incy);                                                                              \
-    }
-
-HPMV_LAUNCHER(std::complex<float>, cublasChpmv)
-HPMV_LAUNCHER(std::complex<double>, cublasZhpmv)
-
-#undef HPMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                ScalarType alpha, sycl::buffer<DataType, 1> &x, int64_t incx,
-                sycl::buffer<DataType, 1> &a) {
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    overflow_check(n, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha,
-                                     x_, incx, a_);
-        });
-    });
-}
-
-#define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                               \
-    void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,           \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a) { \
-        hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a);    \
-    }
-
-HPR_LAUNCHER(float, std::complex<float>, cublasChpr)
-HPR_LAUNCHER(double, std::complex<double>, cublasZhpr)
-
-#undef HPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_);
-        });
-    });
-}
-
-#define HPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                        \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,     \
-              sycl::buffer<TYPE, 1> &a) {                                                         \
-        hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \
-    }
-
-HPR2_LAUNCHER(std::complex<float>, cublasChpr2)
-HPR2_LAUNCHER(std::complex<double>, cublasZhpr2)
-
-#undef HPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha,
-                                     a_, lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define SBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \
-             beta, y, incy);                                                                    \
-    }
-
-SBMV_LAUNCHER(float, cublasSsbmv)
-SBMV_LAUNCHER(double, cublasDsbmv)
-
-#undef SBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx,
-                 T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define SYMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                         \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,       \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                                 \
-        symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \
-             y, incy);                                                                             \
-    }
-
-SYMV_LAUNCHER(float, cublasSsymv)
-SYMV_LAUNCHER(double, cublasDsymv)
-
-#undef SYMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a, int64_t lda) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, a_, lda);
-        });
-    });
-}
-
-#define SYR_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                    \
-    void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                     \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a, int64_t lda) { \
-        syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda);  \
-    }
-
-SYR_LAUNCHER(float, cublasSsyr)
-SYR_LAUNCHER(double, cublasDsyr)
-// Intel does not support the following two
-SYR_LAUNCHER(std::complex<float>, cublasCsyr)
-SYR_LAUNCHER(std::complex<double>, cublasZsyr)
-#undef SYR_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_, lda);
-        });
-    });
-}
-
-#define SYR2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,    \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                           \
-        syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \
-             lda);                                                                               \
-    }
-
-SYR2_LAUNCHER(float, cublasSsyr2)
-SYR2_LAUNCHER(double, cublasDsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER(std::complex<float>, cublasCsyr2)
-SYR2_LAUNCHER(std::complex<double>, cublasZsyr2)
-
-#undef SYR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void spmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-}
-
-#define SPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta,       \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \
-             incy);                                                                              \
-    }
-
-SPMV_LAUNCHER(float, cublasSspmv)
-SPMV_LAUNCHER(double, cublasDspmv)
-
-#undef SPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, a_);
-        });
-    });
-}
-
-#define SPR_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                              \
-    void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,               \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a) {        \
-        spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \
-    }
-
-SPR_LAUNCHER(float, cublasSspr)
-SPR_LAUNCHER(double, cublasDspr)
-
-#undef SPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            auto y_ = sc.get_mem<cuDataType *>(y_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_);
-        });
-    });
-}
-
-#define SPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                        \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,     \
-              sycl::buffer<TYPE, 1> &a) {                                                         \
-        spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \
-    }
-
-SPR2_LAUNCHER(float, cublasSspr2)
-SPR2_LAUNCHER(double, cublasDspr2)
-
-#undef SPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &x, int64_t incx) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, k, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,         \
-              int64_t incx) {                                                                     \
-        tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \
-             x, incx);                                                                            \
-    }
-
-TBMV_LAUNCHER(float, cublasStbmv)
-TBMV_LAUNCHER(double, cublasDtbmv)
-TBMV_LAUNCHER(std::complex<float>, cublasCtbmv)
-TBMV_LAUNCHER(std::complex<double>, cublasZtbmv)
-
-#undef TBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &x, int64_t incx) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, k, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TBSV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,         \
-              int64_t incx) {                                                                     \
-        tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \
-             x, incx);                                                                            \
-    }
-
-TBSV_LAUNCHER(float, cublasStbsv)
-TBSV_LAUNCHER(double, cublasDtbsv)
-TBSV_LAUNCHER(std::complex<float>, cublasCtbsv)
-TBSV_LAUNCHER(std::complex<double>, cublasZtbsv)
-
-#undef TBSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, x_, incx);
-        });
-    });
-}
-
-#define TPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x,    \
-             incx);                                                                             \
-    }
-
-TPMV_LAUNCHER(float, cublasStpmv)
-TPMV_LAUNCHER(double, cublasDtpmv)
-TPMV_LAUNCHER(std::complex<float>, cublasCtpmv)
-TPMV_LAUNCHER(std::complex<double>, cublasZtpmv)
-
-#undef TPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, x_, incx);
-        });
-    });
-}
-
-#define TPSV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x,    \
-             incx);                                                                             \
-    }
-
-TPSV_LAUNCHER(float, cublasStpsv)
-TPSV_LAUNCHER(double, cublasDtpsv)
-TPSV_LAUNCHER(std::complex<float>, cublasCtpsv)
-TPSV_LAUNCHER(std::complex<double>, cublasZtpsv)
-
-#undef TPSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TRMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {    \
-        trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \
-             incx);                                                                               \
-    }
-
-TRMV_LAUNCHER(float, cublasStrmv)
-TRMV_LAUNCHER(double, cublasDtrmv)
-TRMV_LAUNCHER(std::complex<float>, cublasCtrmv)
-TRMV_LAUNCHER(std::complex<double>, cublasZtrmv)
-
-#undef TRMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto x_ = sc.get_mem<cuDataType *>(x_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TRSV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {    \
-        trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \
-             incx);                                                                               \
-    }
-
-TRSV_LAUNCHER(float, cublasStrsv)
-TRSV_LAUNCHER(double, cublasDtrsv)
-TRSV_LAUNCHER(std::complex<float>, cublasCtrsv)
-TRSV_LAUNCHER(std::complex<double>, cublasZtrsv)
-
-#undef TRSV_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemv(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                        int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m,
-                                     n, (cuDataType *)&alpha, a_, lda, x_, incx,
-                                     (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define GEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,       \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx,  \
-                    beta, y, incy, dependencies);                                                 \
-    }
-
-GEMV_LAUNCHER_USM(float, cublasSgemv)
-GEMV_LAUNCHER_USM(double, cublasDgemv)
-GEMV_LAUNCHER_USM(std::complex<float>, cublasCgemv)
-GEMV_LAUNCHER_USM(std::complex<double>, cublasZgemv)
-#undef GEMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                        int64_t m, int64_t n, int64_t kl, int64_t ku, T alpha, const T *a,
-                        int64_t lda, const T *x, int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, m, lda, kl, ku, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m,
-                                     n, kl, ku, (cuDataType *)&alpha, a_, lda, x_, incx,
-                                     (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define GBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,        \
-                     int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x,            \
-                     int64_t incx, TYPE beta, TYPE *y, int64_t incy,                               \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, \
-                    incx, beta, y, incy, dependencies);                                            \
-    }
-
-GBMV_LAUNCHER_USM(float, cublasSgbmv)
-GBMV_LAUNCHER_USM(double, cublasDgbmv)
-GBMV_LAUNCHER_USM(std::complex<float>, cublasCgbmv)
-GBMV_LAUNCHER_USM(std::complex<double>, cublasZgbmv)
-#undef GBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n,
-                       T alpha, const T *x, int64_t incx, const T *y, int64_t incy, T *a,
-                       int64_t lda, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<const cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_, lda);
-        });
-    });
-    return done;
-}
-
-#define GER_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE)                                               \
-    sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x,     \
-                         int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,         \
-                         const std::vector<sycl::event> &dependencies) {                          \
-        return ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, \
-                   dependencies);                                                                 \
-    }
-
-GER_LAUNCHER_USM(, float, cublasSger)
-GER_LAUNCHER_USM(, double, cublasDger)
-GER_LAUNCHER_USM(u, std::complex<float>, cublasCgeru)
-GER_LAUNCHER_USM(u, std::complex<double>, cublasZgeru)
-GER_LAUNCHER_USM(c, std::complex<float>, cublasCgerc)
-GER_LAUNCHER_USM(c, std::complex<double>, cublasZgerc)
-#undef GER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha,
-                                     a_, lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define HBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x,  \
-                    incx, beta, y, incy, dependencies);                                           \
-    }
-
-HBMV_LAUNCHER_USM(std::complex<float>, cublasChbmv)
-HBMV_LAUNCHER_USM(std::complex<double>, cublasZhbmv)
-#undef HBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx,
-                        T beta, T *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define HEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x,    \
-                    incx, beta, y, incy, dependencies);                                          \
-    }
-
-HEMV_LAUNCHER_USM(std::complex<float>, cublasChemv)
-HEMV_LAUNCHER_USM(std::complex<double>, cublasZhemv)
-#undef HEMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, const ScalarType alpha, const DataType *x, int64_t incx,
-                       DataType *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    overflow_check(n, lda, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha,
-                                     x_, incx, a_, lda);
-        });
-    });
-    return done;
-}
-
-#define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                                   \
-    sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha,      \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda,                   \
-                    const std::vector<sycl::event> &dependencies) {                                \
-        return her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \
-                   dependencies);                                                                  \
-    }
-
-HER_LAUNCHER_USM(float, std::complex<float>, cublasCher)
-HER_LAUNCHER_USM(double, std::complex<double>, cublasZher)
-
-#undef HER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<const cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_, lda);
-        });
-    });
-    return done;
-}
-
-#define HER2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, lda, dependencies);                                                 \
-    }
-
-HER2_LAUNCHER_USM(std::complex<float>, cublasCher2)
-HER2_LAUNCHER_USM(std::complex<double>, cublasZher2)
-
-#undef HER2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define HPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx,   \
-                    beta, y, incy, dependencies);                                                \
-    }
-
-HPMV_LAUNCHER_USM(std::complex<float>, cublasChpmv)
-HPMV_LAUNCHER_USM(std::complex<double>, cublasZhpmv)
-
-#undef HPMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, const ScalarType alpha, const DataType *x, int64_t incx,
-                       DataType *a, const std::vector<sycl::event> &dependencies) {
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    overflow_check(n, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha,
-                                     x_, incx, a_);
-        });
-    });
-    return done;
-}
-
-#define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                              \
-    sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a,                           \
-                    const std::vector<sycl::event> &dependencies) {                           \
-        return hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, \
-                   dependencies);                                                             \
-    }
-
-HPR_LAUNCHER_USM(float, std::complex<float>, cublasChpr)
-HPR_LAUNCHER_USM(double, std::complex<double>, cublasZhpr)
-
-#undef HPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<const cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_);
-        });
-    });
-    return done;
-}
-
-#define HPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, dependencies);                                                      \
-    }
-
-HPR2_LAUNCHER_USM(std::complex<float>, cublasChpr2)
-HPR2_LAUNCHER_USM(std::complex<double>, cublasZhpr2)
-
-#undef HPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha,
-                                     a_, lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define SBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x,  \
-                    incx, beta, y, incy, dependencies);                                           \
-    }
-
-SBMV_LAUNCHER_USM(float, cublasSsbmv)
-SBMV_LAUNCHER_USM(double, cublasDsbmv)
-
-#undef SBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx,
-                        T beta, T *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     lda, x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define SYMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x,    \
-                    incx, beta, y, incy, dependencies);                                          \
-    }
-
-SYMV_LAUNCHER_USM(float, cublasSsymv)
-SYMV_LAUNCHER_USM(double, cublasDsymv)
-
-#undef SYMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, T alpha, const T *x, int64_t incx, T *a, int64_t lda,
-                       const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, a_, lda);
-        });
-    });
-    return done;
-}
-
-#define SYR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                     \
-    sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x,    \
-                    int64_t incx, TYPE *a, int64_t lda,                                            \
-                    const std::vector<sycl::event> &dependencies) {                                \
-        return syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \
-                   dependencies);                                                                  \
-    }
-
-SYR_LAUNCHER_USM(float, cublasSsyr)
-SYR_LAUNCHER_USM(double, cublasDsyr)
-// Intel does not support the following two
-SYR_LAUNCHER_USM(std::complex<float>, cublasCsyr)
-SYR_LAUNCHER_USM(std::complex<double>, cublasZsyr)
-#undef SYR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<const cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_, lda);
-        });
-    });
-    return done;
-}
-
-#define SYR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, lda, dependencies);                                                 \
-    }
-
-SYR2_LAUNCHER_USM(float, cublasSsyr2)
-SYR2_LAUNCHER_USM(double, cublasDsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER_USM(std::complex<float>, cublasCsyr2)
-SYR2_LAUNCHER_USM(std::complex<double>, cublasZsyr2)
-
-#undef SYR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_,
-                                     x_, incx, (cuDataType *)&beta, y_, incy);
-        });
-    });
-    return done;
-}
-
-#define SPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx,   \
-                    beta, y, incy, dependencies);                                                \
-    }
-
-SPMV_LAUNCHER_USM(float, cublasSspmv)
-SPMV_LAUNCHER_USM(double, cublasDspmv)
-
-#undef SPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, T alpha, const T *x, int64_t incx, T *a,
-                       const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, a_);
-        });
-    });
-    return done;
-}
-
-#define SPR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                    int64_t incx, TYPE *a, const std::vector<sycl::event> &dependencies) {      \
-        return spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a,   \
-                   dependencies);                                                               \
-    }
-
-SPR_LAUNCHER_USM(float, cublasSspr)
-SPR_LAUNCHER_USM(double, cublasDspr)
-
-#undef SPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto x_ = reinterpret_cast<const cuDataType *>(x);
-            auto y_ = reinterpret_cast<const cuDataType *>(y);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_,
-                                     incx, y_, incy, a_);
-        });
-    });
-    return done;
-}
-
-#define SPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, dependencies);                                                      \
-    }
-
-SPR2_LAUNCHER_USM(float, cublasSspr2)
-SPR2_LAUNCHER_USM(double, cublasDspr2)
-
-#undef SPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a,
-                        int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, k, a_, lda, x_, incx);
-        });
-    });
-    return done;
-}
-
-#define TBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,    \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \
-                    a, lda, x, incx, dependencies);                                              \
-    }
-
-TBMV_LAUNCHER_USM(float, cublasStbmv)
-TBMV_LAUNCHER_USM(double, cublasDtbmv)
-TBMV_LAUNCHER_USM(std::complex<float>, cublasCtbmv)
-TBMV_LAUNCHER_USM(std::complex<double>, cublasZtbmv)
-
-#undef TBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a,
-                        int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, k, a_, lda, x_, incx);
-        });
-    });
-    return done;
-}
-
-#define TBSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,    \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \
-                    a, lda, x, incx, dependencies);                                              \
-    }
-
-TBSV_LAUNCHER_USM(float, cublasStbsv)
-TBSV_LAUNCHER_USM(double, cublasDtbsv)
-TBSV_LAUNCHER_USM(std::complex<float>, cublasCtbsv)
-TBSV_LAUNCHER_USM(std::complex<double>, cublasZtbsv)
-
-#undef TBSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, x_, incx);
-        });
-    });
-    return done;
-}
-
-#define TPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    x, incx, dependencies);                                                      \
-    }
-
-TPMV_LAUNCHER_USM(float, cublasStpmv)
-TPMV_LAUNCHER_USM(double, cublasDtpmv)
-TPMV_LAUNCHER_USM(std::complex<float>, cublasCtpmv)
-TPMV_LAUNCHER_USM(std::complex<double>, cublasZtpmv)
-
-#undef TPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, x_, incx);
-        });
-    });
-    return done;
-}
-
-#define TPSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    x, incx, dependencies);                                                      \
-    }
-
-TPSV_LAUNCHER_USM(float, cublasStpsv)
-TPSV_LAUNCHER_USM(double, cublasDtpsv)
-TPSV_LAUNCHER_USM(std::complex<float>, cublasCtpsv)
-TPSV_LAUNCHER_USM(std::complex<double>, cublasZtpsv)
-
-#undef TPSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, lda, x_, incx);
-        });
-    });
-    return done;
-}
-
-#define TRMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,               \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    lda, x, incx, dependencies);                                                 \
-    }
-
-TRMV_LAUNCHER_USM(float, cublasStrmv)
-TRMV_LAUNCHER_USM(double, cublasDtrmv)
-TRMV_LAUNCHER_USM(std::complex<float>, cublasCtrmv)
-TRMV_LAUNCHER_USM(std::complex<double>, cublasZtrmv)
-
-#undef TRMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto x_ = reinterpret_cast<cuDataType *>(x);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), n, a_, lda, x_, incx);
-        });
-    });
-    return done;
-}
-
-#define TRSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,               \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    lda, x, incx, dependencies);                                                 \
-    }
-
-TRSV_LAUNCHER_USM(float, cublasStrsv)
-TRSV_LAUNCHER_USM(double, cublasDtrsv)
-TRSV_LAUNCHER_USM(std::complex<float>, cublasCtrsv)
-TRSV_LAUNCHER_USM(std::complex<double>, cublasZtrsv)
-
-#undef TRSV_LAUNCHER_USM
-
-} // namespace column_major
-namespace row_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-                 int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "gemv", "for row_major layout");
-}
-
-#define GEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,               \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,       \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                                 \
-        gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, \
-             incy);                                                                                \
-    }
-
-GEMV_LAUNCHER(float, cublasSgemv)
-GEMV_LAUNCHER(double, cublasDgemv)
-GEMV_LAUNCHER(std::complex<float>, cublasCgemv)
-GEMV_LAUNCHER(std::complex<double>, cublasZgemv)
-#undef GEMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-                 int64_t n, int64_t kl, int64_t ku, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &x, int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "gbmv", "for row_major layout");
-}
-
-#define GBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,  \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,        \
-              int64_t incx, TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                  \
-        gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, \
-             beta, y, incy);                                                                      \
-    }
-
-GBMV_LAUNCHER(float, cublasSgbmv)
-GBMV_LAUNCHER(double, cublasDgbmv)
-GBMV_LAUNCHER(std::complex<float>, cublasCgbmv)
-GBMV_LAUNCHER(std::complex<double>, cublasZgbmv)
-#undef GBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha,
-                sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                sycl::buffer<T, 1> &a, int64_t lda) {
-    throw unimplemented("blas", "ger", "for row_major layout");
-}
-
-#define GER_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE)                                                   \
-    void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, \
-                  int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &a, \
-                  int64_t lda) {                                                                  \
-        ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda);       \
-    }
-
-GER_LAUNCHER(, float, cublasSger)
-GER_LAUNCHER(, double, cublasDger)
-GER_LAUNCHER(u, std::complex<float>, cublasCgeru)
-GER_LAUNCHER(u, std::complex<double>, cublasZgeru)
-GER_LAUNCHER(c, std::complex<float>, cublasCgerc)
-GER_LAUNCHER(c, std::complex<double>, cublasZgerc)
-#undef GER_LAUNCHER
-
-template <typename Func, typename T>
-inline void hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "hbmv", "for row_major layout");
-}
-
-#define HBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \
-             beta, y, incy);                                                                    \
-    }
-
-HBMV_LAUNCHER(std::complex<float>, cublasChbmv)
-HBMV_LAUNCHER(std::complex<double>, cublasZhbmv)
-#undef HBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx,
-                 T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "hemv", "for row_major layout");
-}
-
-#define HEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                         \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,       \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                                 \
-        hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \
-             y, incy);                                                                             \
-    }
-
-HEMV_LAUNCHER(std::complex<float>, cublasChemv)
-HEMV_LAUNCHER(std::complex<double>, cublasZhemv)
-#undef HEMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                ScalarType alpha, sycl::buffer<DataType, 1> &x, int64_t incx,
-                sycl::buffer<DataType, 1> &a, int64_t lda) {
-    throw unimplemented("blas", "her", "for row_major layout");
-}
-
-#define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                                 \
-    void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,             \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a,     \
-             int64_t lda) {                                                                  \
-        her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \
-    }
-
-HER_LAUNCHER(float, std::complex<float>, cublasCher)
-HER_LAUNCHER(double, std::complex<double>, cublasZher)
-
-#undef HER_LAUNCHER
-
-template <typename Func, typename T>
-inline void her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    throw unimplemented("blas", "her2", "for row_major layout");
-}
-
-#define HER2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,    \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                           \
-        her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \
-             lda);                                                                               \
-    }
-
-HER2_LAUNCHER(std::complex<float>, cublasCher2)
-HER2_LAUNCHER(std::complex<double>, cublasZher2)
-
-#undef HER2_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "hpmv", "for row_major layout");
-}
-
-#define HPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta,       \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \
-             incy);                                                                              \
-    }
-
-HPMV_LAUNCHER(std::complex<float>, cublasChpmv)
-HPMV_LAUNCHER(std::complex<double>, cublasZhpmv)
-
-#undef HPMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                ScalarType alpha, sycl::buffer<DataType, 1> &x, int64_t incx,
-                sycl::buffer<DataType, 1> &a) {
-    throw unimplemented("blas", "hpr", "for row_major layout");
-}
-
-#define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                               \
-    void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,           \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a) { \
-        hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a);    \
-    }
-
-HPR_LAUNCHER(float, std::complex<float>, cublasChpr)
-HPR_LAUNCHER(double, std::complex<double>, cublasZhpr)
-
-#undef HPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    throw unimplemented("blas", "hpr2", "for row_major layout");
-}
-
-#define HPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                        \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,     \
-              sycl::buffer<TYPE, 1> &a) {                                                         \
-        hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \
-    }
-
-HPR2_LAUNCHER(std::complex<float>, cublasChpr2)
-HPR2_LAUNCHER(std::complex<double>, cublasZhpr2)
-
-#undef HPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "sbmv", "for row_major layout");
-}
-
-#define SBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \
-             beta, y, incy);                                                                    \
-    }
-
-SBMV_LAUNCHER(float, cublasSsbmv)
-SBMV_LAUNCHER(double, cublasDsbmv)
-
-#undef SBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx,
-                 T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "symv", "for row_major layout");
-}
-
-#define SYMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                         \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,       \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                                 \
-        symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \
-             y, incy);                                                                             \
-    }
-
-SYMV_LAUNCHER(float, cublasSsymv)
-SYMV_LAUNCHER(double, cublasDsymv)
-
-#undef SYMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a, int64_t lda) {
-    throw unimplemented("blas", "syr", "for row_major layout");
-}
-
-#define SYR_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                    \
-    void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                     \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a, int64_t lda) { \
-        syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda);  \
-    }
-
-SYR_LAUNCHER(float, cublasSsyr)
-SYR_LAUNCHER(double, cublasDsyr)
-// Intel does not support the following two
-SYR_LAUNCHER(std::complex<float>, cublasCsyr)
-SYR_LAUNCHER(std::complex<double>, cublasZsyr)
-#undef SYR_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    throw unimplemented("blas", "syr2", "for row_major layout");
-}
-
-#define SYR2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,    \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                           \
-        syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \
-             lda);                                                                               \
-    }
-
-SYR2_LAUNCHER(float, cublasSsyr2)
-SYR2_LAUNCHER(double, cublasDsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER(std::complex<float>, cublasCsyr2)
-SYR2_LAUNCHER(std::complex<double>, cublasZsyr2)
-
-#undef SYR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void spmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "spmv", "for row_major layout");
-}
-
-#define SPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                       \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta,       \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \
-             incy);                                                                              \
-    }
-
-SPMV_LAUNCHER(float, cublasSspmv)
-SPMV_LAUNCHER(double, cublasDspmv)
-
-#undef SPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a) {
-    throw unimplemented("blas", "spr", "for row_major layout");
-}
-
-#define SPR_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                              \
-    void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,               \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a) {        \
-        spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \
-    }
-
-SPR_LAUNCHER(float, cublasSspr)
-SPR_LAUNCHER(double, cublasDspr)
-
-#undef SPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                 T alpha, sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    throw unimplemented("blas", "spr2", "for row_major layout");
-}
-
-#define SPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                        \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy,     \
-              sycl::buffer<TYPE, 1> &a) {                                                         \
-        spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \
-    }
-
-SPR2_LAUNCHER(float, cublasSspr2)
-SPR2_LAUNCHER(double, cublasDspr2)
-
-#undef SPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &x, int64_t incx) {
-    throw unimplemented("blas", "tbmv", "for row_major layout");
-}
-
-#define TBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,         \
-              int64_t incx) {                                                                     \
-        tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \
-             x, incx);                                                                            \
-    }
-
-TBMV_LAUNCHER(float, cublasStbmv)
-TBMV_LAUNCHER(double, cublasDtbmv)
-TBMV_LAUNCHER(std::complex<float>, cublasCtbmv)
-TBMV_LAUNCHER(std::complex<double>, cublasZtbmv)
-
-#undef TBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &x, int64_t incx) {
-    throw unimplemented("blas", "tbsv", "for row_major layout");
-}
-
-#define TBSV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,         \
-              int64_t incx) {                                                                     \
-        tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \
-             x, incx);                                                                            \
-    }
-
-TBSV_LAUNCHER(float, cublasStbsv)
-TBSV_LAUNCHER(double, cublasDtbsv)
-TBSV_LAUNCHER(std::complex<float>, cublasCtbsv)
-TBSV_LAUNCHER(std::complex<double>, cublasZtbsv)
-
-#undef TBSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    throw unimplemented("blas", "tpmv", "for row_major layout");
-}
-
-#define TPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x,    \
-             incx);                                                                             \
-    }
-
-TPMV_LAUNCHER(float, cublasStpmv)
-TPMV_LAUNCHER(double, cublasDtpmv)
-TPMV_LAUNCHER(std::complex<float>, cublasCtpmv)
-TPMV_LAUNCHER(std::complex<double>, cublasZtpmv)
-
-#undef TPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    throw unimplemented("blas", "tpsv", "for row_major layout");
-}
-
-#define TPSV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x,    \
-             incx);                                                                             \
-    }
-
-TPSV_LAUNCHER(float, cublasStpsv)
-TPSV_LAUNCHER(double, cublasDtpsv)
-TPSV_LAUNCHER(std::complex<float>, cublasCtpsv)
-TPSV_LAUNCHER(std::complex<double>, cublasZtpsv)
-
-#undef TPSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    throw unimplemented("blas", "trmv", "for row_major layout");
-}
-
-#define TRMV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {    \
-        trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \
-             incx);                                                                               \
-    }
-
-TRMV_LAUNCHER(float, cublasStrmv)
-TRMV_LAUNCHER(double, cublasDtrmv)
-TRMV_LAUNCHER(std::complex<float>, cublasCtrmv)
-TRMV_LAUNCHER(std::complex<double>, cublasZtrmv)
-
-#undef TRMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, diag unit_diag, int64_t n, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &x, int64_t incx) {
-    throw unimplemented("blas", "trsv", "for row_major layout");
-}
-
-#define TRSV_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                       \
-    void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {    \
-        trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \
-             incx);                                                                               \
-    }
-
-TRSV_LAUNCHER(float, cublasStrsv)
-TRSV_LAUNCHER(double, cublasDtrsv)
-TRSV_LAUNCHER(std::complex<float>, cublasCtrsv)
-TRSV_LAUNCHER(std::complex<double>, cublasZtrsv)
-
-#undef TRSV_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemv(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                        int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv", "for row_major layout");
-}
-
-#define GEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,       \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx,  \
-                    beta, y, incy, dependencies);                                                 \
-    }
-
-GEMV_LAUNCHER_USM(float, cublasSgemv)
-GEMV_LAUNCHER_USM(double, cublasDgemv)
-GEMV_LAUNCHER_USM(std::complex<float>, cublasCgemv)
-GEMV_LAUNCHER_USM(std::complex<double>, cublasZgemv)
-#undef GEMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                        int64_t m, int64_t n, int64_t kl, int64_t ku, T alpha, const T *a,
-                        int64_t lda, const T *x, int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gbmv", "for row_major layout");
-}
-
-#define GBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,        \
-                     int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x,            \
-                     int64_t incx, TYPE beta, TYPE *y, int64_t incy,                               \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, \
-                    incx, beta, y, incy, dependencies);                                            \
-    }
-
-GBMV_LAUNCHER_USM(float, cublasSgbmv)
-GBMV_LAUNCHER_USM(double, cublasDgbmv)
-GBMV_LAUNCHER_USM(std::complex<float>, cublasCgbmv)
-GBMV_LAUNCHER_USM(std::complex<double>, cublasZgbmv)
-#undef GBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n,
-                       T alpha, const T *x, int64_t incx, const T *y, int64_t incy, T *a,
-                       int64_t lda, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "ger", "for row_major layout");
-}
-
-#define GER_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE)                                               \
-    sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x,     \
-                         int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,         \
-                         const std::vector<sycl::event> &dependencies) {                          \
-        return ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, \
-                   dependencies);                                                                 \
-    }
-
-GER_LAUNCHER_USM(, float, cublasSger)
-GER_LAUNCHER_USM(, double, cublasDger)
-GER_LAUNCHER_USM(u, std::complex<float>, cublasCgeru)
-GER_LAUNCHER_USM(u, std::complex<double>, cublasZgeru)
-GER_LAUNCHER_USM(c, std::complex<float>, cublasCgerc)
-GER_LAUNCHER_USM(c, std::complex<double>, cublasZgerc)
-#undef GER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hbmv", "for row_major layout");
-}
-
-#define HBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x,  \
-                    incx, beta, y, incy, dependencies);                                           \
-    }
-
-HBMV_LAUNCHER_USM(std::complex<float>, cublasChbmv)
-HBMV_LAUNCHER_USM(std::complex<double>, cublasZhbmv)
-#undef HBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx,
-                        T beta, T *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hemv", "for row_major layout");
-}
-
-#define HEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x,    \
-                    incx, beta, y, incy, dependencies);                                          \
-    }
-
-HEMV_LAUNCHER_USM(std::complex<float>, cublasChemv)
-HEMV_LAUNCHER_USM(std::complex<double>, cublasZhemv)
-#undef HEMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, const ScalarType alpha, const DataType *x, int64_t incx,
-                       DataType *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "her", "for row_major layout");
-}
-
-#define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                                   \
-    sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha,      \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda,                   \
-                    const std::vector<sycl::event> &dependencies) {                                \
-        return her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \
-                   dependencies);                                                                  \
-    }
-
-HER_LAUNCHER_USM(float, std::complex<float>, cublasCher)
-HER_LAUNCHER_USM(double, std::complex<double>, cublasZher)
-
-#undef HER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "her2", "for row_major layout");
-}
-
-#define HER2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, lda, dependencies);                                                 \
-    }
-
-HER2_LAUNCHER_USM(std::complex<float>, cublasCher2)
-HER2_LAUNCHER_USM(std::complex<double>, cublasZher2)
-
-#undef HER2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hpmv", "for row_major layout");
-}
-
-#define HPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx,   \
-                    beta, y, incy, dependencies);                                                \
-    }
-
-HPMV_LAUNCHER_USM(std::complex<float>, cublasChpmv)
-HPMV_LAUNCHER_USM(std::complex<double>, cublasZhpmv)
-
-#undef HPMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, const ScalarType alpha, const DataType *x, int64_t incx,
-                       DataType *a, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hpr", "for row_major layout");
-}
-
-#define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE)                              \
-    sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a,                           \
-                    const std::vector<sycl::event> &dependencies) {                           \
-        return hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, \
-                   dependencies);                                                             \
-    }
-
-HPR_LAUNCHER_USM(float, std::complex<float>, cublasChpr)
-HPR_LAUNCHER_USM(double, std::complex<double>, cublasZhpr)
-
-#undef HPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hpr2", "for row_major layout");
-}
-
-#define HPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, dependencies);                                                      \
-    }
-
-HPR2_LAUNCHER_USM(std::complex<float>, cublasChpr2)
-HPR2_LAUNCHER_USM(std::complex<double>, cublasZhpr2)
-
-#undef HPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "sbmv", "for row_major layout");
-}
-
-#define SBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x,  \
-                    incx, beta, y, incy, dependencies);                                           \
-    }
-
-SBMV_LAUNCHER_USM(float, cublasSsbmv)
-SBMV_LAUNCHER_USM(double, cublasDsbmv)
-
-#undef SBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx,
-                        T beta, T *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "symv", "for row_major layout");
-}
-
-#define SYMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x,    \
-                    incx, beta, y, incy, dependencies);                                          \
-    }
-
-SYMV_LAUNCHER_USM(float, cublasSsymv)
-SYMV_LAUNCHER_USM(double, cublasDsymv)
-
-#undef SYMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, T alpha, const T *x, int64_t incx, T *a, int64_t lda,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syr", "for row_major layout");
-}
-
-#define SYR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                     \
-    sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x,    \
-                    int64_t incx, TYPE *a, int64_t lda,                                            \
-                    const std::vector<sycl::event> &dependencies) {                                \
-        return syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \
-                   dependencies);                                                                  \
-    }
-
-SYR_LAUNCHER_USM(float, cublasSsyr)
-SYR_LAUNCHER_USM(double, cublasDsyr)
-// Intel does not support the following two
-SYR_LAUNCHER_USM(std::complex<float>, cublasCsyr)
-SYR_LAUNCHER_USM(std::complex<double>, cublasZsyr)
-#undef SYR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syr2", "for row_major layout");
-}
-
-#define SYR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, lda, dependencies);                                                 \
-    }
-
-SYR2_LAUNCHER_USM(float, cublasSsyr2)
-SYR2_LAUNCHER_USM(double, cublasDsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER_USM(std::complex<float>, cublasCsyr2)
-SYR2_LAUNCHER_USM(std::complex<double>, cublasZsyr2)
-
-#undef SYR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "spmv", "for row_major layout");
-}
-
-#define SPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx,   \
-                    beta, y, incy, dependencies);                                                \
-    }
-
-SPMV_LAUNCHER_USM(float, cublasSspmv)
-SPMV_LAUNCHER_USM(double, cublasDspmv)
-
-#undef SPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                       int64_t n, T alpha, const T *x, int64_t incx, T *a,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "spr", "for row_major layout");
-}
-
-#define SPR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                    int64_t incx, TYPE *a, const std::vector<sycl::event> &dependencies) {      \
-        return spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a,   \
-                   dependencies);                                                               \
-    }
-
-SPR_LAUNCHER_USM(float, cublasSspr)
-SPR_LAUNCHER_USM(double, cublasDspr)
-
-#undef SPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy,
-                        T *a, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "spr2", "for row_major layout");
-}
-
-#define SPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y,   \
-                    incy, a, dependencies);                                                      \
-    }
-
-SPR2_LAUNCHER_USM(float, cublasSspr2)
-SPR2_LAUNCHER_USM(double, cublasDspr2)
-
-#undef SPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a,
-                        int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tbmv", "for row_major layout");
-}
-
-#define TBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,    \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \
-                    a, lda, x, incx, dependencies);                                              \
-    }
-
-TBMV_LAUNCHER_USM(float, cublasStbmv)
-TBMV_LAUNCHER_USM(double, cublasDtbmv)
-TBMV_LAUNCHER_USM(std::complex<float>, cublasCtbmv)
-TBMV_LAUNCHER_USM(std::complex<double>, cublasZtbmv)
-
-#undef TBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a,
-                        int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tbsv", "for row_major layout");
-}
-
-#define TBSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,    \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \
-                    a, lda, x, incx, dependencies);                                              \
-    }
-
-TBSV_LAUNCHER_USM(float, cublasStbsv)
-TBSV_LAUNCHER_USM(double, cublasDtbsv)
-TBSV_LAUNCHER_USM(std::complex<float>, cublasCtbsv)
-TBSV_LAUNCHER_USM(std::complex<double>, cublasZtbsv)
-
-#undef TBSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tpmv", "for row_major layout");
-}
-
-#define TPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    x, incx, dependencies);                                                      \
-    }
-
-TPMV_LAUNCHER_USM(float, cublasStpmv)
-TPMV_LAUNCHER_USM(double, cublasDtpmv)
-TPMV_LAUNCHER_USM(std::complex<float>, cublasCtpmv)
-TPMV_LAUNCHER_USM(std::complex<double>, cublasZtpmv)
-
-#undef TPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tpsv", "for row_major layout");
-}
-
-#define TPSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    x, incx, dependencies);                                                      \
-    }
-
-TPSV_LAUNCHER_USM(float, cublasStpsv)
-TPSV_LAUNCHER_USM(double, cublasDtpsv)
-TPSV_LAUNCHER_USM(std::complex<float>, cublasCtpsv)
-TPSV_LAUNCHER_USM(std::complex<double>, cublasZtpsv)
-
-#undef TPSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trmv", "for row_major layout");
-}
-
-#define TRMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,               \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    lda, x, incx, dependencies);                                                 \
-    }
-
-TRMV_LAUNCHER_USM(float, cublasStrmv)
-TRMV_LAUNCHER_USM(double, cublasDtrmv)
-TRMV_LAUNCHER_USM(std::complex<float>, cublasCtrmv)
-TRMV_LAUNCHER_USM(std::complex<double>, cublasZtrmv)
-
-#undef TRMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsv", "for row_major layout");
-}
-
-#define TRSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                  \
-    sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,      \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,               \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \
-                    lda, x, incx, dependencies);                                                 \
-    }
-
-TRSV_LAUNCHER_USM(float, cublasStrsv)
-TRSV_LAUNCHER_USM(double, cublasDtrsv)
-TRSV_LAUNCHER_USM(std::complex<float>, cublasCtrsv)
-TRSV_LAUNCHER_USM(std::complex<double>, cublasZtrsv)
-
-#undef TRSV_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp
deleted file mode 100644
index 5ea4e2152..000000000
--- a/src/blas/backends/cublas/cublas_level3.cpp
+++ /dev/null
@@ -1,1336 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_helper.hpp"
-#include "cublas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-namespace column_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-                 transpose transb, int64_t m, int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                 int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
-                                     get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define GEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,     \
-              int64_t k, TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda,                     \
-              sycl::buffer<TYPE, 1> &b, int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c,       \
-              int64_t ldc) {                                                                    \
-        gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, \
-             ldb, beta, c, ldc);                                                                \
-    }
-
-GEMM_LAUNCHER(float, cublasSgemm)
-GEMM_LAUNCHER(double, cublasDgemm)
-GEMM_LAUNCHER(std::complex<float>, cublasCgemm)
-GEMM_LAUNCHER(std::complex<double>, cublasZgemm)
-
-#undef GEMM_LAUNCHER
-
-template <typename T_A, typename T_B, typename T_C, typename DATATYPE_A, typename DATATYPE_B,
-          typename DATATYPE_C>
-inline void gemm_ex(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, sycl::queue &queue,
-                    transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, T_C alpha,
-                    sycl::buffer<T_A, 1> &a, int64_t lda, sycl::buffer<T_B, 1> &b, int64_t ldb,
-                    T_C beta, sycl::buffer<T_C, 1> &c, int64_t ldc) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    using cuDataType_C = typename CudaEquivalentType<T_C>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        if (!verify_support<sycl::half, T_A, T_B, T_C>(queue, sycl::aspect::fp16)) {
-            throw oneapi::mkl::unimplemented(
-                "blas", "sycl::half", "half is not supported by the device or the sycl compiler");
-        }
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType_A *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType_B *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType_C *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_SYNC(cublasGemmEx, err, handle, get_cublas_operation(transa),
-                                   get_cublas_operation(transb), m, n, k, (cuDataType_C *)&alpha,
-                                   a_, DT_A, lda, b_, DT_B, ldb, (cuDataType_C *)&beta, c_, DT_C,
-                                   ldc, DT_C, CUBLAS_GEMM_DEFAULT);
-        });
-    });
-}
-
-#define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C) \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,      \
-              int64_t k, TYPE_C alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,                  \
-              sycl::buffer<TYPE_B, 1> &b, int64_t ldb, TYPE_C beta, sycl::buffer<TYPE_C, 1> &c,  \
-              int64_t ldc) {                                                                     \
-        gemm_ex(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb, m, n, k,  \
-                alpha, a, lda, b, ldb, beta, c, ldc);                                            \
-    }
-
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, float, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F)
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, CUDA_R_16F)
-
-#undef GEMM_EX_LAUNCHER
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<bfloat16, 1> &a, int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemm", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline void symm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                 int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define SYMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,         \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,         \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                     \
-        symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \
-             b, ldb, beta, c, ldc);                                                                \
-    }
-
-SYMM_LAUNCHER(float, cublasSsymm)
-SYMM_LAUNCHER(double, cublasDsymm)
-SYMM_LAUNCHER(std::complex<float>, cublasCsymm)
-SYMM_LAUNCHER(std::complex<double>, cublasZsymm)
-
-#undef SYMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                 int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define HEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,         \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,         \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                     \
-        hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \
-             b, ldb, beta, c, ldc);                                                                \
-    }
-HEMM_LAUNCHER(std::complex<float>, cublasChemm)
-HEMM_LAUNCHER(std::complex<double>, cublasZhemm)
-
-#undef HEMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuDataType *)&alpha, a_, lda, (cuDataType *)&beta, c_,
-                                     ldc);
-        });
-    });
-}
-
-#define SYRK_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                   \
-    void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,    \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta,                   \
-              sycl::buffer<TYPE, 1> &c, int64_t ldc) {                                        \
-        syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \
-             beta, c, ldc);                                                                   \
-    }
-
-SYRK_LAUNCHER(float, cublasSsyrk)
-SYRK_LAUNCHER(double, cublasDsyrk)
-SYRK_LAUNCHER(std::complex<float>, cublasCsyrk)
-SYRK_LAUNCHER(std::complex<double>, cublasZsyrk)
-
-#undef SYRK_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, int64_t n, int64_t k, ScalarType alpha,
-                 sycl::buffer<DataType, 1> &a, int64_t lda, ScalarType beta,
-                 sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuScalarType *)&alpha, a_, lda, (cuScalarType *)&beta,
-                                     c_, ldc);
-        });
-    });
-}
-
-#define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                  \
-    void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-              SCALAR_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda, SCALAR_TYPE beta, \
-              sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                    \
-        herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda,  \
-             beta, c, ldc);                                                                    \
-    }
-
-HERK_LAUNCHER(std::complex<float>, float, cublasCherk)
-HERK_LAUNCHER(std::complex<double>, double, cublasZherk)
-
-#undef HERK_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                  transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a,
-                  int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                  int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuDataType *)&alpha, a_, lda, b_, ldb,
-                                     (cuDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define SYR2K_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,       \
-               TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,       \
-               int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                   \
-        syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \
-              ldb, beta, c, ldc);                                                                 \
-    }
-SYR2K_LAUNCHER(float, cublasSsyr2k)
-SYR2K_LAUNCHER(double, cublasDsyr2k)
-SYR2K_LAUNCHER(std::complex<float>, cublasCsyr2k)
-SYR2K_LAUNCHER(std::complex<double>, cublasZsyr2k)
-
-#undef SYR2K_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                  transpose trans, int64_t n, int64_t k, DataType alpha,
-                  sycl::buffer<DataType, 1> &a, int64_t lda, sycl::buffer<DataType, 1> &b,
-                  int64_t ldb, ScalarType beta, sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuDataType *)&alpha, a_, lda, b_, ldb,
-                                     (cuScalarType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                    \
-    void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,       \
-               DATA_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda,                       \
-               sycl::buffer<DATA_TYPE, 1> &b, int64_t ldb, SCALAR_TYPE beta,                      \
-               sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                      \
-        her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \
-              ldb, beta, c, ldc);                                                                 \
-    }
-
-HER2K_LAUNCHER(std::complex<float>, float, cublasCher2k)
-HER2K_LAUNCHER(std::complex<double>, double, cublasZher2k)
-
-#undef HER2K_LAUNCHER
-
-// NOTE: In cublas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline void trmm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, b_, ldb);
-        });
-    });
-}
-
-#define TRMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,              \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,          \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                                \
-        trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \
-             n, alpha, a, lda, b, ldb);                                                            \
-    }
-TRMM_LAUNCHER(float, cublasStrmm)
-TRMM_LAUNCHER(double, cublasDtrmm)
-TRMM_LAUNCHER(std::complex<float>, cublasCtrmm)
-TRMM_LAUNCHER(std::complex<double>, cublasZtrmm)
-
-#undef TRMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb);
-        });
-    });
-}
-
-#define TRSM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,              \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,          \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                                \
-        trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \
-             n, alpha, a, lda, b, ldb);                                                            \
-    }
-TRSM_LAUNCHER(float, cublasStrsm)
-TRSM_LAUNCHER(double, cublasDtrsm)
-TRSM_LAUNCHER(std::complex<float>, cublasCtrsm)
-TRSM_LAUNCHER(std::complex<double>, cublasZtrsm)
-
-#undef TRSM_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-                        transpose transb, int64_t m, int64_t n, int64_t k, T alpha, const T *a,
-                        int64_t lda, const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<const cuDataType *>(b);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa),
-                                     get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define GEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b,             \
-                     int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,                                 \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a,     \
-                    lda, b, ldb, beta, c, ldc, dependencies);                                      \
-    }
-
-GEMM_LAUNCHER_USM(float, cublasSgemm)
-GEMM_LAUNCHER_USM(double, cublasDgemm)
-GEMM_LAUNCHER_USM(std::complex<float>, cublasCgemm)
-GEMM_LAUNCHER_USM(std::complex<double>, cublasZgemm)
-
-#undef GEMM_LAUNCHER_USM
-
-template <typename T_A, typename T_B, typename T_C, typename DATATYPE_A, typename DATATYPE_B,
-          typename DATATYPE_C>
-inline sycl::event gemm_ex_usm(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C,
-                               sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                               int64_t n, int64_t k, T_C alpha, const T_A *a, int64_t lda,
-                               const T_B *b, int64_t ldb, T_C beta, T_C *c, int64_t ldc,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    using cuDataType_C = typename CudaEquivalentType<T_C>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType_A *>(a);
-            auto b_ = reinterpret_cast<const cuDataType_B *>(b);
-            auto c_ = reinterpret_cast<cuDataType_C *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_SYNC(cublasGemmEx, err, handle, get_cublas_operation(transa),
-                                   get_cublas_operation(transb), m, n, k, (cuDataType_C *)&alpha,
-                                   a_, DT_A, lda, b_, DT_B, ldb, (cuDataType_C *)&beta, c_, DT_C,
-                                   ldc, DT_C, CUBLAS_GEMM_DEFAULT);
-        });
-    });
-    return done;
-}
-
-#define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B,               \
-                             CUDADATATYPE_C)                                                       \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE_C alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b,       \
-                     int64_t ldb, TYPE_C beta, TYPE_C *c, int64_t ldc,                             \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm_ex_usm(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb,  \
-                           m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);            \
-    }
-
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, float, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F)
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, CUDA_R_16F)
-
-#undef GEMM_EX_LAUNCHER_USM
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b,
-                 int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event symm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
-                        const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<const cuDataType *>(b);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define SYMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \
-                    a, lda, b, ldb, beta, c, ldc, dependencies);                                  \
-    }
-
-SYMM_LAUNCHER_USM(float, cublasSsymm)
-SYMM_LAUNCHER_USM(double, cublasDsymm)
-SYMM_LAUNCHER_USM(std::complex<float>, cublasCsymm)
-SYMM_LAUNCHER_USM(std::complex<double>, cublasZsymm)
-
-#undef SYMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
-                        const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<const cuDataType *>(b);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define HEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \
-                    a, lda, b, ldb, beta, c, ldc, dependencies);                                  \
-    }
-HEMM_LAUNCHER_USM(std::complex<float>, cublasChemm)
-HEMM_LAUNCHER_USM(std::complex<double>, cublasZhemm)
-
-#undef HEMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, int64_t n, int64_t k, T alpha, const T *a, int64_t lda,
-                        T beta, T *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuDataType *)&alpha, a_, lda, (cuDataType *)&beta, c_,
-                                     ldc);
-        });
-    });
-    return done;
-}
-
-#define SYRK_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                     TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc,     \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                    lda, beta, c, ldc, dependencies);                                             \
-    }
-
-SYRK_LAUNCHER_USM(float, cublasSsyrk)
-SYRK_LAUNCHER_USM(double, cublasDsyrk)
-SYRK_LAUNCHER_USM(std::complex<float>, cublasCsyrk)
-SYRK_LAUNCHER_USM(std::complex<double>, cublasZsyrk)
-
-#undef SYRK_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, int64_t n, int64_t k, const ScalarType alpha,
-                        const DataType *a, int64_t lda, const ScalarType beta, DataType *c,
-                        int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuScalarType *)&alpha, a_, lda, (cuScalarType *)&beta,
-                                     c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                 \
-    sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                     const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda,                    \
-                     const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,                           \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                    lda, beta, c, ldc, dependencies);                                             \
-    }
-
-HERK_LAUNCHER_USM(std::complex<float>, float, cublasCherk)
-HERK_LAUNCHER_USM(std::complex<double>, double, cublasZherk)
-
-#undef HERK_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, int64_t n, int64_t k, T alpha, const T *a, int64_t lda,
-                         const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<const cuDataType *>(b);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuDataType *)&alpha, a_, lda, b_, ldb,
-                                     (cuDataType *)&beta, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define SYR2K_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                      TYPE beta, TYPE *c, int64_t ldc,                                             \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                     lda, b, ldb, beta, c, ldc, dependencies);                                     \
-    }
-SYR2K_LAUNCHER_USM(float, cublasSsyr2k)
-SYR2K_LAUNCHER_USM(double, cublasDsyr2k)
-SYR2K_LAUNCHER_USM(std::complex<float>, cublasCsyr2k)
-SYR2K_LAUNCHER_USM(std::complex<double>, cublasZsyr2k)
-
-#undef SYR2K_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, int64_t n, int64_t k, const DataType alpha,
-                         const DataType *a, int64_t lda, const DataType *b, int64_t ldb,
-                         const ScalarType beta, DataType *c, int64_t ldc,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<DataType>::Type;
-    using cuScalarType = typename CudaEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<const cuDataType *>(b);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle,
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     n, k, (cuDataType *)&alpha, a_, lda, b_, ldb,
-                                     (cuScalarType *)&beta, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                 \
-    sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b,  \
-                      int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,              \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                     lda, b, ldb, beta, c, ldc, dependencies);                                     \
-    }
-
-HER2K_LAUNCHER_USM(std::complex<float>, float, cublasCher2k)
-HER2K_LAUNCHER_USM(std::complex<double>, double, cublasZher2k)
-
-#undef HER2K_LAUNCHER_USM
-
-// NOTE: In cublas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline sycl::event trmm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                        T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb, b_, ldb);
-        });
-    });
-    return done;
-}
-
-#define TRMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans,        \
-                    unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);                         \
-    }
-TRMM_LAUNCHER_USM(float, cublasStrmm)
-TRMM_LAUNCHER_USM(double, cublasDtrmm)
-TRMM_LAUNCHER_USM(std::complex<float>, cublasCtrmm)
-TRMM_LAUNCHER_USM(std::complex<double>, cublasZtrmm)
-
-#undef TRMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                        T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<const cuDataType *>(a);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            cublasStatus_t err;
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right),
-                                     get_cublas_fill_mode(upper_lower), get_cublas_operation(trans),
-                                     get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha,
-                                     a_, lda, b_, ldb);
-        });
-    });
-    return done;
-}
-
-#define TRSM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans,        \
-                    unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);                         \
-    }
-TRSM_LAUNCHER_USM(float, cublasStrsm)
-TRSM_LAUNCHER_USM(double, cublasDtrsm)
-TRSM_LAUNCHER_USM(std::complex<float>, cublasCtrsm)
-TRSM_LAUNCHER_USM(std::complex<double>, cublasZtrsm)
-
-#undef TRSM_LAUNCHER_USM
-
-} // namespace column_major
-namespace row_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-                 transpose transb, int64_t m, int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                 int64_t ldc) {
-    throw unimplemented("blas", "gemm", "for row_major layout");
-}
-
-#define GEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                     \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,     \
-              int64_t k, TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda,                     \
-              sycl::buffer<TYPE, 1> &b, int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c,       \
-              int64_t ldc) {                                                                    \
-        gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, \
-             ldb, beta, c, ldc);                                                                \
-    }
-
-GEMM_LAUNCHER(float, cublasSgemm)
-GEMM_LAUNCHER(double, cublasDgemm)
-GEMM_LAUNCHER(std::complex<float>, cublasCgemm)
-GEMM_LAUNCHER(std::complex<double>, cublasZgemm)
-
-#undef GEMM_LAUNCHER
-
-template <typename T_A, typename T_B, typename T_C, typename DATATYPE_A, typename DATATYPE_B,
-          typename DATATYPE_C>
-inline void gemm_ex(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, sycl::queue &queue,
-                    transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, T_C alpha,
-                    sycl::buffer<T_A, 1> &a, int64_t lda, sycl::buffer<T_B, 1> &b, int64_t ldb,
-                    T_C beta, sycl::buffer<T_C, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemm", "for row_major layout");
-}
-
-#define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C) \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,      \
-              int64_t k, TYPE_C alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,                  \
-              sycl::buffer<TYPE_B, 1> &b, int64_t ldb, TYPE_C beta, sycl::buffer<TYPE_C, 1> &c,  \
-              int64_t ldc) {                                                                     \
-        gemm_ex(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb, m, n, k,  \
-                alpha, a, lda, b, ldb, beta, c, ldc);                                            \
-    }
-
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, float, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F)
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, CUDA_R_16F)
-
-#undef GEMM_EX_LAUNCHER
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<bfloat16, 1> &a, int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemm", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline void symm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                 int64_t ldc) {
-    throw unimplemented("blas", "symm", "for row_major layout");
-}
-
-#define SYMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,         \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,         \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                     \
-        symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \
-             b, ldb, beta, c, ldc);                                                                \
-    }
-
-SYMM_LAUNCHER(float, cublasSsymm)
-SYMM_LAUNCHER(double, cublasDsymm)
-SYMM_LAUNCHER(std::complex<float>, cublasCsymm)
-SYMM_LAUNCHER(std::complex<double>, cublasZsymm)
-
-#undef SYMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a,
-                 int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                 int64_t ldc) {
-    throw unimplemented("blas", "hemm", "for row_major layout");
-}
-
-#define HEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,         \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,         \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                     \
-        hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \
-             b, ldb, beta, c, ldc);                                                                \
-    }
-HEMM_LAUNCHER(std::complex<float>, cublasChemm)
-HEMM_LAUNCHER(std::complex<double>, cublasZhemm)
-
-#undef HEMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "syrk", "for row_major layout");
-}
-
-#define SYRK_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                   \
-    void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,    \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta,                   \
-              sycl::buffer<TYPE, 1> &c, int64_t ldc) {                                        \
-        syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \
-             beta, c, ldc);                                                                   \
-    }
-
-SYRK_LAUNCHER(float, cublasSsyrk)
-SYRK_LAUNCHER(double, cublasDsyrk)
-SYRK_LAUNCHER(std::complex<float>, cublasCsyrk)
-SYRK_LAUNCHER(std::complex<double>, cublasZsyrk)
-
-#undef SYRK_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                 transpose trans, int64_t n, int64_t k, ScalarType alpha,
-                 sycl::buffer<DataType, 1> &a, int64_t lda, ScalarType beta,
-                 sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "herk", "for row_major layout");
-}
-
-#define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                  \
-    void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-              SCALAR_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda, SCALAR_TYPE beta, \
-              sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                    \
-        herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda,  \
-             beta, c, ldc);                                                                    \
-    }
-
-HERK_LAUNCHER(std::complex<float>, float, cublasCherk)
-HERK_LAUNCHER(std::complex<double>, double, cublasZherk)
-
-#undef HERK_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                  transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a,
-                  int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c,
-                  int64_t ldc) {
-    throw unimplemented("blas", "syr2k", "for row_major layout");
-}
-
-#define SYR2K_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                      \
-    void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,       \
-               TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,       \
-               int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                   \
-        syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \
-              ldb, beta, c, ldc);                                                                 \
-    }
-SYR2K_LAUNCHER(float, cublasSsyr2k)
-SYR2K_LAUNCHER(double, cublasDsyr2k)
-SYR2K_LAUNCHER(std::complex<float>, cublasCsyr2k)
-SYR2K_LAUNCHER(std::complex<double>, cublasZsyr2k)
-
-#undef SYR2K_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                  transpose trans, int64_t n, int64_t k, DataType alpha,
-                  sycl::buffer<DataType, 1> &a, int64_t lda, sycl::buffer<DataType, 1> &b,
-                  int64_t ldb, ScalarType beta, sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "her2k", "for row_major layout");
-}
-
-#define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                    \
-    void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,       \
-               DATA_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda,                       \
-               sycl::buffer<DATA_TYPE, 1> &b, int64_t ldb, SCALAR_TYPE beta,                      \
-               sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                      \
-        her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \
-              ldb, beta, c, ldc);                                                                 \
-    }
-
-HER2K_LAUNCHER(std::complex<float>, float, cublasCher2k)
-HER2K_LAUNCHER(std::complex<double>, double, cublasZher2k)
-
-#undef HER2K_LAUNCHER
-
-// NOTE: In cublas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline void trmm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb) {
-    throw unimplemented("blas", "trmm", "for row_major layout");
-}
-
-#define TRMM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,              \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,          \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                                \
-        trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \
-             n, alpha, a, lda, b, ldb);                                                            \
-    }
-TRMM_LAUNCHER(float, cublasStrmm)
-TRMM_LAUNCHER(double, cublasDtrmm)
-TRMM_LAUNCHER(std::complex<float>, cublasCtrmm)
-TRMM_LAUNCHER(std::complex<double>, cublasZtrmm)
-
-#undef TRMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                 uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b, int64_t ldb) {
-    throw unimplemented("blas", "trsm", "for row_major layout");
-}
-
-#define TRSM_LAUNCHER(TYPE, CUBLAS_ROUTINE)                                                        \
-    void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,              \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,          \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                                \
-        trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \
-             n, alpha, a, lda, b, ldb);                                                            \
-    }
-TRSM_LAUNCHER(float, cublasStrsm)
-TRSM_LAUNCHER(double, cublasDtrsm)
-TRSM_LAUNCHER(std::complex<float>, cublasCtrsm)
-TRSM_LAUNCHER(std::complex<double>, cublasZtrsm)
-
-#undef TRSM_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa,
-                        transpose transb, int64_t m, int64_t n, int64_t k, T alpha, const T *a,
-                        int64_t lda, const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", "for row_major layout");
-}
-
-#define GEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b,             \
-                     int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,                                 \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a,     \
-                    lda, b, ldb, beta, c, ldc, dependencies);                                      \
-    }
-
-GEMM_LAUNCHER_USM(float, cublasSgemm)
-GEMM_LAUNCHER_USM(double, cublasDgemm)
-GEMM_LAUNCHER_USM(std::complex<float>, cublasCgemm)
-GEMM_LAUNCHER_USM(std::complex<double>, cublasZgemm)
-
-#undef GEMM_LAUNCHER_USM
-
-template <typename T_A, typename T_B, typename T_C, typename DATATYPE_A, typename DATATYPE_B,
-          typename DATATYPE_C>
-inline sycl::event gemm_ex_usm(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C,
-                               sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                               int64_t n, int64_t k, T_C alpha, const T_A *a, int64_t lda,
-                               const T_B *b, int64_t ldb, T_C beta, T_C *c, int64_t ldc,
-                               const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", "for row_major layout");
-}
-
-#define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B,               \
-                             CUDADATATYPE_C)                                                       \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE_C alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b,       \
-                     int64_t ldb, TYPE_C beta, TYPE_C *c, int64_t ldc,                             \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm_ex_usm(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb,  \
-                           m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);            \
-    }
-
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, float, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F)
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, CUDA_R_16F)
-
-#undef GEMM_EX_LAUNCHER_USM
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b,
-                 int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event symm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
-                        const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "symm", "for row_major layout");
-}
-
-#define SYMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \
-                    a, lda, b, ldb, beta, c, ldc, dependencies);                                  \
-    }
-
-SYMM_LAUNCHER_USM(float, cublasSsymm)
-SYMM_LAUNCHER_USM(double, cublasDsymm)
-SYMM_LAUNCHER_USM(std::complex<float>, cublasCsymm)
-SYMM_LAUNCHER_USM(std::complex<double>, cublasZsymm)
-
-#undef SYMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda,
-                        const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hemm", "for row_major layout");
-}
-
-#define HEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \
-                    a, lda, b, ldb, beta, c, ldc, dependencies);                                  \
-    }
-HEMM_LAUNCHER_USM(std::complex<float>, cublasChemm)
-HEMM_LAUNCHER_USM(std::complex<double>, cublasZhemm)
-
-#undef HEMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, int64_t n, int64_t k, T alpha, const T *a, int64_t lda,
-                        T beta, T *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk", "for row_major layout");
-}
-
-#define SYRK_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                     TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc,     \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                    lda, beta, c, ldc, dependencies);                                             \
-    }
-
-SYRK_LAUNCHER_USM(float, cublasSsyrk)
-SYRK_LAUNCHER_USM(double, cublasDsyrk)
-SYRK_LAUNCHER_USM(std::complex<float>, cublasCsyrk)
-SYRK_LAUNCHER_USM(std::complex<double>, cublasZsyrk)
-
-#undef SYRK_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                        transpose trans, int64_t n, int64_t k, const ScalarType alpha,
-                        const DataType *a, int64_t lda, const ScalarType beta, DataType *c,
-                        int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "herk", "for row_major layout");
-}
-
-#define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                 \
-    sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                     const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda,                    \
-                     const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,                           \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                    lda, beta, c, ldc, dependencies);                                             \
-    }
-
-HERK_LAUNCHER_USM(std::complex<float>, float, cublasCherk)
-HERK_LAUNCHER_USM(std::complex<double>, double, cublasZherk)
-
-#undef HERK_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, int64_t n, int64_t k, T alpha, const T *a, int64_t lda,
-                         const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                         const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syr2k", "for row_major layout");
-}
-
-#define SYR2K_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                   \
-    sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                      TYPE beta, TYPE *c, int64_t ldc,                                             \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                     lda, b, ldb, beta, c, ldc, dependencies);                                     \
-    }
-SYR2K_LAUNCHER_USM(float, cublasSsyr2k)
-SYR2K_LAUNCHER_USM(double, cublasDsyr2k)
-SYR2K_LAUNCHER_USM(std::complex<float>, cublasCsyr2k)
-SYR2K_LAUNCHER_USM(std::complex<double>, cublasZsyr2k)
-
-#undef SYR2K_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower,
-                         transpose trans, int64_t n, int64_t k, const DataType alpha,
-                         const DataType *a, int64_t lda, const DataType *b, int64_t ldb,
-                         const ScalarType beta, DataType *c, int64_t ldc,
-                         const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "her2k", "for row_major layout");
-}
-
-#define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE)                                 \
-    sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b,  \
-                      int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,              \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a,   \
-                     lda, b, ldb, beta, c, ldc, dependencies);                                     \
-    }
-
-HER2K_LAUNCHER_USM(std::complex<float>, float, cublasCher2k)
-HER2K_LAUNCHER_USM(std::complex<double>, double, cublasZher2k)
-
-#undef HER2K_LAUNCHER_USM
-
-// NOTE: In cublas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline sycl::event trmm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                        T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trmm", "for row_major layout");
-}
-
-#define TRMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans,        \
-                    unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);                         \
-    }
-TRMM_LAUNCHER_USM(float, cublasStrmm)
-TRMM_LAUNCHER_USM(double, cublasDtrmm)
-TRMM_LAUNCHER_USM(std::complex<float>, cublasCtrmm)
-TRMM_LAUNCHER_USM(std::complex<double>, cublasZtrmm)
-
-#undef TRMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm(const char *func_name, Func func, sycl::queue &queue, side left_right,
-                        uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n,
-                        T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm", "for row_major layout");
-}
-
-#define TRSM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE)                                                    \
-    sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans,        \
-                    unit_diag, m, n, alpha, a, lda, b, ldb, dependencies);                         \
-    }
-TRSM_LAUNCHER_USM(float, cublasStrsm)
-TRSM_LAUNCHER_USM(double, cublasDtrsm)
-TRSM_LAUNCHER_USM(std::complex<float>, cublasCtrsm)
-TRSM_LAUNCHER_USM(std::complex<double>, cublasZtrsm)
-
-#undef TRSM_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/cublas/cublas_scope_handle.cpp b/src/blas/backends/cublas/cublas_scope_handle.cpp
deleted file mode 100644
index 05d1c1935..000000000
--- a/src/blas/backends/cublas/cublas_scope_handle.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_scope_handle.hpp"
-#if __has_include(<sycl/detail/common.hpp>)
-#include <sycl/detail/common.hpp>
-#else
-#include <CL/sycl/detail/common.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-
-/**
- * Inserts a new element in the map if its key is unique. This new element
- * is constructed in place using args as the arguments for the construction
- * of a value_type (which is an object of a pair type). The insertion only
- * takes place if no other element in the container has a key equivalent to
- * the one being emplaced (keys in a map container are unique).
- */
-thread_local cublas_handle<pi_context> CublasScopedContextHandler::handle_helper =
-    cublas_handle<pi_context>{};
-
-CublasScopedContextHandler::CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih)
-        : ih(ih),
-          needToRecover_(false) {
-    placedContext_ = new sycl::context(queue.get_context());
-    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
-    CUresult err;
-    CUcontext desired;
-    CUDA_ERROR_FUNC(cuCtxGetCurrent, err, &original_);
-    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, err, &desired, cudaDevice);
-    if (original_ != desired) {
-        // Sets the desired context as the active one for the thread
-        CUDA_ERROR_FUNC(cuCtxSetCurrent, err, desired);
-        // No context is installed and the suggested context is primary
-        // This is the most common case. We can activate the context in the
-        // thread and leave it there until all the PI context referring to the
-        // same underlying CUDA primary context are destroyed. This emulates
-        // the behaviour of the CUDA runtime api, and avoids costly context
-        // switches. No action is required on this side of the if.
-        needToRecover_ = !(original_ == nullptr);
-    }
-}
-
-CublasScopedContextHandler::~CublasScopedContextHandler() noexcept(false) {
-    if (needToRecover_) {
-        CUresult err;
-        CUDA_ERROR_FUNC(cuCtxSetCurrent, err, original_);
-    }
-    delete placedContext_;
-}
-
-void ContextCallback(void *userData) {
-    auto *ptr = static_cast<std::atomic<cublasHandle_t> *>(userData);
-    if (!ptr) {
-        return;
-    }
-    auto handle = ptr->exchange(nullptr);
-    if (handle != nullptr) {
-        cublasStatus_t err1;
-        CUBLAS_ERROR_FUNC(cublasDestroy, err1, handle);
-        handle = nullptr;
-    }
-    else {
-        // if the handle is nullptr it means the handle was already destroyed by
-        // the cublas_handle destructor and we're free to delete the atomic
-        // object.
-        delete ptr;
-    }
-}
-
-cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue &queue) {
-    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
-    CUresult cuErr;
-    CUcontext desired;
-    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, cuErr, &desired, cudaDevice);
-    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
-    CUstream streamId = get_stream(queue);
-    cublasStatus_t err;
-    auto it = handle_helper.cublas_handle_mapper_.find(piPlacedContext_);
-    if (it != handle_helper.cublas_handle_mapper_.end()) {
-        if (it->second == nullptr) {
-            handle_helper.cublas_handle_mapper_.erase(it);
-        }
-        else {
-            auto handle = it->second->load();
-            if (handle != nullptr) {
-                cudaStream_t currentStreamId;
-                CUBLAS_ERROR_FUNC(cublasGetStream, err, handle, &currentStreamId);
-                if (currentStreamId != streamId) {
-                    CUBLAS_ERROR_FUNC(cublasSetStream, err, handle, streamId);
-                }
-                return handle;
-            }
-            else {
-                handle_helper.cublas_handle_mapper_.erase(it);
-            }
-        }
-    }
-
-    cublasHandle_t handle;
-
-    CUBLAS_ERROR_FUNC(cublasCreate, err, &handle);
-    CUBLAS_ERROR_FUNC(cublasSetStream, err, handle, streamId);
-
-    auto insert_iter = handle_helper.cublas_handle_mapper_.insert(
-        std::make_pair(piPlacedContext_, new std::atomic<cublasHandle_t>(handle)));
-
-    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
-                                                insert_iter.first->second);
-
-    return handle;
-}
-
-CUstream CublasScopedContextHandler::get_stream(const sycl::queue &queue) {
-    return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
-}
-sycl::context CublasScopedContextHandler::get_context(const sycl::queue &queue) {
-    return queue.get_context();
-}
-
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/cublas/cublas_scope_handle.hpp b/src/blas/backends/cublas/cublas_scope_handle.hpp
deleted file mode 100644
index 7648130be..000000000
--- a/src/blas/backends/cublas/cublas_scope_handle.hpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _CUBLAS_SCOPED_HANDLE_HPP_
-#define _CUBLAS_SCOPED_HANDLE_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#if __has_include(<sycl/context.hpp>)
-#if __SYCL_COMPILER_VERSION <= 20220930
-#include <sycl/backend/cuda.hpp>
-#endif
-#include <sycl/context.hpp>
-#include <sycl/detail/pi.hpp>
-#else
-#include <CL/sycl/backend/cuda.hpp>
-#include <CL/sycl/context.hpp>
-#include <CL/sycl/detail/pi.hpp>
-#endif
-#include <atomic>
-#include <memory>
-#include <thread>
-#include <unordered_map>
-#include "cublas_helper.hpp"
-#include "cublas_handle.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-
-/**
-* @brief NVIDIA advise for handle creation:
-https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated libraries/using-cublas-in-different-cuda-streams/
-According to NVIDIA: 
-1)	It is required that different handles to be used for different devices:
- http://docs.nvidia.com/cuda/cublas/index.html#cublas-context	
-2)	It is recommended (but not required, if care is taken) that different handles be used for different host threads: 
-http://docs.nvidia.com/cuda/cublas/index.html#thread-safety2changeme
-3)	It is neither required nor recommended that different handles be used for different streams on the same device,
- using the same host thread.
-
-However, the 3 above advises are for using cuda runtime API. The NVIDIA runtime API creates a default context for users. 
-The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine 
-uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and 
-memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context 
-and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver 
-context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with 
-CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of CUDA stream) is associated 
-with a context different from the one on top of the thread stack(can be any context which associated at any time by either 
-the runtime or user for any specific reason), the context associated with the queue must be moved on top of the stack 
-temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must 
-be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch 
-is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore, 
-the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
-**/
-
-class CublasScopedContextHandler {
-    CUcontext original_;
-    sycl::context *placedContext_;
-    bool needToRecover_;
-    sycl::interop_handle &ih;
-    static thread_local cublas_handle<pi_context> handle_helper;
-    CUstream get_stream(const sycl::queue &queue);
-    sycl::context get_context(const sycl::queue &queue);
-
-public:
-    CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
-
-    ~CublasScopedContextHandler() noexcept(false);
-    /**
-   * @brief get_handle: creates the handle by implicitly impose the advice
-   * given by nvidia for creating a cublas_handle. (e.g. one cuStream per device
-   * per thread).
-   * @param queue sycl queue.
-   * @return cublasHandle_t a handle to construct cublas routines
-   */
-    cublasHandle_t get_handle(const sycl::queue &queue);
-    // This is a work-around function for reinterpret_casting the memory. This
-    // will be fixed when SYCL-2020 has been implemented for Pi backend.
-    template <typename T, typename U>
-    inline T get_mem(U acc) {
-        CUdeviceptr cudaPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
-        return reinterpret_cast<T>(cudaPtr);
-    }
-
-    void wait_stream(const sycl::queue &queue) {
-        cuStreamSynchronize(get_stream(queue));
-    }
-};
-
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif //_CUBLAS_SCOPED_HANDLE_HPP_
diff --git a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp b/src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp
deleted file mode 100644
index 20675c212..000000000
--- a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_scope_handle_hipsycl.hpp"
-#include "cublas_handle.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-
-thread_local cublas_handle<int> CublasScopedContextHandler::handle_helper = cublas_handle<int>{};
-
-CublasScopedContextHandler::CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih)
-        : interop_h(ih) {}
-
-cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue &queue) {
-    sycl::device device = queue.get_device();
-    int current_device = interop_h.get_native_device<sycl::backend::cuda>();
-    CUstream streamId = get_stream(queue);
-    cublasStatus_t err;
-    auto it = handle_helper.cublas_handle_mapper_.find(current_device);
-    if (it != handle_helper.cublas_handle_mapper_.end()) {
-        if (it->second == nullptr) {
-            handle_helper.cublas_handle_mapper_.erase(it);
-        }
-        else {
-            auto handle = it->second->load();
-            if (handle != nullptr) {
-                cudaStream_t currentStreamId;
-                CUBLAS_ERROR_FUNC(cublasGetStream, err, handle, &currentStreamId);
-                if (currentStreamId != streamId) {
-                    CUBLAS_ERROR_FUNC(cublasSetStream, err, handle, streamId);
-                }
-                return handle;
-            }
-            else {
-                handle_helper.cublas_handle_mapper_.erase(it);
-            }
-        }
-    }
-    cublasHandle_t handle;
-
-    CUBLAS_ERROR_FUNC(cublasCreate, err, &handle);
-    CUBLAS_ERROR_FUNC(cublasSetStream, err, handle, streamId);
-
-    auto insert_iter = handle_helper.cublas_handle_mapper_.insert(
-        std::make_pair(current_device, new std::atomic<cublasHandle_t>(handle)));
-    return handle;
-}
-
-CUstream CublasScopedContextHandler::get_stream(const sycl::queue &queue) {
-    return interop_h.get_native_queue<sycl::backend::cuda>();
-}
-
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
\ No newline at end of file
diff --git a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp b/src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp
deleted file mode 100644
index c7ec3e520..000000000
--- a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP
-#define CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <memory>
-#include <thread>
-#include <unordered_map>
-#include "cublas_helper.hpp"
-#include "cublas_handle.hpp"
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-
-/**
-* @brief NVIDIA advise for handle creation:
-https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated libraries/using-cublas-in-different-cuda-streams/
-According to NVIDIA: 
-1)	It is required that different handles to be used for different devices:
- http://docs.nvidia.com/cuda/cublas/index.html#cublas-context	
-2)	It is recommended (but not required, if care is taken) that different handles be used for different host threads: 
-http://docs.nvidia.com/cuda/cublas/index.html#thread-safety2changeme
-3)	It is neither required nor recommended that different handles be used for different streams on the same device,
- using the same host thread.
-However, the 3 above advises are for using cuda runtime API. The NVIDIA runtime API creates a default context for users. 
-The createHandle function in cuBLAS uses the context located on top of the stack for each thread. Then, the cuBLAS routine 
-uses this context for resource allocation/access. Calling a cuBLAS function with a handle created for context A and 
-memories/queue created for context B results in a segmentation fault. Thus we need to create one handle per context 
-and per thread. A context can have multiple streams, so the important thing here is to have one cublasHandle per driver 
-context and that cuBLAS handle can switch between multiple streams created for that context. Here, we are dealing with 
-CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of CUDA stream) is associated 
-with a context different from the one on top of the thread stack(can be any context which associated at any time by either 
-the runtime or user for any specific reason), the context associated with the queue must be moved on top of the stack 
-temporarily for the requested routine operations. However, after the cuBLAS routine execution, the original context must 
-be restored to prevent intervening with the original user/runtime execution set up. Here, the RAII type context switch 
-is used to guarantee to recover the original CUDA context. The cuBLAS handle allocates internal resources, therefore, 
-the handle must be destroyed when the context goes out of scope. This will bind the life of cuBLAS handle to the SYCL context.
-**/
-
-class CublasScopedContextHandler {
-    sycl::interop_handle interop_h;
-    static thread_local cublas_handle<int> handle_helper;
-    sycl::context get_context(const sycl::queue &queue);
-    CUstream get_stream(const sycl::queue &queue);
-
-public:
-    CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
-
-    cublasHandle_t get_handle(const sycl::queue &queue);
-
-    // This is a work-around function for reinterpret_casting the memory. This
-    // will be fixed when SYCL-2020 has been implemented for Pi backend.
-    template <typename T, typename U>
-    inline T get_mem(U acc) {
-        return reinterpret_cast<T>(interop_h.get_native_mem<sycl::backend::cuda>(acc));
-    }
-};
-
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif //CUBLAS_SCOPED_HANDLE_HIPSYCL_HPP
diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp
deleted file mode 100644
index e5cf0d7c2..000000000
--- a/src/blas/backends/cublas/cublas_task.hpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _MKL_BLAS_CUBLAS_TASK_HPP_
-#define _MKL_BLAS_CUBLAS_TASK_HPP_
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <complex>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl/types.hpp"
-#ifndef __HIPSYCL__
-#include "cublas_scope_handle.hpp"
-#if __has_include(<sycl/detail/pi.hpp>)
-#include <sycl/detail/pi.hpp>
-#else
-#include <CL/sycl/detail/pi.hpp>
-#endif
-#else
-#include "cublas_scope_handle_hipsycl.hpp"
-namespace sycl {
-using interop_handler = sycl::interop_handle;
-}
-#endif
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace cublas {
-
-#ifdef __HIPSYCL__
-template <typename H, typename F>
-static inline void host_task_internal(H &cgh, sycl::queue queue, F f) {
-    cgh.hipSYCL_enqueue_custom_operation([f, queue](sycl::interop_handle ih) {
-        auto sc = CublasScopedContextHandler(queue, ih);
-        f(sc);
-    });
-}
-#else
-template <typename H, typename F>
-static inline void host_task_internal(H &cgh, sycl::queue queue, F f) {
-    cgh.host_task([f, queue](sycl::interop_handle ih) {
-        auto sc = CublasScopedContextHandler(queue, ih);
-        f(sc);
-    });
-}
-#endif
-template <typename H, typename F>
-static inline void onemkl_cublas_host_task(H &cgh, sycl::queue queue, F f) {
-    (void)host_task_internal(cgh, queue, f);
-}
-
-} // namespace cublas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif // _MKL_BLAS_CUBLAS_TASK_HPP_
diff --git a/src/blas/backends/cublas/cublas_wrappers.cpp b/src/blas/backends/cublas/cublas_wrappers.cpp
deleted file mode 100644
index ee5c7239f..000000000
--- a/src/blas/backends/cublas/cublas_wrappers.cpp
+++ /dev/null
@@ -1,1006 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "blas/function_table.hpp"
-#include "oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" blas_function_table_t mkl_blas_table = {
-    WRAPPER_VERSION,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::dot,
-    oneapi::mkl::blas::cublas::column_major::dot,
-    oneapi::mkl::blas::cublas::column_major::dot,
-    oneapi::mkl::blas::cublas::column_major::dotc,
-    oneapi::mkl::blas::cublas::column_major::dotc,
-    oneapi::mkl::blas::cublas::column_major::dotu,
-    oneapi::mkl::blas::cublas::column_major::dotu,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotm,
-    oneapi::mkl::blas::cublas::column_major::rotm,
-    oneapi::mkl::blas::cublas::column_major::rotmg,
-    oneapi::mkl::blas::cublas::column_major::rotmg,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::sdsdot,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::ger,
-    oneapi::mkl::blas::cublas::column_major::ger,
-    oneapi::mkl::blas::cublas::column_major::gerc,
-    oneapi::mkl::blas::cublas::column_major::gerc,
-    oneapi::mkl::blas::cublas::column_major::geru,
-    oneapi::mkl::blas::cublas::column_major::geru,
-    oneapi::mkl::blas::cublas::column_major::hbmv,
-    oneapi::mkl::blas::cublas::column_major::hbmv,
-    oneapi::mkl::blas::cublas::column_major::hemv,
-    oneapi::mkl::blas::cublas::column_major::hemv,
-    oneapi::mkl::blas::cublas::column_major::her,
-    oneapi::mkl::blas::cublas::column_major::her,
-    oneapi::mkl::blas::cublas::column_major::her2,
-    oneapi::mkl::blas::cublas::column_major::her2,
-    oneapi::mkl::blas::cublas::column_major::hpmv,
-    oneapi::mkl::blas::cublas::column_major::hpmv,
-    oneapi::mkl::blas::cublas::column_major::hpr,
-    oneapi::mkl::blas::cublas::column_major::hpr,
-    oneapi::mkl::blas::cublas::column_major::hpr2,
-    oneapi::mkl::blas::cublas::column_major::hpr2,
-    oneapi::mkl::blas::cublas::column_major::sbmv,
-    oneapi::mkl::blas::cublas::column_major::sbmv,
-    oneapi::mkl::blas::cublas::column_major::spmv,
-    oneapi::mkl::blas::cublas::column_major::spmv,
-    oneapi::mkl::blas::cublas::column_major::spr,
-    oneapi::mkl::blas::cublas::column_major::spr,
-    oneapi::mkl::blas::cublas::column_major::spr2,
-    oneapi::mkl::blas::cublas::column_major::spr2,
-    oneapi::mkl::blas::cublas::column_major::symv,
-    oneapi::mkl::blas::cublas::column_major::symv,
-    oneapi::mkl::blas::cublas::column_major::syr,
-    oneapi::mkl::blas::cublas::column_major::syr,
-    oneapi::mkl::blas::cublas::column_major::syr2,
-    oneapi::mkl::blas::cublas::column_major::syr2,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::hemm,
-    oneapi::mkl::blas::cublas::column_major::hemm,
-    oneapi::mkl::blas::cublas::column_major::herk,
-    oneapi::mkl::blas::cublas::column_major::herk,
-    oneapi::mkl::blas::cublas::column_major::her2k,
-    oneapi::mkl::blas::cublas::column_major::her2k,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::asum,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpy_batch,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::axpby,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::copy_batch,
-    oneapi::mkl::blas::cublas::column_major::dot,
-    oneapi::mkl::blas::cublas::column_major::dot,
-    oneapi::mkl::blas::cublas::column_major::dot,
-    oneapi::mkl::blas::cublas::column_major::dotc,
-    oneapi::mkl::blas::cublas::column_major::dotc,
-    oneapi::mkl::blas::cublas::column_major::dotu,
-    oneapi::mkl::blas::cublas::column_major::dotu,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamin,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::iamax,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::nrm2,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rot,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotg,
-    oneapi::mkl::blas::cublas::column_major::rotm,
-    oneapi::mkl::blas::cublas::column_major::rotm,
-    oneapi::mkl::blas::cublas::column_major::rotmg,
-    oneapi::mkl::blas::cublas::column_major::rotmg,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::scal,
-    oneapi::mkl::blas::cublas::column_major::sdsdot,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::swap,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gbmv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::gemv_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::column_major::ger,
-    oneapi::mkl::blas::cublas::column_major::ger,
-    oneapi::mkl::blas::cublas::column_major::gerc,
-    oneapi::mkl::blas::cublas::column_major::gerc,
-    oneapi::mkl::blas::cublas::column_major::geru,
-    oneapi::mkl::blas::cublas::column_major::geru,
-    oneapi::mkl::blas::cublas::column_major::hbmv,
-    oneapi::mkl::blas::cublas::column_major::hbmv,
-    oneapi::mkl::blas::cublas::column_major::hemv,
-    oneapi::mkl::blas::cublas::column_major::hemv,
-    oneapi::mkl::blas::cublas::column_major::her,
-    oneapi::mkl::blas::cublas::column_major::her,
-    oneapi::mkl::blas::cublas::column_major::her2,
-    oneapi::mkl::blas::cublas::column_major::her2,
-    oneapi::mkl::blas::cublas::column_major::hpmv,
-    oneapi::mkl::blas::cublas::column_major::hpmv,
-    oneapi::mkl::blas::cublas::column_major::hpr,
-    oneapi::mkl::blas::cublas::column_major::hpr,
-    oneapi::mkl::blas::cublas::column_major::hpr2,
-    oneapi::mkl::blas::cublas::column_major::hpr2,
-    oneapi::mkl::blas::cublas::column_major::sbmv,
-    oneapi::mkl::blas::cublas::column_major::sbmv,
-    oneapi::mkl::blas::cublas::column_major::spmv,
-    oneapi::mkl::blas::cublas::column_major::spmv,
-    oneapi::mkl::blas::cublas::column_major::spr,
-    oneapi::mkl::blas::cublas::column_major::spr,
-    oneapi::mkl::blas::cublas::column_major::spr2,
-    oneapi::mkl::blas::cublas::column_major::spr2,
-    oneapi::mkl::blas::cublas::column_major::symv,
-    oneapi::mkl::blas::cublas::column_major::symv,
-    oneapi::mkl::blas::cublas::column_major::syr,
-    oneapi::mkl::blas::cublas::column_major::syr,
-    oneapi::mkl::blas::cublas::column_major::syr2,
-    oneapi::mkl::blas::cublas::column_major::syr2,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbmv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tbsv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpmv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::tpsv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trmv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::trsv,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::gemm,
-    oneapi::mkl::blas::cublas::column_major::hemm,
-    oneapi::mkl::blas::cublas::column_major::hemm,
-    oneapi::mkl::blas::cublas::column_major::herk,
-    oneapi::mkl::blas::cublas::column_major::herk,
-    oneapi::mkl::blas::cublas::column_major::her2k,
-    oneapi::mkl::blas::cublas::column_major::her2k,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::symm,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syrk_batch,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::syr2k,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trmm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::trsm,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::trsm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemm_batch,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemmt,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::gemm_bias,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::omatcopy2,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::imatcopy,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::omatadd,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::dot,
-    oneapi::mkl::blas::cublas::row_major::dot,
-    oneapi::mkl::blas::cublas::row_major::dot,
-    oneapi::mkl::blas::cublas::row_major::dotc,
-    oneapi::mkl::blas::cublas::row_major::dotc,
-    oneapi::mkl::blas::cublas::row_major::dotu,
-    oneapi::mkl::blas::cublas::row_major::dotu,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotm,
-    oneapi::mkl::blas::cublas::row_major::rotm,
-    oneapi::mkl::blas::cublas::row_major::rotmg,
-    oneapi::mkl::blas::cublas::row_major::rotmg,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::sdsdot,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::ger,
-    oneapi::mkl::blas::cublas::row_major::ger,
-    oneapi::mkl::blas::cublas::row_major::gerc,
-    oneapi::mkl::blas::cublas::row_major::gerc,
-    oneapi::mkl::blas::cublas::row_major::geru,
-    oneapi::mkl::blas::cublas::row_major::geru,
-    oneapi::mkl::blas::cublas::row_major::hbmv,
-    oneapi::mkl::blas::cublas::row_major::hbmv,
-    oneapi::mkl::blas::cublas::row_major::hemv,
-    oneapi::mkl::blas::cublas::row_major::hemv,
-    oneapi::mkl::blas::cublas::row_major::her,
-    oneapi::mkl::blas::cublas::row_major::her,
-    oneapi::mkl::blas::cublas::row_major::her2,
-    oneapi::mkl::blas::cublas::row_major::her2,
-    oneapi::mkl::blas::cublas::row_major::hpmv,
-    oneapi::mkl::blas::cublas::row_major::hpmv,
-    oneapi::mkl::blas::cublas::row_major::hpr,
-    oneapi::mkl::blas::cublas::row_major::hpr,
-    oneapi::mkl::blas::cublas::row_major::hpr2,
-    oneapi::mkl::blas::cublas::row_major::hpr2,
-    oneapi::mkl::blas::cublas::row_major::sbmv,
-    oneapi::mkl::blas::cublas::row_major::sbmv,
-    oneapi::mkl::blas::cublas::row_major::spmv,
-    oneapi::mkl::blas::cublas::row_major::spmv,
-    oneapi::mkl::blas::cublas::row_major::spr,
-    oneapi::mkl::blas::cublas::row_major::spr,
-    oneapi::mkl::blas::cublas::row_major::spr2,
-    oneapi::mkl::blas::cublas::row_major::spr2,
-    oneapi::mkl::blas::cublas::row_major::symv,
-    oneapi::mkl::blas::cublas::row_major::symv,
-    oneapi::mkl::blas::cublas::row_major::syr,
-    oneapi::mkl::blas::cublas::row_major::syr,
-    oneapi::mkl::blas::cublas::row_major::syr2,
-    oneapi::mkl::blas::cublas::row_major::syr2,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::hemm,
-    oneapi::mkl::blas::cublas::row_major::hemm,
-    oneapi::mkl::blas::cublas::row_major::herk,
-    oneapi::mkl::blas::cublas::row_major::herk,
-    oneapi::mkl::blas::cublas::row_major::her2k,
-    oneapi::mkl::blas::cublas::row_major::her2k,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::asum,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpy_batch,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::axpby,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::copy_batch,
-    oneapi::mkl::blas::cublas::row_major::dot,
-    oneapi::mkl::blas::cublas::row_major::dot,
-    oneapi::mkl::blas::cublas::row_major::dot,
-    oneapi::mkl::blas::cublas::row_major::dotc,
-    oneapi::mkl::blas::cublas::row_major::dotc,
-    oneapi::mkl::blas::cublas::row_major::dotu,
-    oneapi::mkl::blas::cublas::row_major::dotu,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamin,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::iamax,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::nrm2,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rot,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotg,
-    oneapi::mkl::blas::cublas::row_major::rotm,
-    oneapi::mkl::blas::cublas::row_major::rotm,
-    oneapi::mkl::blas::cublas::row_major::rotmg,
-    oneapi::mkl::blas::cublas::row_major::rotmg,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::scal,
-    oneapi::mkl::blas::cublas::row_major::sdsdot,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::swap,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gbmv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::gemv_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::dgmm_batch,
-    oneapi::mkl::blas::cublas::row_major::ger,
-    oneapi::mkl::blas::cublas::row_major::ger,
-    oneapi::mkl::blas::cublas::row_major::gerc,
-    oneapi::mkl::blas::cublas::row_major::gerc,
-    oneapi::mkl::blas::cublas::row_major::geru,
-    oneapi::mkl::blas::cublas::row_major::geru,
-    oneapi::mkl::blas::cublas::row_major::hbmv,
-    oneapi::mkl::blas::cublas::row_major::hbmv,
-    oneapi::mkl::blas::cublas::row_major::hemv,
-    oneapi::mkl::blas::cublas::row_major::hemv,
-    oneapi::mkl::blas::cublas::row_major::her,
-    oneapi::mkl::blas::cublas::row_major::her,
-    oneapi::mkl::blas::cublas::row_major::her2,
-    oneapi::mkl::blas::cublas::row_major::her2,
-    oneapi::mkl::blas::cublas::row_major::hpmv,
-    oneapi::mkl::blas::cublas::row_major::hpmv,
-    oneapi::mkl::blas::cublas::row_major::hpr,
-    oneapi::mkl::blas::cublas::row_major::hpr,
-    oneapi::mkl::blas::cublas::row_major::hpr2,
-    oneapi::mkl::blas::cublas::row_major::hpr2,
-    oneapi::mkl::blas::cublas::row_major::sbmv,
-    oneapi::mkl::blas::cublas::row_major::sbmv,
-    oneapi::mkl::blas::cublas::row_major::spmv,
-    oneapi::mkl::blas::cublas::row_major::spmv,
-    oneapi::mkl::blas::cublas::row_major::spr,
-    oneapi::mkl::blas::cublas::row_major::spr,
-    oneapi::mkl::blas::cublas::row_major::spr2,
-    oneapi::mkl::blas::cublas::row_major::spr2,
-    oneapi::mkl::blas::cublas::row_major::symv,
-    oneapi::mkl::blas::cublas::row_major::symv,
-    oneapi::mkl::blas::cublas::row_major::syr,
-    oneapi::mkl::blas::cublas::row_major::syr,
-    oneapi::mkl::blas::cublas::row_major::syr2,
-    oneapi::mkl::blas::cublas::row_major::syr2,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbmv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tbsv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpmv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::tpsv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trmv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::trsv,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::gemm,
-    oneapi::mkl::blas::cublas::row_major::hemm,
-    oneapi::mkl::blas::cublas::row_major::hemm,
-    oneapi::mkl::blas::cublas::row_major::herk,
-    oneapi::mkl::blas::cublas::row_major::herk,
-    oneapi::mkl::blas::cublas::row_major::her2k,
-    oneapi::mkl::blas::cublas::row_major::her2k,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::symm,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syrk_batch,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::syr2k,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trmm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::trsm,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::trsm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemm_batch,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemmt,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::gemm_bias,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatadd_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::omatcopy2,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::imatcopy,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::omatadd,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::cublas::row_major::imatcopy_batch,
-};
diff --git a/src/blas/backends/mkl_common/mkl_batch.cxx b/src/blas/backends/mkl_common/mkl_batch.cxx
deleted file mode 100644
index 6358a3922..000000000
--- a/src/blas/backends/mkl_common/mkl_batch.cxx
+++ /dev/null
@@ -1,1072 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-                std::int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-                std::int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-                int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size) {
-    blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-    blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stride_x, float beta, sycl::buffer<float, 1> &y, int64_t incy,
-                int64_t stride_y, int64_t batch_size) {
-    blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y,
-                           incy, stride_y, batch_size);
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x, double beta,
-                sycl::buffer<double, 1> &y, int64_t incy, int64_t stride_y, int64_t batch_size) {
-    blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y,
-                           incy, stride_y, batch_size);
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-                int64_t stride_x, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stride_y, int64_t batch_size) {
-    blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y,
-                           incy, stride_y, batch_size);
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-                int64_t stride_x, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stride_y,
-                int64_t batch_size) {
-    blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y,
-                           incy, stride_y, batch_size);
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stride_x, sycl::buffer<float, 1> &c, int64_t ldc,
-                int64_t stride_c, int64_t batch_size) {
-    blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc,
-                           stride_c, batch_size);
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc,
-                           stride_c, batch_size);
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc,
-                           stride_c, batch_size);
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc,
-                           stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                           stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, double beta,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                           stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                int64_t stride_b, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                           stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                int64_t ldb, int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                           stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb, int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                           stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                           stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    throw unimplemented("blas", "gemm_batch",
-                        "unsupported dtype combination: int8_t, int8_t, float, float");
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                           stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-    blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                           stride_a, b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<double, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-    blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                           stride_a, b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                           stride_a, b, ldb, stride_b, batch_size);
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-    blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda,
-                           stride_a, b, ldb, stride_b, batch_size);
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, float beta,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-                           stride_c, batch_size);
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                double beta, sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-                           stride_c, batch_size);
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-                           stride_c, batch_size);
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-    blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc,
-                           stride_c, batch_size);
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                               batch_size);
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                               batch_size);
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                    int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                    int64_t stride_b, int64_t batch_size) {
-    blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                               batch_size);
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                    int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    int64_t ldb, int64_t stride_b, int64_t batch_size) {
-    blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                               batch_size);
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                   float beta, sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb,
-                              stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                   double beta, sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-    blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb,
-                              stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                   int64_t stride_a, std::complex<float> beta,
-                   sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-    blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb,
-                              stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                   int64_t lda, int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-    blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb,
-                              stride_b, c, ldc, stride_c, batch_size);
-}
-
-// USM APIs
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx,
-                       std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx,
-                       std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                       std::int64_t stridex, std::complex<float> *y, int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                       std::int64_t stridex, std::complex<double> *y, int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<float> **x, int64_t *incx,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                       int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                       int64_t stridex, double *y, int64_t incy, int64_t stridey,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *x, int64_t incx, int64_t stridex,
-                       std::complex<float> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *x, int64_t incx, int64_t stridex,
-                       std::complex<double> *y, int64_t incy, int64_t stridey, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x,
-                       int64_t *incx, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
-                       int64_t *incy, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size,
-                                  dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha,
-                       const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx,
-                       int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x,
-                                  beta, y, incy, stride_y, batch_size, dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha,
-                       const double *a, int64_t lda, int64_t stride_a, const double *x,
-                       int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy,
-                       int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x,
-                                  beta, y, incy, stride_y, batch_size, dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, const std::complex<float> *x, int64_t incx,
-                       int64_t stride_x, std::complex<float> beta, std::complex<float> *y,
-                       int64_t incy, int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x,
-                                  beta, y, incy, stride_y, batch_size, dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, const std::complex<double> *x, int64_t incx,
-                       int64_t stride_x, std::complex<double> beta, std::complex<double> *y,
-                       int64_t incy, int64_t stride_y, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x,
-                                  beta, y, incy, stride_y, batch_size, dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta,
-                       float **y, int64_t *incy, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx,
-                       double *beta, double **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                       const std::complex<float> **x, int64_t *incx, std::complex<float> *beta,
-                       std::complex<float> **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **a, int64_t *lda,
-                       const std::complex<double> **x, int64_t *incx, std::complex<double> *beta,
-                       std::complex<double> **y, int64_t *incy, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a,
-                       int64_t lda, int64_t stride_a, const float *x, int64_t incx,
-                       int64_t stride_x, float *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a,
-                       int64_t lda, int64_t stride_a, const double *x, int64_t incx,
-                       int64_t stride_x, double *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       const std::complex<float> *x, int64_t incx, int64_t stride_x,
-                       std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       const std::complex<double> *x, int64_t incx, int64_t stride_x,
-                       std::complex<double> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const float **a, int64_t *lda, const float **x, int64_t *incx, float **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count,
-                                  groupsize, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const double **a, int64_t *lda, const double **x, int64_t *incx, double **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count,
-                                  groupsize, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<float> **a, int64_t *lda, const std::complex<float> **x,
-                       int64_t *incx, std::complex<float> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count,
-                                  groupsize, dependencies);
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                       const std::complex<double> **a, int64_t *lda, const std::complex<double> **x,
-                       int64_t *incx, std::complex<double> **c, int64_t *ldc, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count,
-                                  groupsize, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                       const float *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                                  stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                       const double *b, int64_t ldb, int64_t stride_b, double beta, double *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                                  stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                       int64_t lda, int64_t stride_a, const std::complex<float> *b, int64_t ldb,
-                       int64_t stride_b, std::complex<float> beta, std::complex<float> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                                  stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                       int64_t lda, int64_t stride_a, const std::complex<double> *b, int64_t ldb,
-                       int64_t stride_b, std::complex<double> beta, std::complex<double> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                                  stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda,
-                       int64_t stride_a, const sycl::half *b, int64_t ldb, int64_t stride_b,
-                       sycl::half beta, sycl::half *c, int64_t ldc, int64_t stride_c,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                                  stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const sycl::half *a, int64_t lda, int64_t stride_a,
-                       const sycl::half *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                                  stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a,
-                       const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch",
-                        "unsupported dtype combination: int8_t, int8_t, float, float");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a,
-                       const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta,
-                       std::int32_t *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb,
-                                  stride_b, beta, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda,
-                       const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                                  ldc, group_count, group_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda,
-                       const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc,
-                       int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                                  ldc, group_count, group_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, std::complex<float> *alpha,
-                       const std::complex<float> **a, int64_t *lda, const std::complex<float> **b,
-                       int64_t *ldb, std::complex<float> *beta, std::complex<float> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                                  ldc, group_count, group_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, std::complex<double> *alpha,
-                       const std::complex<double> **a, int64_t *lda, const std::complex<double> **b,
-                       int64_t *ldb, std::complex<double> *beta, std::complex<double> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                                  ldc, group_count, group_size, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, sycl::half *alpha, const sycl::half **a,
-                       int64_t *lda, const sycl::half **b, int64_t *ldb, sycl::half *beta,
-                       sycl::half **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                                  ldc, group_count, groupsize, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const sycl::half **a, int64_t *lda,
-                       const sycl::half **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                                  ldc, group_count, groupsize, dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda,
-                       const std::int8_t **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch",
-                        "unsupported dtype combination: int8_t, int8_t, float, float");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda,
-                       const std::int8_t **b, int64_t *ldb, float *beta, std::int32_t **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                                  ldc, group_count, groupsize, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, float alpha, const float *a,
-                       int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, double alpha, const double *a,
-                       int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b,
-                       int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                       std::complex<float> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                       std::complex<double> *b, int64_t ldb, int64_t stride_b, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, float *alpha, const float **a,
-                       int64_t *lda, float **b, int64_t *ldb, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, double *alpha, const double **a,
-                       int64_t *lda, double **b, int64_t *ldb, int64_t group_count,
-                       int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **a, int64_t *lda, std::complex<float> **b,
-                       int64_t *ldb, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans,
-                       diag *unit_diag, int64_t *m, int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **a, int64_t *lda, std::complex<double> **b,
-                       int64_t *ldb, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a,
-                                  lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       float alpha, const float *a, int64_t lda, int64_t stride_a, float beta,
-                       float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       double alpha, const double *a, int64_t lda, int64_t stride_a, double beta,
-                       double *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                       int64_t stride_a, std::complex<float> beta, std::complex<float> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                       int64_t stride_a, std::complex<double> beta, std::complex<double> *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c,
-                                  ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta,
-                       float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta,
-                       double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-                       int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                       int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
-                       int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                                  group_count, groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                      batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                      batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                      batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b,
-                                      batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size,
-                                      dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size,
-                                      dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size,
-                                      dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size,
-                                      dependencies);
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                          float beta, const float *b, int64_t ldb, int64_t stride_b, float *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b,
-                                     ldb, stride_b, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                          double beta, const double *b, int64_t ldb, int64_t stride_b, double *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b,
-                                     ldb, stride_b, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                          int64_t lda, int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b,
-                                     ldb, stride_b, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                          int64_t lda, int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                          int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b,
-                                     ldb, stride_b, c, ldc, stride_c, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, b, ldb, group_count,
-                                      groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, b, ldb, group_count,
-                                      groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, const std::complex<float>** a, int64_t* lda,
-                           std::complex<float>** b, int64_t* ldb, int64_t group_count,
-                           int64_t* groupsize, const std::vector<sycl::event>& dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, b, ldb, group_count,
-                                      groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, const std::complex<double>** a,
-                           int64_t* lda, std::complex<double>** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, b, ldb, group_count,
-                                      groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, float** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, group_count,
-                                      groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, double** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, group_count,
-                                      groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, std::complex<float>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, group_count,
-                                      groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, std::complex<double>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-    return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, group_count,
-                                      groupsize, dependencies);
-}
diff --git a/src/blas/backends/mkl_common/mkl_blas_backend.hpp b/src/blas/backends/mkl_common/mkl_blas_backend.hpp
deleted file mode 100644
index d45208a6d..000000000
--- a/src/blas/backends/mkl_common/mkl_blas_backend.hpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-
-#include "mkl_version.h"
-#include "oneapi/mkl/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-
-template <typename T>
-class value_or_pointer {
-    T value_;
-    const T* ptr_;
-
-public:
-    // Constructor from value. Accepts not only type T but anything convertible to T.
-    template <typename U, std::enable_if_t<std::is_convertible_v<U, T>, int> = 0>
-    value_or_pointer(U value) : value_(value),
-                                ptr_(nullptr) {}
-
-    // Constructor from pointer, assumed to be device-accessible.
-    value_or_pointer(const T* ptr) : value_(T(0)), ptr_(ptr) {}
-
-    bool fixed() const {
-        return ptr_ == nullptr;
-    }
-
-    T get_fixed_value() const {
-        return value_;
-    }
-
-    const T* get_pointer() const {
-        return ptr_;
-    }
-
-    T get() const {
-        return ptr_ ? *ptr_ : value_;
-    }
-
-    void make_device_accessible(sycl::queue& queue) {
-        if (!fixed() &&
-            sycl::get_pointer_type(ptr_, queue.get_context()) == sycl::usm::alloc::unknown) {
-            *this = *ptr_;
-        }
-    }
-};
-
-namespace blas {
-
-namespace column_major {
-
-#include "mkl_blas_backend.hxx"
-
-}
-
-namespace row_major {
-
-#include "mkl_blas_backend.hxx"
-
-}
-
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mkl_common/mkl_blas_backend.hxx b/src/blas/backends/mkl_common/mkl_blas_backend.hxx
deleted file mode 100644
index 10e441bd7..000000000
--- a/src/blas/backends/mkl_common/mkl_blas_backend.hxx
+++ /dev/null
@@ -1,2494 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/// level3, buffer
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &b, std::int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-          std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-          sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, sycl::half beta,
-          sycl::buffer<sycl::half, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-          sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a, std::int64_t lda,
-          sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a, std::int64_t lda,
-          sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<bfloat16, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-          sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc);
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, std::int64_t lda,
-          sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          float alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, float beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          double alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-           std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-// level 3, USM
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<float> alpha, const float *a,
-                 std::int64_t lda, const float *b, std::int64_t ldb, value_or_pointer<float> beta,
-                 float *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<double> alpha, const double *a,
-                 std::int64_t lda, const double *b, std::int64_t ldb, value_or_pointer<double> beta,
-                 double *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<std::complex<float>> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, value_or_pointer<std::complex<float>> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<std::complex<double>> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, value_or_pointer<std::complex<double>> beta,
-                 std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<sycl::half> alpha,
-                 const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                 value_or_pointer<sycl::half> beta, sycl::half *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<float> alpha, const sycl::half *a,
-                 std::int64_t lda, const sycl::half *b, std::int64_t ldb, value_or_pointer<float> beta,
-                 float *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                 const bfloat16 *a, std::int64_t lda, const bfloat16 *b,
-                 std::int64_t ldb, value_or_pointer<float> beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                 const bfloat16 *a, std::int64_t lda, const bfloat16 *b,
-                 std::int64_t ldb, value_or_pointer<float> beta, bfloat16 *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<float> alpha, const std::int8_t *a,
-                 std::int64_t lda, const std::int8_t *b, std::int64_t ldb,
-                 value_or_pointer<float> beta, std::int32_t *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, value_or_pointer<float> alpha, const std::int8_t *a,
-                 std::int64_t lda, const std::int8_t *b, std::int64_t ldb,
-                 value_or_pointer<float> beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                 const float *b, std::int64_t ldb, value_or_pointer<float> beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, value_or_pointer<double> alpha, const double *a, std::int64_t lda,
-                 const double *b, std::int64_t ldb, value_or_pointer<double> beta, double *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, value_or_pointer<std::complex<float>> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, value_or_pointer<std::complex<float>> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, value_or_pointer<std::complex<double>> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, value_or_pointer<std::complex<double>> beta,
-                 std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, value_or_pointer<std::complex<float>> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, value_or_pointer<std::complex<float>> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, value_or_pointer<std::complex<double>> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, value_or_pointer<std::complex<double>> beta,
-                 std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                 value_or_pointer<float> beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, value_or_pointer<double> alpha, const double *a, std::int64_t lda,
-                 value_or_pointer<double> beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, value_or_pointer<std::complex<float>> alpha,
-                 const std::complex<float> *a, std::int64_t lda,
-                 value_or_pointer<std::complex<float>> beta, std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, value_or_pointer<std::complex<double>> alpha,
-                 const std::complex<double> *a, std::int64_t lda,
-                 value_or_pointer<std::complex<double>> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, value_or_pointer<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, value_or_pointer<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, value_or_pointer<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, value_or_pointer<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                  const float *b, std::int64_t ldb, value_or_pointer<float> beta, float *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, value_or_pointer<double> alpha, const double *a, std::int64_t lda,
-                  const double *b, std::int64_t ldb, value_or_pointer<double> beta, double *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, value_or_pointer<std::complex<float>> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, value_or_pointer<std::complex<float>> beta, std::complex<float> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, value_or_pointer<std::complex<double>> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, value_or_pointer<std::complex<double>> beta,
-                  std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, value_or_pointer<std::complex<float>> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, value_or_pointer<float> beta, std::complex<float> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, value_or_pointer<std::complex<double>> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, value_or_pointer<double> beta, std::complex<double> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer<float> alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer<double> alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer<float> alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer<double> alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies = {});
-
-// level 2, buffer
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &a,
-         std::int64_t lda);
-
-void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda);
-
-void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-
-void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-
-void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy);
-
-void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy);
-
-void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a);
-
-void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a);
-
-void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a);
-
-void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a);
-
-void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a, std::int64_t lda);
-
-void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda);
-
-void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda);
-
-void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda);
-
-void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-         sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a);
-
-void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a);
-
-void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a);
-
-void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          std::int64_t k, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx);
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx);
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-// level 2, USM
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 value_or_pointer<float> alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, value_or_pointer<float> beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 value_or_pointer<double> alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, value_or_pointer<double> beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<float>> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<double>> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, value_or_pointer<float> alpha, const float *a,
-                 std::int64_t lda, const float *x, std::int64_t incx, value_or_pointer<float> beta,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, value_or_pointer<double> alpha, const double *a,
-                 std::int64_t lda, const double *x, std::int64_t incx, value_or_pointer<double> beta,
-                 double *y, std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, value_or_pointer<std::complex<float>> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                 std::int64_t incx, value_or_pointer<std::complex<float>> beta, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, value_or_pointer<std::complex<double>> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                 std::int64_t incx, value_or_pointer<std::complex<double>> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, value_or_pointer<float> alpha,
-                const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, value_or_pointer<double> alpha,
-                const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a,
-                std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<float>> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<double>> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<float>> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<double>> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
-                std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
-                std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                 const std::complex<float> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<float>> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                 const std::complex<double> *x, std::int64_t incx,
-                 value_or_pointer<std::complex<double>> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<float>> alpha, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *a, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 value_or_pointer<std::complex<double>> alpha, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *a, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                 value_or_pointer<float> alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, value_or_pointer<float> beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k,
-                 value_or_pointer<double> alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, value_or_pointer<double> beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                 const float *a, std::int64_t lda, const float *x, std::int64_t incx,
-                 value_or_pointer<float> beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                 const double *a, std::int64_t lda, const double *x, std::int64_t incx,
-                 value_or_pointer<double> beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                 const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                 const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                 const float *a, const float *x, std::int64_t incx, value_or_pointer<float> beta,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                 const double *a, const double *x, std::int64_t incx, value_or_pointer<double> beta,
-                 double *y, std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                const float *x, std::int64_t incx, float *a,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                const double *x, std::int64_t incx, double *a,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<float> alpha,
-                 const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer<double> alpha,
-                 const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, std::int64_t k, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<float> *a, std::complex<float> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<double> *a, std::complex<double> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies = {});
-
-// level 1, buffer
-
-void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result);
-
-void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result);
-
-void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result);
-
-void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result, index_base base=index_base::zero);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result, index_base base=index_base::zero);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result,
-           index_base base=index_base::zero);
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result,
-           index_base base=index_base::zero);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result, index_base base=index_base::zero);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result, index_base base=index_base::zero);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result,
-           index_base base=index_base::zero);
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result,
-           index_base base=index_base::zero);
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result);
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result);
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void axpy(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void axpy(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void axpy(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void axpy(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void axpby(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-           std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void axpby(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-           std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void axpby(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void axpby(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &result);
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-         sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &result);
-
-void sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer<float, 1> &x,
-            std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-            sycl::buffer<float, 1> &result);
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result);
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-         std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-         float s);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-         std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-         double s);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s);
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-         sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s);
-
-void rotg(sycl::queue &queue, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &b,
-          sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &b,
-          sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s);
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s);
-
-void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &param);
-
-void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &param);
-
-void rotmg(sycl::queue &queue, sycl::buffer<float, 1> &d1, sycl::buffer<float, 1> &d2,
-           sycl::buffer<float, 1> &x1, float y1, sycl::buffer<float, 1> &param);
-
-void rotmg(sycl::queue &queue, sycl::buffer<double, 1> &d1, sycl::buffer<double, 1> &d2,
-           sycl::buffer<double, 1> &x1, double y1, sycl::buffer<double, 1> &param);
-
-void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx);
-
-void scal(sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-
-// level 1, USM
-
-sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, value_or_pointer<float> alpha, const float *x,
-                 std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, value_or_pointer<double> alpha, const double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, value_or_pointer<std::complex<float>> alpha,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, value_or_pointer<std::complex<double>> alpha,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer<float> alpha, const float *x,
-                  std::int64_t incx, value_or_pointer<float> beta, float *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer<double> alpha, const double *x,
-                  std::int64_t incx, value_or_pointer<double> beta, double *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer<std::complex<float>> alpha,
-                  const std::complex<float> *x, std::int64_t incx,
-                  value_or_pointer<std::complex<float>> beta, std::complex<float> *y,
-                  std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer<std::complex<double>> alpha,
-                  const std::complex<double> *x, std::int64_t incx,
-                  value_or_pointer<std::complex<double>> beta, std::complex<double> *y,
-                  std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                const float *y, std::int64_t incy, float *result,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                const double *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
-                   const float *y, std::int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                const float *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *y, std::int64_t incy, value_or_pointer<float> c,
-                value_or_pointer<float> s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *y, std::int64_t incy, value_or_pointer<double> c,
-                value_or_pointer<double> s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                std::int64_t incy, value_or_pointer<float> c, value_or_pointer<float> s,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
-                std::int64_t incy, value_or_pointer<double> c, value_or_pointer<double> s,
-                const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
-                 std::complex<float> *s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotg(sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
-                 std::complex<double> *s, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                 std::int64_t incy, const float *param,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
-                 std::int64_t incy, const double *param,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, value_or_pointer<float> y1,
-                  float *param, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, value_or_pointer<double> y1,
-                  double *param, const std::vector<sycl::event> &dependencies = {});
-
-#define ONEMKL_DECLARE_SCAL(T, Ts)                                                         \
-    sycl::event scal(sycl::queue &queue, std::int64_t n, value_or_pointer<Ts> alpha, T *x, \
-                     std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-ONEMKL_DECLARE_SCAL(float, float)
-ONEMKL_DECLARE_SCAL(double, double)
-ONEMKL_DECLARE_SCAL(std::complex<float>, std::complex<float>)
-ONEMKL_DECLARE_SCAL(std::complex<double>, std::complex<double>)
-ONEMKL_DECLARE_SCAL(std::complex<float>, float)
-ONEMKL_DECLARE_SCAL(std::complex<double>, double)
-sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies = {});
-
-#undef ONEMKL_DECLARE_SCAL
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies = {});
-
-// extensions, buffer
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
-           std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-           sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           std::int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
-           std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-           sycl::buffer<double, 1> &b, std::int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           std::int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
-           std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n,
-           std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int8_t ao,
-               sycl::buffer<std::uint8_t, 1> &b, std::int64_t ldb, std::uint8_t bo, float beta,
-               sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<std::int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int8_t ao,
-               sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int8_t bo, float beta,
-               sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<std::int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<std::uint8_t, 1> &a, std::int64_t lda, std::uint8_t ao,
-               sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int8_t bo, float beta,
-               sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<std::int32_t, 1> &co);
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<std::uint8_t, 1> &a, std::int64_t lda, std::uint8_t ao,
-               sycl::buffer<std::uint8_t, 1> &b, std::int64_t ldb, std::uint8_t bo, float beta,
-               sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-               sycl::buffer<std::int32_t, 1> &co);
-
-// extensions, USM
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  std::int64_t n, std::int64_t k, value_or_pointer<float> alpha, const float *a,
-                  std::int64_t lda, const float *b, std::int64_t ldb, value_or_pointer<float> beta,
-                  float *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  std::int64_t n, std::int64_t k, value_or_pointer<double> alpha, const double *a,
-                  std::int64_t lda, const double *b, std::int64_t ldb, value_or_pointer<double> beta,
-                  double *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  std::int64_t n, std::int64_t k, value_or_pointer<std::complex<float>> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, value_or_pointer<std::complex<float>> beta, std::complex<float> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  std::int64_t n, std::int64_t k, value_or_pointer<std::complex<double>> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, value_or_pointer<std::complex<double>> beta,
-                  std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                      const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::uint8_t *b,
-                      std::int64_t ldb, std::uint8_t bo, value_or_pointer<float> beta, std::int32_t *c,
-                      std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                      const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::int8_t *b,
-                      std::int64_t ldb, std::int8_t bo, value_or_pointer<float> beta, std::int32_t *c,
-                      std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                      const std::uint8_t *a, std::int64_t lda, std::uint8_t ao,
-                      const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                      value_or_pointer<float> beta, std::int32_t *c, std::int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                      const std::uint8_t *a, std::int64_t lda, std::uint8_t ao,
-                      const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                      value_or_pointer<float> beta, std::int32_t *c, std::int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies = {});
-
-// batch, buffer
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a, float beta,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size);
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                double beta, sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                int64_t ldc, int64_t stride_c, int64_t batch_size);
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<double, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex, float beta,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex, double beta,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size);
-
-void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size);
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<float, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, double beta, sycl::buffer<double, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, sycl::half beta,
-                sycl::buffer<sycl::half, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<bfloat16, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<bfloat16, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<bfloat16, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size);
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int8_t, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                    sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                    std::int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                    std::int64_t lda, std::int64_t stride_a,
-                    sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size);
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                    std::int64_t lda, std::int64_t stride_a,
-                    sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                    std::int64_t batch_size);
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                    std::int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                   std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<float, 1> &c, std::int64_t ldc,
-                   std::int64_t stride_c, std::int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                   std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<double, 1> &c, std::int64_t ldc,
-                   std::int64_t stride_c, std::int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                   std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                   std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size);
-
-void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-              sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-              std::int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-              sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-              std::int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-              std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-              sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-
-void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-              std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-               sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-               sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-               sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-               sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-               std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-               std::int64_t strideb);
-
-void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-               std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb);
-
-void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, std::int64_t lda, std::int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, std::int64_t lda, std::int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-              std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-              std::int64_t ldb);
-
-void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-              std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-             float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-             sycl::buffer<float, 1> &b, std::int64_t ldb, sycl::buffer<float, 1> &c,
-             std::int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-             double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-             sycl::buffer<double, 1> &b, std::int64_t ldb, sycl::buffer<double, 1> &c,
-             std::int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-             std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-             std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-             std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-             std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-// batch, usm
-
-sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans,
-                       const std::int64_t *n, const std::int64_t *k, const float *alpha,
-                       const float **a, const std::int64_t *lda, const float *beta, float **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans,
-                       const std::int64_t *n, const std::int64_t *k, const double *alpha,
-                       const double **a, const std::int64_t *lda, const double *beta, double **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans,
-                       const std::int64_t *n, const std::int64_t *k,
-                       const std::complex<float> *alpha, const std::complex<float> **a,
-                       const std::int64_t *lda, const std::complex<float> *beta,
-                       std::complex<float> **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans,
-                       const std::int64_t *n, const std::int64_t *k,
-                       const std::complex<double> *alpha, const std::complex<double> **a,
-                       const std::int64_t *lda, const std::complex<double> *beta,
-                       std::complex<double> **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                       std::int64_t k, value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                       std::int64_t stride_a, value_or_pointer<float> beta, float *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                       std::int64_t k, value_or_pointer<double> alpha, const double *a,
-                       std::int64_t lda, std::int64_t stride_a, value_or_pointer<double> beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                       std::int64_t k, value_or_pointer<std::complex<float>> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                       value_or_pointer<std::complex<float>> beta, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                       std::int64_t k, value_or_pointer<std::complex<double>> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                       value_or_pointer<std::complex<double>> beta, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<float> *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const float **x,
-                       const std::int64_t *incx, float **y, const std::int64_t *incy,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const double **x,
-                       const std::int64_t *incx, double **y, const std::int64_t *incy,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex<float> **x,
-                       const std::int64_t *incx, std::complex<float> **y, const std::int64_t *incy,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex<double> **x,
-                       const std::int64_t *incx, std::complex<double> **y, const std::int64_t *incy,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                       const float *a, std::int64_t lda, std::int64_t stridea, const float *x,
-                       std::int64_t incx, std::int64_t stridex, float *c, std::int64_t ldc,
-                       std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                       const double *a, std::int64_t lda, std::int64_t stridea, const double *x,
-                       std::int64_t incx, std::int64_t stridex, double *c, std::int64_t ldc,
-                       std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m,
-                       const std::int64_t *n, const float **a, const std::int64_t *lda,
-                       const float **x, const std::int64_t *incx, float **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m,
-                       const std::int64_t *n, const double **a, const std::int64_t *lda,
-                       const double **x, const std::int64_t *incx, double **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m,
-                       const std::int64_t *n, const std::complex<float> **a,
-                       const std::int64_t *lda, const std::complex<float> **x,
-                       const std::int64_t *incx, std::complex<float> **c, const std::int64_t *ldc,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m,
-                       const std::int64_t *n, const std::complex<double> **a,
-                       const std::int64_t *lda, const std::complex<double> **x,
-                       const std::int64_t *incx, std::complex<double> **c, const std::int64_t *ldc,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                       value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                       std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, value_or_pointer<float> beta, float *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                       value_or_pointer<double> alpha, const double *a, std::int64_t lda,
-                       std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, value_or_pointer<double> beta, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                       value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                       std::int64_t incx, std::int64_t stridex,
-                       value_or_pointer<std::complex<float>> beta, std::complex<float> *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                       value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex,
-                       value_or_pointer<std::complex<double>> beta, std::complex<double> *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                       const std::int64_t *n, const float *alpha, const float **a,
-                       const std::int64_t *lda, const float **x, const std::int64_t *incx,
-                       const float *beta, float **y, const std::int64_t *incy,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                       const std::int64_t *n, const double *alpha, const double **a,
-                       const std::int64_t *lda, const double **x, const std::int64_t *incx,
-                       const double *beta, double **y, const std::int64_t *incy,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                       const std::int64_t *n, const std::complex<float> *alpha,
-                       const std::complex<float> **a, const std::int64_t *lda,
-                       const std::complex<float> **x, const std::int64_t *incx,
-                       const std::complex<float> *beta, std::complex<float> **y,
-                       const std::int64_t *incy, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                       const std::int64_t *n, const std::complex<double> *alpha,
-                       const std::complex<double> **a, const std::int64_t *lda,
-                       const std::complex<double> **x, const std::int64_t *incx,
-                       const std::complex<double> *beta, std::complex<double> **y,
-                       const std::int64_t *incy, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const double *alpha,
-                       const double **x, const std::int64_t *incx, double **y,
-                       const std::int64_t *incy, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const float *alpha,
-                       const float **x, const std::int64_t *incx, float **y,
-                       const std::int64_t *incy, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex<double> *alpha,
-                       const std::complex<double> **x, const std::int64_t *incx,
-                       std::complex<double> **y, const std::int64_t *incy, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex<float> *alpha,
-                       const std::complex<float> **x, const std::int64_t *incx,
-                       std::complex<float> **y, const std::int64_t *incy, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, value_or_pointer<float> alpha,
-                       const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, value_or_pointer<double> alpha,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, value_or_pointer<std::complex<float>> alpha,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n,
-                       value_or_pointer<std::complex<double>> alpha, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const float *alpha, const float **a, const std::int64_t *lda,
-                       const float **b, const std::int64_t *ldb, const float *beta, float **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const double *alpha, const double **a, const std::int64_t *lda,
-                       const double **b, const std::int64_t *ldb, const double *beta, double **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const std::complex<float> *alpha, const std::complex<float> **a,
-                       const std::int64_t *lda, const std::complex<float> **b,
-                       const std::int64_t *ldb, const std::complex<float> *beta,
-                       std::complex<float> **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const std::complex<double> *alpha, const std::complex<double> **a,
-                       const std::int64_t *lda, const std::complex<double> **b,
-                       const std::int64_t *ldb, const std::complex<double> *beta,
-                       std::complex<double> **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const sycl::half *alpha, const sycl::half **a, const std::int64_t *lda,
-                       const sycl::half **b, const std::int64_t *ldb, const sycl::half *beta,
-                       sycl::half **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const float *alpha, const sycl::half **a, const std::int64_t *lda,
-                       const sycl::half **b, const std::int64_t *ldb, const float *beta, float **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const float *alpha, const bfloat16 **a, const std::int64_t *lda,
-                       const bfloat16 **b, const std::int64_t *ldb, const float *beta,
-                       bfloat16 **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const float *alpha, const bfloat16 **a, const std::int64_t *lda,
-                       const bfloat16 **b, const std::int64_t *ldb, const float *beta,
-                       float **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const float *alpha, const std::int8_t **a, const std::int64_t *lda,
-                       const std::int8_t **b, const std::int64_t *ldb, const float *beta,
-                       std::int32_t **c, const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb,
-                       const std::int64_t *m, const std::int64_t *n, const std::int64_t *k,
-                       const float *alpha, const std::int8_t **a, const std::int64_t *lda,
-                       const std::int8_t **b, const std::int64_t *ldb, const float *beta, float **c,
-                       const std::int64_t *ldc, std::int64_t group_count,
-                       const std::int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<float> alpha, const float *a,
-                       std::int64_t lda, std::int64_t stride_a, const float *b, std::int64_t ldb,
-                       std::int64_t stride_b, value_or_pointer<float> beta, float *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<double> alpha, const double *a,
-                       std::int64_t lda, std::int64_t stride_a, const double *b, std::int64_t ldb,
-                       std::int64_t stride_b, value_or_pointer<double> beta, double *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<std::complex<float>> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<std::complex<float>> beta, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<std::complex<double>> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<std::complex<double>> beta, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<sycl::half> alpha,
-                       const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<sycl::half> beta, sycl::half *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                       const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<float> beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                       const bfloat16 *a, std::int64_t lda, std::int64_t stride_a,
-                       const bfloat16 *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<float> beta, bfloat16 *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                       const bfloat16 *a, std::int64_t lda, std::int64_t stride_a,
-                       const bfloat16 *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<float> beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                       const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<float> beta, std::int32_t *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                       std::int64_t n, std::int64_t k, value_or_pointer<float> alpha,
-                       const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b,
-                       value_or_pointer<float> beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer<float> alpha,
-                       const float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer<double> alpha,
-                       const double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, std::int64_t m, std::int64_t n,
-                       value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                       diag unit_diag, std::int64_t m, std::int64_t n,
-                       value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<double> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower,
-                       const transpose *trans, const diag *unit_diag, const std::int64_t *m,
-                       const std::int64_t *n, const float *alpha, const float **a,
-                       const std::int64_t *lda, float **b, const std::int64_t *ldb,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower,
-                       const transpose *trans, const diag *unit_diag, const std::int64_t *m,
-                       const std::int64_t *n, const double *alpha, const double **a,
-                       const std::int64_t *lda, double **b, const std::int64_t *ldb,
-                       std::int64_t group_count, const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower,
-                       const transpose *trans, const diag *unit_diag, const std::int64_t *m,
-                       const std::int64_t *n, const std::complex<float> *alpha,
-                       const std::complex<float> **a, const std::int64_t *lda,
-                       std::complex<float> **b, const std::int64_t *ldb, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower,
-                       const transpose *trans, const diag *unit_diag, const std::int64_t *m,
-                       const std::int64_t *n, const std::complex<double> *alpha,
-                       const std::complex<double> **a, const std::int64_t *lda,
-                       std::complex<double> **b, const std::int64_t *ldb, std::int64_t group_count,
-                       const std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                           std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<double> alpha, const double *a, std::int64_t lda,
-                           std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<double> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<float> alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<double> alpha, double *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<std::complex<float>> alpha, std::complex<float> *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                           value_or_pointer<std::complex<double>> alpha, std::complex<double> *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                          std::int64_t n, value_or_pointer<float> alpha, const float *a,
-                          std::int64_t lda, std::int64_t stride_a, value_or_pointer<float> beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                          std::int64_t n, value_or_pointer<double> alpha, const double *a,
-                          std::int64_t lda, std::int64_t stride_a, value_or_pointer<double> beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                          std::int64_t n, value_or_pointer<std::complex<float>> alpha,
-                          const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                          value_or_pointer<std::complex<float>> beta, const std::complex<float> *b,
-                          std::int64_t ldb, std::int64_t stride_b, std::complex<float> *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                          std::int64_t n, value_or_pointer<std::complex<double>> alpha,
-                          const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                          value_or_pointer<std::complex<double>> beta, const std::complex<double> *b,
-                          std::int64_t ldb, std::int64_t stride_b, std::complex<double> *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<float> alpha, const float *a, std::int64_t lda, float *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<double> alpha, const double *a, std::int64_t lda, double *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                     std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                     std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                      value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                      value_or_pointer<double> alpha, const double *a, std::int64_t lda,
-                      std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                      value_or_pointer<std::complex<float>> alpha, const std::complex<float> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<float> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                      value_or_pointer<std::complex<double>> alpha, const std::complex<double> *a,
-                      std::int64_t lda, std::int64_t stridea, std::complex<double> *b,
-                      std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<float> alpha, float *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<double> alpha, double *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<std::complex<float>> alpha, std::complex<float> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     value_or_pointer<std::complex<double>> alpha, std::complex<double> *ab,
-                     std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                    std::int64_t n, value_or_pointer<float> alpha, const float *a, std::int64_t lda,
-                    value_or_pointer<float> beta, const float *b, std::int64_t ldb, float *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                    std::int64_t n, value_or_pointer<double> alpha, const double *a, std::int64_t lda,
-                    value_or_pointer<double> beta, const double *b, std::int64_t ldb, double *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                    std::int64_t n, value_or_pointer<std::complex<float>> alpha,
-                    const std::complex<float> *a, std::int64_t lda,
-                    value_or_pointer<std::complex<float>> beta, const std::complex<float> *b,
-                    std::int64_t ldb, std::complex<float> *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                    std::int64_t n, value_or_pointer<std::complex<double>> alpha,
-                    const std::complex<double> *a, std::int64_t lda,
-                    value_or_pointer<std::complex<double>> beta, const std::complex<double> *b,
-                    std::int64_t ldb, std::complex<double> *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const float *alpha, const float **a,
-                           const std::int64_t *lda, float **b, const std::int64_t *ldb,
-                           std::int64_t group_count, const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const double *alpha, const double **a,
-                           const std::int64_t *lda, double **b, const std::int64_t *ldb,
-                           std::int64_t group_count, const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const std::complex<float> *alpha,
-                           const std::complex<float> **a, const std::int64_t *lda,
-                           std::complex<float> **b, const std::int64_t *ldb,
-                           std::int64_t group_count, const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const std::complex<double> *alpha,
-                           const std::complex<double> **a, const std::int64_t *lda,
-                           std::complex<double> **b, const std::int64_t *ldb,
-                           std::int64_t group_count, const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const float *alpha, float **ab,
-                           const std::int64_t *lda, const std::int64_t *ldb,
-                           std::int64_t group_count, const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const double *alpha, double **ab,
-                           const std::int64_t *lda, const std::int64_t *ldb,
-                           std::int64_t group_count, const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const std::complex<float> *alpha,
-                           std::complex<float> **ab, const std::int64_t *lda,
-                           const std::int64_t *ldb, std::int64_t group_count,
-                           const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
-
-sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m,
-                           const std::int64_t *n, const std::complex<double> *alpha,
-                           std::complex<double> **ab, const std::int64_t *lda,
-                           const std::int64_t *ldb, std::int64_t group_count,
-                           const std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies = {});
diff --git a/src/blas/backends/mkl_common/mkl_extensions.cxx b/src/blas/backends/mkl_common/mkl_extensions.cxx
deleted file mode 100644
index 4672af5c7..000000000
--- a/src/blas/backends/mkl_common/mkl_extensions.cxx
+++ /dev/null
@@ -1,359 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo,
-                          beta, c, ldc, co);
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo,
-                          beta, c, ldc, co);
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo,
-                          beta, c, ldc, co);
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo,
-                          beta, c, ldc, co);
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc) {
-    blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
-                      ldc);
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc) {
-    blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
-                      ldc);
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
-                      ldc);
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c,
-                      ldc);
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb) {
-    blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb) {
-    blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
-    blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
-    blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-               sycl::buffer<float, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<float, 1> &b, int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-               sycl::buffer<double, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<double, 1> &b, int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-               sycl::buffer<std::complex<float>, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-               std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-               std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-               std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
-    blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb) {
-    blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb) {
-    blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb) {
-    blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
-             sycl::buffer<float, 1> &b, int64_t ldb, sycl::buffer<float, 1> &c, int64_t ldc) {
-    blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
-             sycl::buffer<double, 1> &b, int64_t ldb, sycl::buffer<double, 1> &c, int64_t ldc) {
-    blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-             std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc);
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-             std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb,
-                                 bo, beta, c, ldc, co, dependencies);
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb,
-                                 bo, beta, c, ldc, co, dependencies);
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb,
-                                 bo, beta, c, ldc, co, dependencies);
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb,
-                                 bo, beta, c, ldc, co, dependencies);
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
-                  int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta,
-                             c, ldc, dependencies);
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                  int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta,
-                             c, ldc, dependencies);
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta,
-                             c, ldc, dependencies);
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta,
-                             c, ldc, dependencies);
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     const float *a, int64_t lda, float *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     const double *a, int64_t lda, double *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                     std::complex<float> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                     std::complex<double> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                      const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                      const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<float> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<double> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies);
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies);
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies);
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies);
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    float alpha, const float *a, int64_t lda, float beta, const float *b,
-                    int64_t ldb, float *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc,
-                               dependencies);
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    double alpha, const double *a, int64_t lda, double beta, const double *b,
-                    int64_t ldb, double *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc,
-                               dependencies);
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                    std::complex<float> beta, const std::complex<float> *b, int64_t ldb,
-                    std::complex<float> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc,
-                               dependencies);
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                    std::complex<double> beta, const std::complex<double> *b, int64_t ldb,
-                    std::complex<double> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc,
-                               dependencies);
-}
diff --git a/src/blas/backends/mkl_common/mkl_level1.cxx b/src/blas/backends/mkl_common/mkl_level1.cxx
deleted file mode 100644
index 85ccb0025..000000000
--- a/src/blas/backends/mkl_common/mkl_level1.cxx
+++ /dev/null
@@ -1,645 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    blas_major::asum(queue, n, x, incx, result);
-}
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result) {
-    blas_major::asum(queue, n, x, incx, result);
-}
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    blas_major::asum(queue, n, x, incx, result);
-}
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    blas_major::asum(queue, n, x, incx, result);
-}
-
-void axpy(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    blas_major::axpy(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    blas_major::axpy(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::axpy(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::axpy(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpby(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-           std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-           std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    blas_major::copy(queue, n, x, incx, y, incy);
-}
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    blas_major::copy(queue, n, x, incx, y, incy);
-}
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::copy(queue, n, x, incx, y, incy);
-}
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::copy(queue, n, x, incx, y, incy);
-}
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &result) {
-    blas_major::dot(queue, n, x, incx, y, incy, result);
-}
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-         sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &result) {
-    blas_major::dot(queue, n, x, incx, y, incy, result);
-}
-
-void sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer<float, 1> &x,
-            std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-            sycl::buffer<float, 1> &result) {
-    blas_major::sdsdot(queue, n, sb, x, incx, y, incy, result);
-}
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &result) {
-    blas_major::dot(queue, n, x, incx, y, incy, result);
-}
-
-void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    blas_major::dotc(queue, n, x, incx, y, incy, result);
-}
-
-void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    blas_major::dotc(queue, n, x, incx, y, incy, result);
-}
-
-void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    blas_major::dotu(queue, n, x, incx, y, incy, result);
-}
-
-void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    blas_major::dotu(queue, n, x, incx, y, incy, result);
-}
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    blas_major::nrm2(queue, n, x, incx, result);
-}
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &result) {
-    blas_major::nrm2(queue, n, x, incx, result);
-}
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    blas_major::nrm2(queue, n, x, incx, result);
-}
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    blas_major::nrm2(queue, n, x, incx, result);
-}
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-         std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-         float s) {
-    blas_major::rot(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-         std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c,
-         double s) {
-    blas_major::rot(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    blas_major::rot(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-         sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    blas_major::rot(queue, n, x, incx, y, incy, c, s);
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &b,
-          sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s) {
-    blas_major::rotg(queue, a, b, c, s);
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &b,
-          sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s) {
-    blas_major::rotg(queue, a, b, c, s);
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    blas_major::rotg(queue, a, b, c, s);
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    blas_major::rotg(queue, a, b, c, s);
-}
-
-void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &param) {
-    blas_major::rotm(queue, n, x, incx, y, incy, param);
-}
-
-void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &param) {
-    blas_major::rotm(queue, n, x, incx, y, incy, param);
-}
-
-void rotmg(sycl::queue &queue, sycl::buffer<float, 1> &d1, sycl::buffer<float, 1> &d2,
-           sycl::buffer<float, 1> &x1, float y1, sycl::buffer<float, 1> &param) {
-    blas_major::rotmg(queue, d1, d2, x1, y1, param);
-}
-
-void rotmg(sycl::queue &queue, sycl::buffer<double, 1> &d1, sycl::buffer<double, 1> &d2,
-           sycl::buffer<double, 1> &x1, double y1, sycl::buffer<double, 1> &param) {
-    blas_major::rotmg(queue, d1, d2, x1, y1, param);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    blas_major::scal(queue, n, alpha, x, incx);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    blas_major::scal(queue, n, alpha, x, incx);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    blas_major::scal(queue, n, alpha, x, incx);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    blas_major::scal(queue, n, alpha, x, incx);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx) {
-    blas_major::scal(queue, n, alpha, x, incx);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    blas_major::scal(queue, n, alpha, x, incx);
-}
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    blas_major::swap(queue, n, x, incx, y, incy);
-}
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    blas_major::swap(queue, n, x, incx, y, incy);
-}
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::swap(queue, n, x, incx, y, incy);
-}
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::swap(queue, n, x, incx, y, incy);
-}
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamax(queue, n, x, incx, result);
-}
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamax(queue, n, x, incx, result);
-}
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamax(queue, n, x, incx, result);
-}
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamax(queue, n, x, incx, result);
-}
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamin(queue, n, x, incx, result);
-}
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamin(queue, n, x, incx, result);
-}
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamin(queue, n, x, incx, result);
-}
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    blas_major::iamin(queue, n, x, incx, result);
-}
-
-// USM APIs
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::asum(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::asum(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::asum(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::asum(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
-}
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha, const double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
-}
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
-}
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies);
-}
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha, const float *x,
-                  std::int64_t incx, float beta, float *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, double alpha, const double *x,
-                  std::int64_t incx, double beta, double *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                  std::complex<float> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                  const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                  std::complex<double> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::copy(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                const float *y, std::int64_t incy, float *result,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::dot(queue, n, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                const double *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::dot(queue, n, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx,
-                   const float *y, std::int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies) {
-    return blas_major::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                const float *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::dot(queue, n, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dotc(queue, n, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dotc(queue, n, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, const std::complex<float> *y, std::int64_t incy,
-                 std::complex<float> *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dotu(queue, n, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, const std::complex<double> *y, std::int64_t incy,
-                 std::complex<double> *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::dotu(queue, n, x, incx, y, incy, result, dependencies);
-}
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::nrm2(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::nrm2(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::nrm2(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::nrm2(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *y, std::int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies);
-}
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *y, std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies);
-}
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                std::int64_t incy, float c, float s, const std::vector<sycl::event> &dependencies) {
-    return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies);
-}
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
-                std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies);
-}
-
-sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotg(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotg(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
-                 std::complex<float> *s, const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotg(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
-                 std::complex<double> *s, const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotg(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                 std::int64_t incy, float *param, const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotm(queue, n, x, incx, y, incy, param, dependencies);
-}
-
-sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
-                 std::int64_t incy, double *param, const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotm(queue, n, x, incx, y, incy, param, dependencies);
-}
-
-sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotmg(queue, d1, d2, x1, y1, param, dependencies);
-}
-
-sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::rotmg(queue, d1, d2, x1, y1, param, dependencies);
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::scal(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::scal(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::scal(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::scal(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, std::complex<float> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return blas_major::scal(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, std::complex<double> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return blas_major::scal(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::swap(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::swap(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::swap(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::swap(queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamax(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamax(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamax(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamax(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamin(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamin(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamin(queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::iamin(queue, n, x, incx, result, dependencies);
-}
diff --git a/src/blas/backends/mkl_common/mkl_level2.cxx b/src/blas/backends/mkl_common/mkl_level2.cxx
deleted file mode 100644
index 83494be12..000000000
--- a/src/blas/backends/mkl_common/mkl_level2.cxx
+++ /dev/null
@@ -1,862 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl,
-          std::int64_t ku, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-         sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    blas_major::geru(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    blas_major::geru(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void her(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    blas_major::her(queue, uplo, n, alpha, x, incx, a, lda);
-}
-
-void her(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    blas_major::her(queue, uplo, n, alpha, x, incx, a, lda);
-}
-
-void her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          std::int64_t incy) {
-    blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy);
-}
-
-void hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          std::int64_t incy) {
-    blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy);
-}
-
-void hpr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    blas_major::hpr(queue, uplo, n, alpha, x, incx, a);
-}
-
-void hpr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    blas_major::hpr(queue, uplo, n, alpha, x, incx, a);
-}
-
-void hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a);
-}
-
-void hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a);
-}
-
-void sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void spmv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy);
-}
-
-void spmv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy);
-}
-
-void spr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &a) {
-    blas_major::spr(queue, uplo, n, alpha, x, incx, a);
-}
-
-void spr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &a) {
-    blas_major::spr(queue, uplo, n, alpha, x, incx, a);
-}
-
-void spr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &a) {
-    blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a);
-}
-
-void spr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-          sycl::buffer<double, 1> &a) {
-    blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a);
-}
-
-void symv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    blas_major::symv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void symv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, std::int64_t incy) {
-    blas_major::symv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void syr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda);
-}
-
-void syr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda);
-}
-
-void syr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &a, std::int64_t lda) {
-    blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void syr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-          sycl::buffer<double, 1> &a, std::int64_t lda) {
-    blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx);
-}
-
-void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx) {
-    blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx) {
-    blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, sycl::buffer<std::complex<float>, 1> &x,
-          std::int64_t incx) {
-    blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, sycl::buffer<std::complex<double>, 1> &x,
-          std::int64_t incx) {
-    blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx);
-}
-
-void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx);
-}
-
-// USM APIs
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha,
-                 const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha,
-                 const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta,
-                 double *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, float alpha, const float *a, std::int64_t lda,
-                 const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, double alpha, const double *a, std::int64_t lda,
-                 const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                 std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                 std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy,
-                            dependencies);
-}
-
-sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x,
-                std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x,
-                std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                 std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                 std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event her(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha,
-                const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
-                std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return blas_major::her(queue, uplo, n, alpha, x, incx, a, lda, dependencies);
-}
-
-sycl::event her(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha,
-                const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
-                std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return blas_major::her(queue, uplo, n, alpha, x, incx, a, lda, dependencies);
-}
-
-sycl::event her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hpr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha,
-                const std::complex<float> *x, std::int64_t incx, std::complex<float> *a,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::hpr(queue, uplo, n, alpha, x, incx, a, dependencies);
-}
-
-sycl::event hpr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha,
-                const std::complex<double> *x, std::int64_t incx, std::complex<double> *a,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::hpr(queue, uplo, n, alpha, x, incx, a, dependencies);
-}
-
-sycl::event hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies);
-}
-
-sycl::event hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies);
-}
-
-sycl::event sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, float alpha,
-                 const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, double alpha,
-                 const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta,
-                 double *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event spmv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *a,
-                 const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event spmv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *a,
-                 const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event spr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x,
-                std::int64_t incx, float *a, const std::vector<sycl::event> &dependencies) {
-    return blas_major::spr(queue, uplo, n, alpha, x, incx, a, dependencies);
-}
-
-sycl::event spr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x,
-                std::int64_t incx, double *a, const std::vector<sycl::event> &dependencies) {
-    return blas_major::spr(queue, uplo, n, alpha, x, incx, a, dependencies);
-}
-
-sycl::event spr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x,
-                 std::int64_t incx, const float *y, std::int64_t incy, float *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies);
-}
-
-sycl::event spr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x,
-                 std::int64_t incx, const double *y, std::int64_t incy, double *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies);
-}
-
-sycl::event symv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *a,
-                 std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::symv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event symv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *a,
-                 std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return blas_major::symv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event syr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x,
-                std::int64_t incx, float *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda, dependencies);
-}
-
-sycl::event syr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x,
-                std::int64_t incx, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda, dependencies);
-}
-
-sycl::event syr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x,
-                 std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event syr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x,
-                 std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 std::int64_t k, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<float> *a, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<double> *a, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies);
-}
-
-sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const float *a, std::int64_t lda, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const double *a, std::int64_t lda, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<float> *a, std::int64_t lda, std::complex<float> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n,
-                 const std::complex<double> *a, std::int64_t lda, std::complex<double> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies);
-}
diff --git a/src/blas/backends/mkl_common/mkl_level3.cxx b/src/blas/backends/mkl_common/mkl_level3.cxx
deleted file mode 100644
index d52c710f1..000000000
--- a/src/blas/backends/mkl_common/mkl_level3.cxx
+++ /dev/null
@@ -1,519 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc) {
-    blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &b, std::int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-          std::int64_t ldc) {
-    blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-          sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, sycl::half beta,
-          sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-          sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc) {
-    blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a, std::int64_t lda,
-          sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc) {
-    blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          float alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, float beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-          double alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-           std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b,
-                     ldb);
-}
-
-// USM APIs
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                 const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                 const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a,
-                 std::int64_t lda, const sycl::half *b, std::int64_t ldb, sycl::half beta,
-                 sycl::half *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, std::int64_t lda,
-                 const sycl::half *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                 std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, std::int64_t lda,
-                 const bfloat16 *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b,
-                 std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
-                            ldc, dependencies);
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *b,
-                 std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
-                            ldc, dependencies);
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                 std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
-                            ldc, dependencies);
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
-                 std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
-                            ldc, dependencies);
-}
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                 std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
-                            ldc, dependencies);
-}
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m,
-                 std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
-                 std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c,
-                            ldc, dependencies);
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta,
-                 float *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta,
-                 double *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, float alpha, const std::complex<float> *a, std::int64_t lda,
-                 float beta, std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                 std::int64_t k, double alpha, const std::complex<double> *a, std::int64_t lda,
-                 double beta, std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc,
-                            dependencies);
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b,
-                  std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             dependencies);
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b,
-                  std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             dependencies);
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  std::int64_t lda, const std::complex<float> *b, std::int64_t ldb,
-                  std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             dependencies);
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  std::int64_t lda, const std::complex<double> *b, std::int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             dependencies);
-}
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, float beta,
-                  std::complex<float> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             dependencies);
-}
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n,
-                  std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  std::int64_t lda, const std::complex<double> *b, std::int64_t ldb, double beta,
-                  std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                             dependencies);
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a,
-                 std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a,
-                 std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                 std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                 std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a,
-                 std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a,
-                 std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                 std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                 std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda,
-                            b, ldb, dependencies);
-}
diff --git a/src/blas/backends/mklcpu/CMakeLists.txt b/src/blas/backends/mklcpu/CMakeLists.txt
deleted file mode 100644
index 322741d26..000000000
--- a/src/blas/backends/mklcpu/CMakeLists.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_blas_mklcpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-set(SOURCES
-  mklcpu_level1.cpp mklcpu_level2.cpp mklcpu_level3.cpp mklcpu_batch.cpp mklcpu_extensions.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mklcpu_wrappers.cpp>)
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_blas ${LIB_NAME})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::BLAS)
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_SYCL::BLAS)
-else()
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_DPCPP)
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/blas/backends/mklcpu/mklcpu_batch.cpp b/src/blas/backends/mklcpu/mklcpu_batch.cpp
deleted file mode 100644
index 5ecf4cc69..000000000
--- a/src/blas/backends/mklcpu/mklcpu_batch.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklcpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_batch.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_batch.cxx"
-
-} // namespace row_major
-} // namespace mklcpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklcpu/mklcpu_extensions.cpp b/src/blas/backends/mklcpu/mklcpu_extensions.cpp
deleted file mode 100644
index 215addd5e..000000000
--- a/src/blas/backends/mklcpu/mklcpu_extensions.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklcpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_extensions.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_extensions.cxx"
-
-} // namespace row_major
-} // namespace mklcpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklcpu/mklcpu_level1.cpp b/src/blas/backends/mklcpu/mklcpu_level1.cpp
deleted file mode 100644
index a4d786673..000000000
--- a/src/blas/backends/mklcpu/mklcpu_level1.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp"
-
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklcpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_level1.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_level1.cxx"
-
-} // namespace row_major
-} // namespace mklcpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklcpu/mklcpu_level2.cpp b/src/blas/backends/mklcpu/mklcpu_level2.cpp
deleted file mode 100644
index 7bd46078c..000000000
--- a/src/blas/backends/mklcpu/mklcpu_level2.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp"
-
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklcpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_level2.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_level2.cxx"
-
-} // namespace row_major
-} // namespace mklcpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklcpu/mklcpu_level3.cpp b/src/blas/backends/mklcpu/mklcpu_level3.cpp
deleted file mode 100644
index 6433fc98b..000000000
--- a/src/blas/backends/mklcpu/mklcpu_level3.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp"
-
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklcpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_level3.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_level3.cxx"
-
-} // namespace row_major
-} // namespace mklcpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklcpu/mklcpu_wrappers.cpp b/src/blas/backends/mklcpu/mklcpu_wrappers.cpp
deleted file mode 100644
index 527f38a18..000000000
--- a/src/blas/backends/mklcpu/mklcpu_wrappers.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "blas/function_table.hpp"
-#include "oneapi/mkl/blas/detail/mklcpu/onemkl_blas_mklcpu.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT blas_function_table_t mkl_blas_table = {
-    WRAPPER_VERSION,
-#define BACKEND mklcpu
-#define MAJOR   column_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#define MAJOR row_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#undef BACKEND
-};
diff --git a/src/blas/backends/mklgpu/CMakeLists.txt b/src/blas/backends/mklgpu/CMakeLists.txt
deleted file mode 100644
index c971d1afd..000000000
--- a/src/blas/backends/mklgpu/CMakeLists.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_blas_mklgpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  mklgpu_level1.cpp mklgpu_level2.cpp mklgpu_level3.cpp mklgpu_batch.cpp mklgpu_extensions.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mklgpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_blas ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::BLAS)
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_SYCL::BLAS)
-else()
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_DPCPP)
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/blas/backends/mklgpu/mklgpu_batch.cpp b/src/blas/backends/mklgpu/mklgpu_batch.cpp
deleted file mode 100644
index bad2db82c..000000000
--- a/src/blas/backends/mklgpu/mklgpu_batch.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklgpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_batch.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_batch.cxx"
-
-} // namespace row_major
-} // namespace mklgpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklgpu/mklgpu_extensions.cpp b/src/blas/backends/mklgpu/mklgpu_extensions.cpp
deleted file mode 100644
index c4b1635c8..000000000
--- a/src/blas/backends/mklgpu/mklgpu_extensions.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklgpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_extensions.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_extensions.cxx"
-
-} // namespace row_major
-} // namespace mklgpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklgpu/mklgpu_level1.cpp b/src/blas/backends/mklgpu/mklgpu_level1.cpp
deleted file mode 100644
index 9d853e23d..000000000
--- a/src/blas/backends/mklgpu/mklgpu_level1.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
-
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklgpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_level1.cxx"
-
-} // namespace column_major
-
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_level1.cxx"
-
-} // namespace row_major
-} // namespace mklgpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklgpu/mklgpu_level2.cpp b/src/blas/backends/mklgpu/mklgpu_level2.cpp
deleted file mode 100644
index 2d3fc6b39..000000000
--- a/src/blas/backends/mklgpu/mklgpu_level2.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
-
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklgpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_level2.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_level2.cxx"
-
-} // namespace row_major
-} // namespace mklgpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklgpu/mklgpu_level3.cpp b/src/blas/backends/mklgpu/mklgpu_level3.cpp
deleted file mode 100644
index 6362c4eaa..000000000
--- a/src/blas/backends/mklgpu/mklgpu_level3.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
-
-#include "../mkl_common/mkl_blas_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace mklgpu {
-namespace column_major {
-
-namespace blas_major = ::oneapi::mkl::blas::column_major;
-#include "../mkl_common/mkl_level3.cxx"
-
-} // namespace column_major
-namespace row_major {
-
-namespace blas_major = ::oneapi::mkl::blas::row_major;
-#include "../mkl_common/mkl_level3.cxx"
-
-} // namespace row_major
-} // namespace mklgpu
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/mklgpu/mklgpu_wrappers.cpp b/src/blas/backends/mklgpu/mklgpu_wrappers.cpp
deleted file mode 100644
index 39b49a93a..000000000
--- a/src/blas/backends/mklgpu/mklgpu_wrappers.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "blas/function_table.hpp"
-#include "oneapi/mkl/blas/detail/mklgpu/onemkl_blas_mklgpu.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT blas_function_table_t mkl_blas_table = {
-    WRAPPER_VERSION,
-#define BACKEND mklgpu
-#define MAJOR   column_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#define MAJOR row_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#undef BACKEND
-};
diff --git a/src/blas/backends/netlib/CMakeLists.txt b/src/blas/backends/netlib/CMakeLists.txt
deleted file mode 100644
index fd5275fc0..000000000
--- a/src/blas/backends/netlib/CMakeLists.txt
+++ /dev/null
@@ -1,76 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_blas_netlib)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-# Add third-party library 
-find_package(NETLIB REQUIRED)
-
-set(SOURCES netlib_common.hpp
-  netlib_level1.cpp netlib_level2.cpp netlib_level3.cpp netlib_batch.cpp netlib_extensions.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: netlib_wrappers.cpp>
-)
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_blas ${LIB_NAME})
-
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${CMAKE_BINARY_DIR}/bin
-          ${NETLIB_INCLUDE}
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL ${NETLIB_LINK})
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/blas/backends/netlib/netlib_batch.cpp b/src/blas/backends/netlib/netlib_batch.cpp
deleted file mode 100644
index 69197b09d..000000000
--- a/src/blas/backends/netlib/netlib_batch.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "netlib_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace netlib {
-namespace column_major {
-
-#define COLUMN_MAJOR
-#include "netlib_batch.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-#include "netlib_batch.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace netlib
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/netlib/netlib_batch.cxx b/src/blas/backends/netlib/netlib_batch.cxx
deleted file mode 100644
index 7a2839dd4..000000000
--- a/src/blas/backends/netlib/netlib_batch.cxx
+++ /dev/null
@@ -1,1620 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-                int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                int64_t incy, int64_t stridey, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<float, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                int64_t incx, int64_t stridex, sycl::buffer<double, 1> &y, int64_t incy,
-                int64_t stridey, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-void axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stridey,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<float, 1> &x, int64_t incx, int64_t stride_x, float beta,
-                sycl::buffer<float, 1> &y, int64_t incy, int64_t stride_y, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x, double beta,
-                sycl::buffer<double, 1> &y, int64_t incy, int64_t stride_y,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-                int64_t stride_x, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, int64_t incy, int64_t stride_y,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &x,
-                int64_t incx, int64_t stride_x, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, int64_t incy, int64_t stride_y,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<float, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<double, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &x, int64_t incx, int64_t stride_x,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b,
-                double beta, sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                int64_t ldb, int64_t stride_b, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                int64_t ldb, int64_t stride_b, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb, int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<sycl::half, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                int64_t k, float alpha, sycl::buffer<std::int8_t, 1> &a, int64_t lda,
-                int64_t stride_a, sycl::buffer<std::int8_t, 1> &b, int64_t ldb, int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<float, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                int64_t lda, int64_t stride_a, sycl::buffer<double, 1> &b, int64_t ldb,
-                int64_t stride_b, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, int64_t lda, int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                float beta, sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                double beta, sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                int64_t stride_a, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                int64_t lda, int64_t stride_a, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                    sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                    int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-                    int64_t stride_b, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                    int64_t lda, int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    int64_t ldb, int64_t stride_b, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   float alpha, sycl::buffer<float, 1> &a, int64_t lda, int64_t stride_a,
-                   float beta, sycl::buffer<float, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<float, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   double alpha, sycl::buffer<double, 1> &a, int64_t lda, int64_t stride_a,
-                   double beta, sycl::buffer<double, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<double, 1> &c, int64_t ldc, int64_t stride_c, int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-                   int64_t stride_a, std::complex<float> beta,
-                   sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<float>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                   std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                   int64_t lda, int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, int64_t stride_b,
-                   sycl::buffer<std::complex<double>, 1> &c, int64_t ldc, int64_t stride_c,
-                   int64_t batch_size) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-// USM APIs
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx,
-                           float **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx,
-                           double **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<float> **x,
-                           int64_t *incx, std::complex<float> **y, int64_t *incy,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex<double> **x,
-                           int64_t *incx, std::complex<double> **y, int64_t *incy,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx,
-                           std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx,
-                           std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey,
-                           std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<float> *x,
-                           int64_t incx, std::int64_t stridex, std::complex<float> *y, int64_t incy,
-                           std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex<double> *x,
-                           int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                           int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "copy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "copy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x,
-                           int64_t *incx, float **y, int64_t *incy, int64_t group_count,
-                           int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x,
-                           int64_t *incx, double **y, int64_t *incy, int64_t group_count,
-                           int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **x, int64_t *incx, std::complex<float> **y,
-                           int64_t *incy, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **x, int64_t *incx, std::complex<double> **y,
-                           int64_t *incy, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x,
-                           int64_t incx, int64_t stridex, float *y, int64_t incy, int64_t stridey,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x,
-                           int64_t incx, int64_t stridex, double *y, int64_t incy, int64_t stridey,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *x, int64_t incx, int64_t stridex,
-                           std::complex<float> *y, int64_t incy, int64_t stridey,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *x, int64_t incx, int64_t stridex,
-                           std::complex<double> *y, int64_t incy, int64_t stridey,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                           float alpha, const float *a, int64_t lda, int64_t stride_a,
-                           const float *x, int64_t incx, int64_t stride_x, float beta, float *y,
-                           int64_t incy, int64_t stride_y, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                           double alpha, const double *a, int64_t lda, int64_t stride_a,
-                           const double *x, int64_t incx, int64_t stride_x, double beta, double *y,
-                           int64_t incy, int64_t stride_y, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, const std::complex<float> *x, int64_t incx,
-                           int64_t stride_x, std::complex<float> beta, std::complex<float> *y,
-                           int64_t incy, int64_t stride_y, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, const std::complex<double> *x, int64_t incx,
-                           int64_t stride_x, std::complex<double> beta, std::complex<double> *y,
-                           int64_t incy, int64_t stride_y, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                           float *alpha, const float **a, int64_t *lda, const float **x,
-                           int64_t *incx, float *beta, float **y, int64_t *incy,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                           double *alpha, const double **a, int64_t *lda, const double **x,
-                           int64_t *incx, double *beta, double **y, int64_t *incy,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                           const std::complex<float> **x, int64_t *incx, std::complex<float> *beta,
-                           std::complex<float> **y, int64_t *incy, int64_t group_count,
-                           int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           int64_t *lda, const std::complex<double> **x, int64_t *incx,
-                           std::complex<double> *beta, std::complex<double> **y, int64_t *incy,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemv_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                           const float *a, int64_t lda, int64_t stride_a, const float *x,
-                           int64_t incx, int64_t stride_x, float *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                           const double *a, int64_t lda, int64_t stride_a, const double *x,
-                           int64_t incx, int64_t stride_x, double *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                           const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                           const std::complex<float> *x, int64_t incx, int64_t stride_x,
-                           std::complex<float> *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                           const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                           const std::complex<double> *x, int64_t incx, int64_t stride_x,
-                           std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                           const float **a, int64_t *lda, const float **x, int64_t *incx, float **c,
-                           int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                           const double **a, int64_t *lda, const double **x, int64_t *incx,
-                           double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                           const std::complex<float> **a, int64_t *lda,
-                           const std::complex<float> **x, int64_t *incx, std::complex<float> **c,
-                           int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,
-                           const std::complex<double> **a, int64_t *lda,
-                           const std::complex<double> **x, int64_t *incx, std::complex<double> **c,
-                           int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "dgmm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                           int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda,
-                           const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                           int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda,
-                           const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                           int64_t *n, int64_t *k, std::complex<float> *alpha,
-                           const std::complex<float> **a, int64_t *lda,
-                           const std::complex<float> **b, int64_t *ldb, std::complex<float> *beta,
-                           std::complex<float> **c, int64_t *ldc, int64_t group_count,
-                           int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                           int64_t *n, int64_t *k, std::complex<double> *alpha,
-                           const std::complex<double> **a, int64_t *lda,
-                           const std::complex<double> **b, int64_t *ldb, std::complex<double> *beta,
-                           std::complex<double> **c, int64_t *ldc, int64_t group_count,
-                           int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                           int64_t *n, int64_t *k, sycl::half *alpha, const sycl::half **a,
-                           int64_t *lda, const sycl::half **b, int64_t *ldb, sycl::half *beta,
-                           sycl::half **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const sycl::half **a, int64_t *lda,
-                       const sycl::half **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda,
-                       const std::int8_t **b, int64_t *ldb, float *beta, float **c, int64_t *ldc,
-                       int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,
-                       int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda,
-                       const std::int8_t **b, int64_t *ldb, float *beta, std::int32_t **c,
-                       int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                       const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                           int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
-                           int64_t stride_a, const float *b, int64_t ldb, int64_t stride_b,
-                           float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                           int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
-                           int64_t stride_a, const double *b, int64_t ldb, int64_t stride_b,
-                           double beta, double *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                           int64_t n, int64_t k, std::complex<float> alpha,
-                           const std::complex<float> *a, int64_t lda, int64_t stride_a,
-                           const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                           int64_t stride_c, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                           int64_t n, int64_t k, std::complex<double> alpha,
-                           const std::complex<double> *a, int64_t lda, int64_t stride_a,
-                           const std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                           int64_t stride_c, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                           int64_t n, int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda,
-                           int64_t stride_a, const sycl::half *b, int64_t ldb, int64_t stride_b,
-                           sycl::half beta, sycl::half *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const sycl::half *a, int64_t lda, int64_t stride_a,
-                       const sycl::half *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a,
-                       const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, float *c,
-                       int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                       int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a,
-                       const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta,
-                       std::int32_t *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha,
-                           const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha,
-                           const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower,
-                           transpose trans, diag unit_diag, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, float *alpha,
-                           const float **a, int64_t *lda, float **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, double *alpha,
-                           const double **a, int64_t *lda, double **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, const std::complex<float> **a, int64_t *lda,
-                           std::complex<float> **b, int64_t *ldb, int64_t group_count,
-                           int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, const std::complex<double> **a,
-                           int64_t *lda, std::complex<double> **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "trsm_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                           int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta,
-                           float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                           int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta,
-                           double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                           int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-                           int64_t *lda, std::complex<float> *beta, std::complex<float> **c,
-                           int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,
-                           int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
-                           int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                           int64_t *ldc, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                           int64_t k, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                           float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                           int64_t k, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                           double beta, double *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                           int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                           int64_t lda, int64_t stride_a, std::complex<float> beta,
-                           std::complex<float> *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                           int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                           int64_t lda, int64_t stride_a, std::complex<double> beta,
-                           std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "syrk_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb,
-                           int64_t stride_b, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                           int64_t stride_a, std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                           int64_t stride_a, std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a,
-                          float beta, const float *b, int64_t ldb, int64_t stride_b, float *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a,
-                          double beta, const double *b, int64_t ldb, int64_t stride_b, double *c,
-                          int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                          int64_t lda, int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<float> *c, int64_t ldc, int64_t stride_c, int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                          int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                          int64_t lda, int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, int64_t ldb, int64_t stride_b,
-                          std::complex<double> *c, int64_t ldc, int64_t stride_c,
-                          int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, const std::complex<float>** a, int64_t* lda,
-                           std::complex<float>** b, int64_t* ldb, int64_t group_count,
-                           int64_t* groupsize, const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, const std::complex<double>** a,
-                           int64_t* lda, std::complex<double>** b, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           float* alpha, float** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           double* alpha, double** ab, int64_t* lda, int64_t* ldb,
-                           int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<float>* alpha, std::complex<float>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n,
-                           std::complex<double>* alpha, std::complex<double>** ab, int64_t* lda,
-                           int64_t* ldb, int64_t group_count, int64_t* groupsize,
-                           const std::vector<sycl::event>& dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-#endif
-}
diff --git a/src/blas/backends/netlib/netlib_common.hpp b/src/blas/backends/netlib/netlib_common.hpp
deleted file mode 100644
index 3a69c70f8..000000000
--- a/src/blas/backends/netlib/netlib_common.hpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _NETLIB_COMMON_HPP_
-#define _NETLIB_COMMON_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <complex>
-
-#include "cblas.h"
-
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-#include "oneapi/mkl/types.hpp"
-
-#define GET_MULTI_PTR template get_multi_ptr<sycl::access::decorated::yes>().get_raw()
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace netlib {
-
-typedef enum { CblasFixOffset = 101, CblasColOffset = 102, CblasRowOffset = 103 } CBLAS_OFFSET;
-
-/**
- * Helper methods for converting between onemkl types and their BLAS
- * equivalents.
- */
-
-inline CBLAS_TRANSPOSE convert_to_cblas_trans(transpose trans) {
-    if (trans == transpose::trans)
-        return CBLAS_TRANSPOSE::CblasTrans;
-    else if (trans == transpose::conjtrans)
-        return CBLAS_TRANSPOSE::CblasConjTrans;
-    else
-        return CBLAS_TRANSPOSE::CblasNoTrans;
-}
-
-inline CBLAS_UPLO convert_to_cblas_uplo(uplo is_upper) {
-    return is_upper == uplo::upper ? CBLAS_UPLO::CblasUpper : CBLAS_UPLO::CblasLower;
-}
-
-inline CBLAS_DIAG convert_to_cblas_diag(diag is_unit) {
-    return is_unit == diag::unit ? CBLAS_DIAG::CblasUnit : CBLAS_DIAG::CblasNonUnit;
-}
-
-inline CBLAS_SIDE convert_to_cblas_side(side is_left) {
-    return is_left == side::left ? CBLAS_SIDE::CblasLeft : CBLAS_SIDE::CblasRight;
-}
-
-inline CBLAS_OFFSET convert_to_cblas_offset(offset offsetc) {
-    if (offsetc == offset::fix)
-        return CBLAS_OFFSET::CblasFixOffset;
-    else if (offsetc == offset::column)
-        return CBLAS_OFFSET::CblasColOffset;
-    else
-        return CBLAS_OFFSET::CblasRowOffset;
-}
-
-// host_task automatically uses run_on_host_intel if it is supported by the
-//  compiler. Otherwise, it falls back to single_task.
-template <typename K, typename H, typename F>
-static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.host_task(f)) {
-    return cgh.host_task(f);
-}
-
-template <typename K, typename H, typename F>
-static inline void host_task_internal(H &cgh, F f, long) {
-#ifndef __SYCL_DEVICE_ONLY__
-    cgh.template single_task<K>(f);
-#endif
-}
-
-template <typename K, typename H, typename F>
-static inline void host_task(H &cgh, F f) {
-    (void)host_task_internal<K>(cgh, f, 0);
-}
-
-} // namespace netlib
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_NETLIB_COMMON_HPP_
diff --git a/src/blas/backends/netlib/netlib_extensions.cpp b/src/blas/backends/netlib/netlib_extensions.cpp
deleted file mode 100644
index 4815ba598..000000000
--- a/src/blas/backends/netlib/netlib_extensions.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "netlib_common.hpp"
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace netlib {
-namespace column_major {
-
-#define COLUMN_MAJOR
-#include "netlib_extensions.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-#include "netlib_extensions.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace netlib
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/netlib/netlib_extensions.cxx b/src/blas/backends/netlib/netlib_extensions.cxx
deleted file mode 100644
index 8e94cb880..000000000
--- a/src/blas/backends/netlib/netlib_extensions.cxx
+++ /dev/null
@@ -1,585 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a,
-               int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo,
-               float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a,
-               int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo,
-               float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a,
-               int64_t lda, uint8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo,
-               float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-               int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a,
-               int64_t lda, uint8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo,
-               float beta, sycl::buffer<int32_t, 1> &c, int64_t ldc,
-               sycl::buffer<int32_t, 1> &co) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-              sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-               sycl::buffer<float, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<float, 1> &b, int64_t ldb, std::int64_t strideb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-               sycl::buffer<double, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<double, 1> &b, int64_t ldb, std::int64_t strideb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-               sycl::buffer<std::complex<float>, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::int64_t strideb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-               std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-               std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-               std::int64_t strideb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             float alpha, sycl::buffer<float, 1> &a, int64_t lda, float beta,
-             sycl::buffer<float, 1> &b, int64_t ldb, sycl::buffer<float, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             double alpha, sycl::buffer<double, 1> &a, int64_t lda, double beta,
-             sycl::buffer<double, 1> &b, int64_t ldb, sycl::buffer<double, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-             std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-             std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-             std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-// USM APIs
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-                          const int8_t *a, int64_t lda, int8_t ao, const int8_t *b, int64_t ldb,
-                          int8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-                          const int8_t *a, int64_t lda, int8_t ao, const uint8_t *b, int64_t ldb,
-                          uint8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-                          const uint8_t *a, int64_t lda, uint8_t ao, const int8_t *b, int64_t ldb,
-                          int8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb,
-                          offset offsetc, int64_t m, int64_t n, int64_t k, float alpha,
-                          const uint8_t *a, int64_t lda, uint8_t ao, const uint8_t *b, int64_t ldb,
-                          uint8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co,
-                          const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-#endif
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      int64_t n, int64_t k, float alpha, const float *a, int64_t lda,
-                      const float *b, int64_t ldb, float beta, float *c, int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      int64_t n, int64_t k, double alpha, const double *a, int64_t lda,
-                      const double *b, int64_t ldb, double beta, double *c, int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                      int64_t lda, const std::complex<float> *b, int64_t ldb,
-                      std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                      int64_t n, int64_t k, std::complex<double> alpha,
-                      const std::complex<double> *a, int64_t lda, const std::complex<double> *b,
-                      int64_t ldb, std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                      const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     const float *a, int64_t lda, float *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     const double *a, int64_t lda, double *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                     std::complex<float> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                     std::complex<double> *b, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                      const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                      const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<float> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                      std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                      std::int64_t stridea, std::complex<double> *b, int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatcopy2", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    float alpha, const float *a, int64_t lda, float beta, const float *b,
-                    int64_t ldb, float *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    double alpha, const double *a, int64_t lda, double beta, const double *b,
-                    int64_t ldb, double *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                    std::complex<float> beta, const std::complex<float> *b, int64_t ldb,
-                    std::complex<float> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                    std::complex<double> beta, const std::complex<double> *b, int64_t ldb,
-                    std::complex<double> *c, int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "omatadd", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "omatadd", "for row_major layout");
-#endif
-}
-
-
diff --git a/src/blas/backends/netlib/netlib_level1.cpp b/src/blas/backends/netlib/netlib_level1.cpp
deleted file mode 100644
index 59830db81..000000000
--- a/src/blas/backends/netlib/netlib_level1.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "netlib_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-
-inline float abs_val(float val) {
-    return std::abs(val);
-}
-
-inline double abs_val(double val) {
-    return std::abs(val);
-}
-
-inline float abs_val(std::complex<float> val) {
-    return std::abs(val.real()) + std::abs(val.imag());
-}
-
-inline double abs_val(std::complex<double> val) {
-    return std::abs(val.real()) + std::abs(val.imag());
-}
-
-int cblas_isamin(int n, const float *x, int incx) {
-    if (n < 1 || incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < n; ++logical_i) {
-        int i = logical_i * std::abs(incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-int cblas_idamin(int n, const double *x, int incx) {
-    if (n < 1 || incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < n; ++logical_i) {
-        int i = logical_i * std::abs(incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-int cblas_icamin(int n, const std::complex<float> *x, int incx) {
-    if (n < 1 || incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < n; ++logical_i) {
-        int i = logical_i * std::abs(incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-int cblas_izamin(int n, const std::complex<double> *x, int incx) {
-    if (n < 1 || incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < n; ++logical_i) {
-        int i = logical_i * std::abs(incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-void cblas_csrot(const int n, std::complex<float> *cx, const int incx, std::complex<float> *cy,
-                 const int incy, const float c, const float s) {
-    if (n < 1)
-        return;
-    if (incx == 1 && incy == 1) {
-        for (int i = 0; i < n; i++) {
-            std::complex<float> ctemp = c * cx[i] + s * cy[i];
-            cy[i] = c * cy[i] - s * cx[i];
-            cx[i] = ctemp;
-        }
-    }
-    else {
-        int ix = 0, iy = 0;
-        if (incx < 0)
-            ix = (-n + 1) * incx;
-        if (incy < 0)
-            iy = (-n + 1) * incy;
-        for (int i = 0; i < n; i++) {
-            std::complex<float> ctemp = c * cx[ix] + s * cy[iy];
-            cy[iy] = c * cy[iy] - s * cx[ix];
-            cx[ix] = ctemp;
-            ix = ix + incx;
-            iy = iy + incy;
-        }
-    }
-}
-
-void cblas_zdrot(const int n, std::complex<double> *zx, const int incx, std::complex<double> *zy,
-                 const int incy, const double c, const double s) {
-    if (n < 1)
-        return;
-    if (incx == 1 && incy == 1) {
-        for (int i = 0; i < n; i++) {
-            std::complex<double> ctemp = c * zx[i] + s * zy[i];
-            zy[i] = c * zy[i] - s * zx[i];
-            zx[i] = ctemp;
-        }
-    }
-    else {
-        int ix = 0, iy = 0;
-        if (incx < 0)
-            ix = (-n + 1) * incx;
-        if (incy < 0)
-            iy = (-n + 1) * incy;
-        for (int i = 0; i < n; i++) {
-            std::complex<double> ctemp = c * zx[ix] + s * zy[iy];
-            zy[iy] = c * zy[iy] - s * zx[ix];
-            zx[ix] = ctemp;
-            ix = ix + incx;
-            iy = iy + incy;
-        }
-    }
-}
-
-void cblas_crotg(std::complex<float> *ca, const std::complex<float> *cb, float *c,
-                 std::complex<float> *s) {
-    if (std::abs(ca[0]) == 0) {
-        c[0] = 0.0;
-        s[0] = std::complex<float>(1.0, 0.0);
-        ca[0] = cb[0];
-    }
-    else {
-        float scale = std::abs(ca[0]) + std::abs(cb[0]);
-        float norm = scale * std::sqrt(std::pow(std::abs(ca[0] / scale), 2) +
-                                       std::pow(std::abs(cb[0] / scale), 2));
-        std::complex<float> alpha = ca[0] / std::abs(ca[0]);
-        c[0] = std::abs(ca[0]) / norm;
-        s[0] = alpha * std::conj(cb[0]) / norm;
-        ca[0] = alpha * norm;
-    }
-}
-
-void cblas_zrotg(std::complex<double> *ca, const std::complex<double> *cb, double *c,
-                 std::complex<double> *s) {
-    if (std::abs(ca[0]) == 0) {
-        c[0] = 0.0;
-        s[0] = std::complex<double>(1.0, 0.0);
-        ca[0] = cb[0];
-    }
-    else {
-        double scale = std::abs(ca[0]) + std::abs(cb[0]);
-        double norm = scale * std::sqrt(std::pow(std::abs(ca[0] / scale), 2) +
-                                        std::pow(std::abs(cb[0] / scale), 2));
-        std::complex<double> alpha = ca[0] / std::abs(ca[0]);
-        c[0] = std::abs(ca[0]) / norm;
-        s[0] = alpha * std::conj(cb[0]) / norm;
-        ca[0] = alpha * norm;
-    }
-}
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace netlib {
-namespace column_major {
-
-#define COLUMN_MAJOR
-#include "netlib_level1.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-#include "netlib_level1.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace netlib
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/netlib/netlib_level1.cxx b/src/blas/backends/netlib/netlib_level1.cxx
deleted file mode 100644
index 9f953dc5b..000000000
--- a/src/blas/backends/netlib/netlib_level1.cxx
+++ /dev/null
@@ -1,1525 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_sasum>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_sasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_dasum>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_dasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_scasum>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_scasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void asum(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_dzasum>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_dzasum((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void axpy(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_saxpy>(cgh, [=]() {
-            ::cblas_saxpy((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void axpy(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_daxpy>(cgh, [=]() {
-            ::cblas_daxpy((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void axpy(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_caxpy>(cgh, [=]() {
-            ::cblas_caxpy((const int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void axpy(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zaxpy>(cgh, [=]() {
-            ::cblas_zaxpy((const int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-           float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-           double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_scopy>(cgh, [=]() {
-            ::cblas_scopy((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dcopy>(cgh, [=]() {
-            ::cblas_dcopy((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ccopy>(cgh, [=]() {
-            ::cblas_ccopy((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void copy(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zcopy>(cgh, [=]() {
-            ::cblas_zcopy((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_sdot>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_sdot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                             accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-         sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_ddot>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_ddot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                             accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_dsdot>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_dsdot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                              accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void dotc(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cdotc>(cgh, [=]() {
-            ::cblas_cdotc_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                              accessor_y.GET_MULTI_PTR, (const int)incy,
-                              accessor_result.GET_MULTI_PTR);
-        });
-    });
-}
-
-void dotc(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zdotc>(cgh, [=]() {
-            ::cblas_zdotc_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                              accessor_y.GET_MULTI_PTR, (const int)incy,
-                              accessor_result.GET_MULTI_PTR);
-        });
-    });
-}
-
-void dotu(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cdotu>(cgh, [=]() {
-            ::cblas_cdotu_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                              accessor_y.GET_MULTI_PTR, (const int)incy,
-                              accessor_result.GET_MULTI_PTR);
-        });
-    });
-}
-
-void dotu(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zdotu>(cgh, [=]() {
-            ::cblas_zdotu_sub((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                              accessor_y.GET_MULTI_PTR, (const int)incy,
-                              accessor_result.GET_MULTI_PTR);
-        });
-    });
-}
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_isamin>(cgh, [=]() {
-            accessor_result[0] = ::cblas_isamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.template get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.template get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_idamin>(cgh, [=]() {
-            accessor_result[0] = ::cblas_idamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_icamin>(cgh, [=]() {
-            accessor_result[0] = ::cblas_icamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void iamin(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_izamin>(cgh, [=]() {
-            accessor_result[0] = ::cblas_izamin((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_isamax>(cgh, [=]() {
-            accessor_result[0] = ::cblas_isamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_idamax>(cgh, [=]() {
-            accessor_result[0] = ::cblas_idamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_icamax>(cgh, [=]() {
-            accessor_result[0] = ::cblas_icamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void iamax(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-           sycl::buffer<int64_t, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_izamax>(cgh, [=]() {
-            accessor_result[0] = ::cblas_izamax((int)n, accessor_x.GET_MULTI_PTR, (int)incx);
-        });
-    });
-}
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.template get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.template get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_snrm2>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_snrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_dnrm2>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_dnrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_scnrm2>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_scnrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_dznrm2>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_dznrm2((const int)n, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx));
-        });
-    });
-}
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, float c, float s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_srot>(cgh, [=]() {
-            ::cblas_srot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_y.GET_MULTI_PTR, (const int)incy, (const float)c, (const float)s);
-        });
-    });
-}
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-         sycl::buffer<double, 1> &y, int64_t incy, double c, double s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_drot>(cgh, [=]() {
-            ::cblas_drot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_y.GET_MULTI_PTR, (const int)incy, (const float)c, (const float)s);
-        });
-    });
-}
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, int64_t incy, float c, float s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_csrot>(cgh, [=]() {
-            ::cblas_csrot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, (const float)c,
-                          (const float)s);
-        });
-    });
-}
-
-void rot(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, int64_t incy, double c, double s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zdrot>(cgh, [=]() {
-            ::cblas_zdrot((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, (const double)c,
-                          (const double)s);
-        });
-    });
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &b,
-          sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_s = s.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_srotg>(cgh, [=]() {
-            ::cblas_srotg(accessor_a.GET_MULTI_PTR, accessor_b.GET_MULTI_PTR,
-                          accessor_c.GET_MULTI_PTR, accessor_s.GET_MULTI_PTR);
-        });
-    });
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &b,
-          sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_s = s.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_drotg>(cgh, [=]() {
-            ::cblas_drotg(accessor_a.GET_MULTI_PTR, accessor_b.GET_MULTI_PTR,
-                          accessor_c.GET_MULTI_PTR, accessor_s.GET_MULTI_PTR);
-        });
-    });
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_s = s.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_crotg>(cgh, [=]() {
-            ::cblas_crotg(accessor_a.GET_MULTI_PTR, accessor_b.GET_MULTI_PTR,
-                          accessor_c.GET_MULTI_PTR, accessor_s.GET_MULTI_PTR);
-        });
-    });
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_s = s.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zrotg>(cgh, [=]() {
-            ::cblas_zrotg(accessor_a.GET_MULTI_PTR, accessor_b.GET_MULTI_PTR,
-                          accessor_c.GET_MULTI_PTR, accessor_s.GET_MULTI_PTR);
-        });
-    });
-}
-
-void rotm(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &param) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_param = param.get_access<sycl::access::mode::read>(cgh);
-        host_task<class netlib_srotm>(cgh, [=]() {
-            ::cblas_srotm((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_param.GET_MULTI_PTR);
-        });
-    });
-}
-
-void rotm(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &param) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_param = param.get_access<sycl::access::mode::read>(cgh);
-        host_task<class netlib_drotm>(cgh, [=]() {
-            ::cblas_drotm((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_param.GET_MULTI_PTR);
-        });
-    });
-}
-
-void rotmg(sycl::queue &queue, sycl::buffer<float, 1> &d1, sycl::buffer<float, 1> &d2,
-           sycl::buffer<float, 1> &x1, float y1, sycl::buffer<float, 1> &param) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_d1 = d1.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_d2 = d2.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_x1 = x1.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_param = param.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_srotmg>(cgh, [=]() {
-            ::cblas_srotmg(accessor_d1.GET_MULTI_PTR, accessor_d2.GET_MULTI_PTR,
-                           accessor_x1.GET_MULTI_PTR, (float)y1, accessor_param.GET_MULTI_PTR);
-        });
-    });
-}
-
-void rotmg(sycl::queue &queue, sycl::buffer<double, 1> &d1, sycl::buffer<double, 1> &d2,
-           sycl::buffer<double, 1> &x1, double y1, sycl::buffer<double, 1> &param) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_d1 = d1.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_d2 = d2.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_x1 = x1.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_param = param.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_drotmg>(cgh, [=]() {
-            ::cblas_drotmg(accessor_d1.GET_MULTI_PTR, accessor_d2.GET_MULTI_PTR,
-                           accessor_x1.GET_MULTI_PTR, (double)y1, accessor_param.GET_MULTI_PTR);
-        });
-    });
-}
-
-void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sscal>(cgh, [=]() {
-            ::cblas_sscal((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)std::abs(incx));
-        });
-    });
-}
-
-void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dscal>(cgh, [=]() {
-            ::cblas_dscal((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)std::abs(incx));
-        });
-    });
-}
-
-void scal(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cscal>(cgh, [=]() {
-            ::cblas_cscal((const int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)std::abs(incx));
-        });
-    });
-}
-
-void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_csscal>(cgh, [=]() {
-            ::cblas_csscal((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR,
-                           (const int)std::abs(incx));
-        });
-    });
-}
-
-void scal(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zscal>(cgh, [=]() {
-            ::cblas_zscal((const int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR,
-                          (const int)std::abs(incx));
-        });
-    });
-}
-
-void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zdscal>(cgh, [=]() {
-            ::cblas_zdscal((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR,
-                           (const int)std::abs(incx));
-        });
-    });
-}
-
-void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer<float, 1> &x, int64_t incx,
-            sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_result = result.get_access<sycl::access::mode::write>(cgh);
-        host_task<class netlib_sdsdot>(cgh, [=]() {
-            accessor_result[0] =
-                ::cblas_sdsdot((const int)n, (const float)sb, accessor_x.GET_MULTI_PTR,
-                               (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-          sycl::buffer<float, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sswap>(cgh, [=]() {
-            ::cblas_sswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<double, 1> &x, int64_t incx,
-          sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dswap>(cgh, [=]() {
-            ::cblas_dswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cswap>(cgh, [=]() {
-            ::cblas_cswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void swap(sycl::queue &queue, int64_t n, sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zswap>(cgh, [=]() {
-            ::cblas_zswap((const int)n, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-// USM APIs
-
-sycl::event asum(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sasum_usm>(
-            cgh, [=]() { result[0] = ::cblas_sasum((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event asum(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dasum_usm>(
-            cgh, [=]() { result[0] = ::cblas_dasum((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event asum(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_scasum_usm>(
-            cgh, [=]() { result[0] = ::cblas_scasum((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event asum(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dzasum_usm>(
-            cgh, [=]() { result[0] = ::cblas_dzasum((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event axpy(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_saxpy_usm>(cgh, [=]() {
-            ::cblas_saxpy((const int)n, (const float)alpha, x, (const int)incx, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event axpy(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_daxpy_usm>(cgh, [=]() {
-            ::cblas_daxpy((const int)n, (const double)alpha, x, (const int)incx, y,
-                          (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event axpy(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_caxpy_usm>(cgh, [=]() {
-            ::cblas_caxpy((const int)n, (const void *)&alpha, x, (const int)incx, y,
-                          (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event axpy(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zaxpy_usm>(cgh, [=]() {
-            ::cblas_zaxpy((const int)n, (const void *)&alpha, x, (const int)incx, y,
-                          (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                  float beta, float *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                  double beta, double *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                  std::complex<float> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                  const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                  std::complex<double> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "axpby", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "axpby", "for row_major layout");
-#endif
-}
-
-sycl::event copy(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_scopy_usm>(
-            cgh, [=]() { ::cblas_scopy((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
-
-sycl::event copy(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dcopy_usm>(
-            cgh, [=]() { ::cblas_dcopy((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
-
-sycl::event copy(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ccopy_usm>(
-            cgh, [=]() { ::cblas_ccopy((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
-
-sycl::event copy(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zcopy_usm>(
-            cgh, [=]() { ::cblas_zcopy((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sdot_usm>(cgh, [=]() {
-            result[0] = ::cblas_sdot((const int)n, x, (const int)incx, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event dot(sycl::queue &queue, int64_t n, const double *x, int64_t incx, const double *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ddot_usm>(cgh, [=]() {
-            result[0] = ::cblas_ddot((const int)n, x, (const int)incx, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsdot_usm>(cgh, [=]() {
-            result[0] = ::cblas_dsdot((const int)n, x, (const int)incx, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cdotc_usm>(cgh, [=]() {
-            ::cblas_cdotc_sub((const int)n, x, (const int)incx, y, (const int)incy, result);
-        });
-    });
-    return done;
-}
-
-sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zdotc_usm>(cgh, [=]() {
-            ::cblas_zdotc_sub((const int)n, x, (const int)incx, y, (const int)incy, result);
-        });
-    });
-    return done;
-}
-
-sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 const std::complex<float> *y, int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cdotu_usm>(cgh, [=]() {
-            ::cblas_cdotu_sub((const int)n, x, (const int)incx, y, (const int)incy, result);
-        });
-    });
-    return done;
-}
-
-sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 const std::complex<double> *y, int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zdotu_usm>(cgh, [=]() {
-            ::cblas_zdotu_sub((const int)n, x, (const int)incx, y, (const int)incy, result);
-        });
-    });
-    return done;
-}
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_isamin_usm>(
-            cgh, [=]() { result[0] = ::cblas_isamin((int)n, x, (int)incx); });
-    });
-    return done;
-}
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_idamin_usm>(
-            cgh, [=]() { result[0] = ::cblas_idamin((const int)n, x, (const int)incx); });
-    });
-    return done;
-}
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_icamin_usm>(
-            cgh, [=]() { result[0] = ::cblas_icamin((int)n, x, (int)incx); });
-    });
-    return done;
-}
-
-sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_izamin_usm>(
-            cgh, [=]() { result[0] = ::cblas_izamin((int)n, x, (int)incx); });
-    });
-    return done;
-}
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_isamax_usm>(
-            cgh, [=]() { result[0] = ::cblas_isamax((int)n, x, (int)incx); });
-    });
-    return done;
-}
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_idamax_usm>(
-            cgh, [=]() { result[0] = ::cblas_idamax((int)n, x, (int)incx); });
-    });
-    return done;
-}
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_icamax_usm>(
-            cgh, [=]() { result[0] = ::cblas_icamax((int)n, x, (int)incx); });
-    });
-    return done;
-}
-
-sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                  int64_t *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_izamax_usm>(
-            cgh, [=]() { result[0] = ::cblas_izamax((int)n, x, (int)incx); });
-    });
-    return done;
-}
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_snrm2_usm>(
-            cgh, [=]() { result[0] = ::cblas_snrm2((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dnrm2_usm>(
-            cgh, [=]() { result[0] = ::cblas_dnrm2((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex<float> *x, int64_t incx,
-                 float *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_scnrm2_usm>(
-            cgh, [=]() { result[0] = ::cblas_scnrm2((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex<double> *x, int64_t incx,
-                 double *result, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dznrm2_usm>(
-            cgh, [=]() { result[0] = ::cblas_dznrm2((const int)n, x, (const int)std::abs(incx)); });
-    });
-    return done;
-}
-
-sycl::event rot(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy,
-                float c, float s, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_srot_usm>(cgh, [=]() {
-            ::cblas_srot((const int)n, x, (const int)incx, y, (const int)incy, (const float)c,
-                         (const float)s);
-        });
-    });
-    return done;
-}
-
-sycl::event rot(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy,
-                double c, double s, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_drot_usm>(cgh, [=]() {
-            ::cblas_drot((const int)n, x, (const int)incx, y, (const int)incy, (const float)c,
-                         (const float)s);
-        });
-    });
-    return done;
-}
-
-sycl::event rot(sycl::queue &queue, int64_t n, std::complex<float> *x, int64_t incx,
-                std::complex<float> *y, int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_csrot_usm>(cgh, [=]() {
-            ::cblas_csrot((const int)n, x, (const int)incx, y, (const int)incy, (const float)c,
-                          (const float)s);
-        });
-    });
-    return done;
-}
-
-sycl::event rot(sycl::queue &queue, int64_t n, std::complex<double> *x, int64_t incx,
-                std::complex<double> *y, int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zdrot_usm>(cgh, [=]() {
-            ::cblas_zdrot((const int)n, x, (const int)incx, y, (const int)incy, (const double)c,
-                          (const double)s);
-        });
-    });
-    return done;
-}
-
-sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_srotg_usm>(cgh, [=]() { ::cblas_srotg(a, b, c, s); });
-    });
-    return done;
-}
-
-sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_drotg_usm>(cgh, [=]() { ::cblas_drotg(a, b, c, s); });
-    });
-    return done;
-}
-
-sycl::event rotg(sycl::queue &queue, std::complex<float> *a, std::complex<float> *b, float *c,
-                 std::complex<float> *s, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_crotg_usm>(cgh, [=]() { ::cblas_crotg(a, b, c, s); });
-    });
-    return done;
-}
-
-sycl::event rotg(sycl::queue &queue, std::complex<double> *a, std::complex<double> *b, double *c,
-                 std::complex<double> *s, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zrotg_usm>(cgh, [=]() { ::cblas_zrotg(a, b, c, s); });
-    });
-    return done;
-}
-
-sycl::event rotm(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy,
-                 float *param, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_srotm_usm>(cgh, [=]() {
-            ::cblas_srotm((const int)n, x, (const int)incx, y, (const int)incy, param);
-        });
-    });
-    return done;
-}
-
-sycl::event rotm(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy,
-                 double *param, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_drotm_usm>(cgh, [=]() {
-            ::cblas_drotm((const int)n, x, (const int)incx, y, (const int)incy, param);
-        });
-    });
-    return done;
-}
-
-sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_srotmg_usm>(cgh,
-                                           [=]() { ::cblas_srotmg(d1, d2, x1, (float)y1, param); });
-    });
-    return done;
-}
-
-sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_drotmg_usm>(
-            cgh, [=]() { ::cblas_drotmg(d1, d2, x1, (double)y1, param); });
-    });
-    return done;
-}
-
-sycl::event scal(sycl::queue &queue, int64_t n, float alpha, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sscal_usm>(cgh, [=]() {
-            ::cblas_sscal((const int)n, (const float)alpha, x, (const int)std::abs(incx));
-        });
-    });
-    return done;
-}
-
-sycl::event scal(sycl::queue &queue, int64_t n, double alpha, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dscal_usm>(cgh, [=]() {
-            ::cblas_dscal((const int)n, (const double)alpha, x, (const int)std::abs(incx));
-        });
-    });
-    return done;
-}
-
-sycl::event scal(sycl::queue &queue, int64_t n, std::complex<float> alpha, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cscal_usm>(cgh, [=]() {
-            ::cblas_cscal((const int)n, (const void *)&alpha, x, (const int)std::abs(incx));
-        });
-    });
-    return done;
-}
-
-sycl::event scal(sycl::queue &queue, int64_t n, float alpha, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_csscal_usm>(cgh, [=]() {
-            ::cblas_csscal((const int)n, (const float)alpha, x, (const int)std::abs(incx));
-        });
-    });
-    return done;
-}
-
-sycl::event scal(sycl::queue &queue, int64_t n, std::complex<double> alpha, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zscal_usm>(cgh, [=]() {
-            ::cblas_zscal((const int)n, (const void *)&alpha, x, (const int)std::abs(incx));
-        });
-    });
-    return done;
-}
-
-sycl::event scal(sycl::queue &queue, int64_t n, double alpha, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zdscal_usm>(cgh, [=]() {
-            ::cblas_zdscal((const int)n, (const double)alpha, x, (const int)std::abs(incx));
-        });
-    });
-    return done;
-}
-
-sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
-                   const float *y, int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sdsdot_usm>(cgh, [=]() {
-            result[0] = ::cblas_sdsdot((const int)n, (const float)sb, x, (const int)incx, y,
-                                       (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event swap(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sswap_usm>(
-            cgh, [=]() { ::cblas_sswap((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
-
-sycl::event swap(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dswap_usm>(
-            cgh, [=]() { ::cblas_dswap((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
-
-sycl::event swap(sycl::queue &queue, int64_t n, std::complex<float> *x, int64_t incx,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cswap_usm>(
-            cgh, [=]() { ::cblas_cswap((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
-
-sycl::event swap(sycl::queue &queue, int64_t n, std::complex<double> *x, int64_t incx,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zswap_usm>(
-            cgh, [=]() { ::cblas_zswap((const int)n, x, (const int)incx, y, (const int)incy); });
-    });
-    return done;
-}
diff --git a/src/blas/backends/netlib/netlib_level2.cpp b/src/blas/backends/netlib/netlib_level2.cpp
deleted file mode 100644
index fb63bf3a9..000000000
--- a/src/blas/backends/netlib/netlib_level2.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "netlib_common.hpp"
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace netlib {
-namespace column_major {
-
-#define MAJOR CblasColMajor
-#include "netlib_level2.cxx"
-#undef MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define MAJOR CblasRowMajor
-#include "netlib_level2.cxx"
-#undef MAJOR
-
-} // namespace row_major
-} // namespace netlib
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/netlib/netlib_level2.cxx b/src/blas/backends/netlib/netlib_level2.cxx
deleted file mode 100644
index 156ed133b..000000000
--- a/src/blas/backends/netlib/netlib_level2.cxx
+++ /dev/null
@@ -1,2138 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          float alpha, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx, float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sgbmv>(cgh, [=]() {
-            ::cblas_sgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const float)alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const float)beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx, double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dgbmv>(cgh, [=]() {
-            ::cblas_dgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const double)alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const double)beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cgbmv>(cgh, [=]() {
-            ::cblas_cgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const void *)&alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zgbmv>(cgh, [=]() {
-            ::cblas_zgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const void *)&alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sgemv>(cgh, [=]() {
-            ::cblas_sgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const float)beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dgemv>(cgh, [=]() {
-            ::cblas_dgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const double)beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cgemv>(cgh, [=]() {
-            ::cblas_cgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zgemv>(cgh, [=]() {
-            ::cblas_zgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &a,
-         int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sger>(cgh, [=]() {
-            ::cblas_sger(MAJOR, (const int)m, (const int)n, (const float)alpha,
-                         accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR,
-                         (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &a,
-         int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dger>(cgh, [=]() {
-            ::cblas_dger(MAJOR, (const int)m, (const int)n, (const double)alpha,
-                         accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR,
-                         (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cgerc>(cgh, [=]() {
-            ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR,
-                          (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zgerc>(cgh, [=]() {
-            ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR,
-                          (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cgeru>(cgh, [=]() {
-            ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void *)&alpha,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR,
-                          (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zgeru>(cgh, [=]() {
-            ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void *)&alpha,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR,
-                          (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_chbmv>(cgh, [=]() {
-            ::cblas_chbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zhbmv>(cgh, [=]() {
-            ::cblas_zhbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_chemv>(cgh, [=]() {
-            ::cblas_chemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zhemv>(cgh, [=]() {
-            ::cblas_zhemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cher>(cgh, [=]() {
-            ::cblas_cher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zher>(cgh, [=]() {
-            ::cblas_zher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cher2>(cgh, [=]() {
-            ::cblas_cher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR,
-                          (const int)lda);
-        });
-    });
-}
-
-void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zher2>(cgh, [=]() {
-            ::cblas_zher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR,
-                          (const int)lda);
-        });
-    });
-}
-
-void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &ap, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-          int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_chpmv>(cgh, [=]() {
-            ::cblas_chpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &ap, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-          int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zhpmv>(cgh, [=]() {
-            ::cblas_zhpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-         sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_chpr>(cgh, [=]() {
-            ::cblas_chpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-         sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zhpr>(cgh, [=]() {
-            ::cblas_zhpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_chpr2>(cgh, [=]() {
-            ::cblas_chpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zhpr2>(cgh, [=]() {
-            ::cblas_zhpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ssbmv>(cgh, [=]() {
-            ::cblas_ssbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const float)beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dsbmv>(cgh, [=]() {
-            ::cblas_dsbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const double)beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &ap,
-          sycl::buffer<float, 1> &x, int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sspmv>(cgh, [=]() {
-            ::cblas_sspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const float)beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-          sycl::buffer<double, 1> &ap, sycl::buffer<double, 1> &x, int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dspmv>(cgh, [=]() {
-            ::cblas_dspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR,
-                          (const int)incx, (const double)beta, accessor_y.GET_MULTI_PTR,
-                          (const int)incy);
-        });
-    });
-}
-
-void spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sspr>(cgh, [=]() {
-            ::cblas_sspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dspr>(cgh, [=]() {
-            ::cblas_dspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sspr2>(cgh, [=]() {
-            ::cblas_sspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &ap) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_ap = ap.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dspr2>(cgh, [=]() {
-            ::cblas_dspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR);
-        });
-    });
-}
-
-void symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &a,
-          int64_t lda, sycl::buffer<float, 1> &x, int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ssymv>(cgh, [=]() {
-            ::cblas_ssymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const float)beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          int64_t lda, sycl::buffer<double, 1> &x, int64_t incx, double beta,
-          sycl::buffer<double, 1> &y, int64_t incy) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dsymv>(cgh, [=]() {
-            ::cblas_dsymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_x.GET_MULTI_PTR, (const int)incx, (const double)beta,
-                          accessor_y.GET_MULTI_PTR, (const int)incy);
-        });
-    });
-}
-
-void syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-         int64_t incx, sycl::buffer<float, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ssyr>(cgh, [=]() {
-            ::cblas_ssyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-         int64_t incx, sycl::buffer<double, 1> &a, int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dsyr>(cgh, [=]() {
-            ::cblas_dsyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                         accessor_a.GET_MULTI_PTR, (const int)lda);
-        });
-    });
-}
-
-void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer<float, 1> &x,
-          int64_t incx, sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &a,
-          int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ssyr2>(cgh, [=]() {
-            ::cblas_ssyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR,
-                          (const int)lda);
-        });
-    });
-}
-
-void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer<double, 1> &x,
-          int64_t incx, sycl::buffer<double, 1> &y, int64_t incy, sycl::buffer<double, 1> &a,
-          int64_t lda) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_x = x.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_y = y.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_a = a.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dsyr2>(cgh, [=]() {
-            ::cblas_dsyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, accessor_x.GET_MULTI_PTR, (const int)incx,
-                          accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR,
-                          (const int)lda);
-        });
-    });
-}
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_stbmv>(cgh, [=]() {
-            ::cblas_stbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtbmv>(cgh, [=]() {
-            ::cblas_dtbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctbmv>(cgh, [=]() {
-            ::cblas_ctbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztbmv>(cgh, [=]() {
-            ::cblas_ztbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_stbsv>(cgh, [=]() {
-            ::cblas_stbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtbsv>(cgh, [=]() {
-            ::cblas_dtbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctbsv>(cgh, [=]() {
-            ::cblas_ctbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          int64_t k, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztbsv>(cgh, [=]() {
-            ::cblas_ztbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR,
-                          (const int)incx);
-        });
-    });
-}
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &ap, sycl::buffer<float, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_stpmv>(cgh, [=]() {
-            ::cblas_stpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &ap, sycl::buffer<double, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtpmv>(cgh, [=]() {
-            ::cblas_dtpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &ap, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctpmv>(cgh, [=]() {
-            ::cblas_ctpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &ap, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztpmv>(cgh, [=]() {
-            ::cblas_ztpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &ap, sycl::buffer<float, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_stpsv>(cgh, [=]() {
-            ::cblas_stpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &ap, sycl::buffer<double, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtpsv>(cgh, [=]() {
-            ::cblas_dtpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &ap, sycl::buffer<std::complex<float>, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctpsv>(cgh, [=]() {
-            ::cblas_ctpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &ap, sycl::buffer<std::complex<double>, 1> &x,
-          int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_ap = ap.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztpsv>(cgh, [=]() {
-            ::cblas_ztpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_ap.GET_MULTI_PTR,
-                          accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_strmv>(cgh, [=]() {
-            ::cblas_strmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtrmv>(cgh, [=]() {
-            ::cblas_dtrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctrmv>(cgh, [=]() {
-            ::cblas_ctrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztrmv>(cgh, [=]() {
-            ::cblas_ztrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_b.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_strsv>(cgh, [=]() {
-            ::cblas_strsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtrsv>(cgh, [=]() {
-            ::cblas_dtrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctrsv>(cgh, [=]() {
-            ::cblas_ctrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, int64_t incx) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_x = x.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztrsv>(cgh, [=]() {
-            ::cblas_ztrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, accessor_x.GET_MULTI_PTR, (const int)incx);
-        });
-    });
-}
-
-// USM APIs
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta,
-                 float *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sgbmv_usm>(cgh, [=]() {
-            ::cblas_sgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const float)alpha, a, (const int)lda, x,
-                          (const int)incx, (const float)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 double alpha, const double *a, int64_t lda, const double *x, int64_t incx,
-                 double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dgbmv_usm>(cgh, [=]() {
-            ::cblas_dgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const double)alpha, a, (const int)lda, x,
-                          (const int)incx, (const double)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cgbmv_usm>(cgh, [=]() {
-            ::cblas_cgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const void *)&alpha, a, (const int)lda, x,
-                          (const int)incx, (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zgbmv_usm>(cgh, [=]() {
-            ::cblas_zgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const int)kl, (const int)ku, (const void *)&alpha, a, (const int)lda, x,
-                          (const int)incx, (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                 const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sgemv_usm>(cgh, [=]() {
-            ::cblas_sgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const float)alpha, a, (const int)lda, x, (const int)incx,
-                          (const float)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                 const double *a, int64_t lda, const double *x, int64_t incx, double beta,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dgemv_usm>(cgh, [=]() {
-            ::cblas_dgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const double)alpha, a, (const int)lda, x, (const int)incx,
-                          (const double)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cgemv_usm>(cgh, [=]() {
-            ::cblas_cgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, x, (const int)incx,
-                          (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zgemv_usm>(cgh, [=]() {
-            ::cblas_zgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, x, (const int)incx,
-                          (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, const float *x, int64_t incx,
-                const float *y, int64_t incy, float *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sger_usm>(cgh, [=]() {
-            ::cblas_sger(MAJOR, (const int)m, (const int)n, (const float)alpha, x, (const int)incx,
-                         y, (const int)incy, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, const double *x,
-                int64_t incx, const double *y, int64_t incy, double *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dger_usm>(cgh, [=]() {
-            ::cblas_dger(MAJOR, (const int)m, (const int)n, (const double)alpha, x, (const int)incx,
-                         y, (const int)incy, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cgerc_usm>(cgh, [=]() {
-            ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha, x,
-                          (const int)incx, y, (const int)incy, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zgerc_usm>(cgh, [=]() {
-            ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha, x,
-                          (const int)incx, y, (const int)incy, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cgeru_usm>(cgh, [=]() {
-            ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void *)&alpha, x,
-                          (const int)incx, y, (const int)incy, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zgeru_usm>(cgh, [=]() {
-            ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void *)&alpha, x,
-                          (const int)incx, y, (const int)incy, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_chbmv_usm>(cgh, [=]() {
-            ::cblas_chbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const void *)&alpha, a, (const int)lda, x, (const int)incx,
-                          (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zhbmv_usm>(cgh, [=]() {
-            ::cblas_zhbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const void *)&alpha, a, (const int)lda, x, (const int)incx,
-                          (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, const std::complex<float> *x,
-                 int64_t incx, std::complex<float> beta, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_chemv_usm>(cgh, [=]() {
-            ::cblas_chemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, a, (const int)lda, x, (const int)incx,
-                          (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, const std::complex<double> *x,
-                 int64_t incx, std::complex<double> beta, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zhemv_usm>(cgh, [=]() {
-            ::cblas_zhemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, a, (const int)lda, x, (const int)incx,
-                          (const void *)&beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-                const std::complex<float> *x, int64_t incx, std::complex<float> *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cher_usm>(cgh, [=]() {
-            ::cblas_cher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, x, (const int)incx, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-                const std::complex<double> *x, int64_t incx, std::complex<double> *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zher_usm>(cgh, [=]() {
-            ::cblas_zher(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, x, (const int)incx, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cher2_usm>(cgh, [=]() {
-            ::cblas_cher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, x, (const int)incx, y, (const int)incy, a,
-                          (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zher2_usm>(cgh, [=]() {
-            ::cblas_zher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, x, (const int)incx, y, (const int)incy, a,
-                          (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *ap, const std::complex<float> *x, int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_chpmv_usm>(cgh, [=]() {
-            ::cblas_chpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, ap, x, (const int)incx, (const void *)&beta, y,
-                          (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *ap, const std::complex<double> *x, int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zhpmv_usm>(cgh, [=]() {
-            ::cblas_zhpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, ap, x, (const int)incx, (const void *)&beta, y,
-                          (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha,
-                const std::complex<float> *x, int64_t incx, std::complex<float> *ap,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_chpr_usm>(cgh, [=]() {
-            ::cblas_chpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, x, (const int)incx, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha,
-                const std::complex<double> *x, int64_t incx, std::complex<double> *ap,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zhpr_usm>(cgh, [=]() {
-            ::cblas_zhpr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, x, (const int)incx, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *x, int64_t incx, const std::complex<float> *y,
-                 int64_t incy, std::complex<float> *ap,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_chpr2_usm>(cgh, [=]() {
-            ::cblas_chpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, x, (const int)incx, y, (const int)incy, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *x, int64_t incx, const std::complex<double> *y,
-                 int64_t incy, std::complex<double> *ap,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zhpr2_usm>(cgh, [=]() {
-            ::cblas_zhpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const void *)&alpha, x, (const int)incx, y, (const int)incy, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha,
-                 const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y,
-                 int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ssbmv_usm>(cgh, [=]() {
-            ::cblas_ssbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const float)alpha, a, (const int)lda, x, (const int)incx,
-                          (const float)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha,
-                 const double *a, int64_t lda, const double *x, int64_t incx, double beta,
-                 double *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsbmv_usm>(cgh, [=]() {
-            ::cblas_dsbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k,
-                          (const double)alpha, a, (const int)lda, x, (const int)incx,
-                          (const double)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *ap,
-                 const float *x, int64_t incx, float beta, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sspmv_usm>(cgh, [=]() {
-            ::cblas_sspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, ap, x, (const int)incx, (const float)beta, y,
-                          (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *ap,
-                 const double *x, int64_t incx, double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dspmv_usm>(cgh, [=]() {
-            ::cblas_dspmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, ap, x, (const int)incx, (const double)beta, y,
-                          (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                int64_t incx, float *ap, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sspr_usm>(cgh, [=]() {
-            ::cblas_sspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, x, (const int)incx, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                int64_t incx, double *ap, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dspr_usm>(cgh, [=]() {
-            ::cblas_dspr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, x, (const int)incx, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                 int64_t incx, const float *y, int64_t incy, float *ap,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sspr2_usm>(cgh, [=]() {
-            ::cblas_sspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, x, (const int)incx, y, (const int)incy, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                 int64_t incx, const double *y, int64_t incy, double *ap,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dspr2_usm>(cgh, [=]() {
-            ::cblas_dspr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, x, (const int)incx, y, (const int)incy, ap);
-        });
-    });
-    return done;
-}
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *a,
-                 int64_t lda, const float *x, int64_t incx, float beta, float *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ssymv_usm>(cgh, [=]() {
-            ::cblas_ssymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, a, (const int)lda, x, (const int)incx,
-                          (const float)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *a,
-                 int64_t lda, const double *x, int64_t incx, double beta, double *y, int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsymv_usm>(cgh, [=]() {
-            ::cblas_dsymv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, a, (const int)lda, x, (const int)incx,
-                          (const double)beta, y, (const int)incy);
-        });
-    });
-    return done;
-}
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                int64_t incx, float *a, int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ssyr_usm>(cgh, [=]() {
-            ::cblas_ssyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const float)alpha, x, (const int)incx, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                int64_t incx, double *a, int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsyr_usm>(cgh, [=]() {
-            ::cblas_dsyr(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                         (const double)alpha, x, (const int)incx, a, (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x,
-                 int64_t incx, const float *y, int64_t incy, float *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ssyr2_usm>(cgh, [=]() {
-            ::cblas_ssyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const float)alpha, x, (const int)incx, y, (const int)incy, a,
-                          (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x,
-                 int64_t incx, const double *y, int64_t incy, double *a, int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsyr2_usm>(cgh, [=]() {
-            ::cblas_dsyr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n,
-                          (const double)alpha, x, (const int)incx, y, (const int)incy, a,
-                          (const int)lda);
-        });
-    });
-    return done;
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_stbmv_usm>(cgh, [=]() {
-            ::cblas_stbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtbmv_usm>(cgh, [=]() {
-            ::cblas_dtbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<float> *a, int64_t lda, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctbmv_usm>(cgh, [=]() {
-            ::cblas_ctbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<double> *a, int64_t lda, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztbmv_usm>(cgh, [=]() {
-            ::cblas_ztbmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_stbsv_usm>(cgh, [=]() {
-            ::cblas_stbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtbsv_usm>(cgh, [=]() {
-            ::cblas_dtbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<float> *a, int64_t lda, std::complex<float> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctbsv_usm>(cgh, [=]() {
-            ::cblas_ctbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 int64_t k, const std::complex<double> *a, int64_t lda, std::complex<double> *x,
-                 int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztbsv_usm>(cgh, [=]() {
-            ::cblas_ztbsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, (const int)k, a,
-                          (const int)lda, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const float *ap, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_stpmv_usm>(cgh, [=]() {
-            ::cblas_stpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const double *ap, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtpmv_usm>(cgh, [=]() {
-            ::cblas_dtpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<float> *ap, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctpmv_usm>(cgh, [=]() {
-            ::cblas_ctpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<double> *ap, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztpmv_usm>(cgh, [=]() {
-            ::cblas_ztpmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const float *ap, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_stpsv_usm>(cgh, [=]() {
-            ::cblas_stpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const double *ap, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtpsv_usm>(cgh, [=]() {
-            ::cblas_dtpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<float> *ap, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctpsv_usm>(cgh, [=]() {
-            ::cblas_ctpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<double> *ap, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztpsv_usm>(cgh, [=]() {
-            ::cblas_ztpsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, ap, x, (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-                 const float *a, int64_t lda, float *b, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_strmv_usm>(cgh, [=]() {
-            ::cblas_strmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-                 const double *a, int64_t lda, double *b, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtrmv_usm>(cgh, [=]() {
-            ::cblas_dtrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctrmv_usm>(cgh, [=]() {
-            ::cblas_ctrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztrmv_usm>(cgh, [=]() {
-            ::cblas_ztrmv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, b,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const float *a, int64_t lda, float *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_strsv_usm>(cgh, [=]() {
-            ::cblas_strsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const double *a, int64_t lda, double *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtrsv_usm>(cgh, [=]() {
-            ::cblas_dtrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctrsv_usm>(cgh, [=]() {
-            ::cblas_ctrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
-
-sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *x, int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztrsv_usm>(cgh, [=]() {
-            ::cblas_ztrsv(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          convert_to_cblas_diag(unit_diag), (const int)n, a, (const int)lda, x,
-                          (const int)incx);
-        });
-    });
-    return done;
-}
diff --git a/src/blas/backends/netlib/netlib_level3.cpp b/src/blas/backends/netlib/netlib_level3.cpp
deleted file mode 100644
index c41f78205..000000000
--- a/src/blas/backends/netlib/netlib_level3.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "netlib_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace netlib {
-namespace column_major {
-
-#define MAJOR CblasColMajor
-#define COLUMN_MAJOR
-#include "netlib_level3.cxx"
-#undef MAJOR
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define MAJOR CblasRowMajor
-#define ROW_MAJOR
-#include "netlib_level3.cxx"
-#undef MAJOR
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace netlib
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/netlib/netlib_level3.cxx b/src/blas/backends/netlib/netlib_level3.cxx
deleted file mode 100644
index 8bb6a04ae..000000000
--- a/src/blas/backends/netlib/netlib_level3.cxx
+++ /dev/null
@@ -1,1148 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_sgemm>(cgh, [=]() {
-            ::cblas_sgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const float)alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                          (const int)ldb, (const float)beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b,
-          int64_t ldb, double beta, sycl::buffer<double, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dgemm>(cgh, [=]() {
-            ::cblas_dgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const double)alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                          (const int)ldb, (const double)beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cgemm>(cgh, [=]() {
-            ::cblas_cgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const void *)&alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                          (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zgemm>(cgh, [=]() {
-            ::cblas_zgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const void *)&alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                          (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          sycl::half alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda,
-          sycl::buffer<sycl::half, 1> &b, int64_t ldb, sycl::half beta,
-          sycl::buffer<sycl::half, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm", "for row_major layout");
-#endif
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<sycl::half, 1> &a, int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm", "for row_major layout");
-#endif
-}
-
-void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k,
-          float alpha, sycl::buffer<bfloat16, 1> &a, int64_t lda, sycl::buffer<bfloat16, 1> &b,
-          int64_t ldb, float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm", "for row_major layout");
-#endif
-}
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_chemm>(cgh, [=]() {
-            ::cblas_chemm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zhemm>(cgh, [=]() {
-            ::cblas_zhemm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha,
-          sycl::buffer<std::complex<float>, 1> &a, int64_t lda, float beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cherk>(cgh, [=]() {
-            ::cblas_cherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const float)alpha, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, (const float)beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda, double beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zherk>(cgh, [=]() {
-            ::cblas_zherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const double)alpha, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, (const double)beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_cher2k>(cgh, [=]() {
-            ::cblas_cher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha,
-                           accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                           (const int)ldb, (const float)beta, accessor_c.GET_MULTI_PTR,
-                           (const int)ldc);
-        });
-    });
-}
-
-void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zher2k>(cgh, [=]() {
-            ::cblas_zher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha,
-                           accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                           (const int)ldb, (const double)beta, accessor_c.GET_MULTI_PTR,
-                           (const int)ldc);
-        });
-    });
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb,
-          float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ssymm>(cgh, [=]() {
-            ::cblas_ssymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb, (const float)beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b, int64_t ldb,
-          double beta, sycl::buffer<double, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dsymm>(cgh, [=]() {
-            ::cblas_dsymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb, (const double)beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_csymm>(cgh, [=]() {
-            ::cblas_csymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zsymm>(cgh, [=]() {
-            ::cblas_zsymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha,
-          sycl::buffer<float, 1> &a, int64_t lda, float beta, sycl::buffer<float, 1> &c,
-          int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ssyrk>(cgh, [=]() {
-            ::cblas_ssyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const float)alpha, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, (const float)beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha,
-          sycl::buffer<double, 1> &a, int64_t lda, double beta, sycl::buffer<double, 1> &c,
-          int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dsyrk>(cgh, [=]() {
-            ::cblas_dsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const double)alpha, accessor_a.GET_MULTI_PTR,
-                          (const int)lda, (const double)beta, accessor_c.GET_MULTI_PTR,
-                          (const int)ldc);
-        });
-    });
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_csyrk>(cgh, [=]() {
-            ::cblas_csyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const void *)&alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, (const void *)&beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zsyrk>(cgh, [=]() {
-            ::cblas_zsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const void *)&alpha,
-                          accessor_a.GET_MULTI_PTR, (const int)lda, (const void *)&beta,
-                          accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha,
-           sycl::buffer<float, 1> &a, int64_t lda, sycl::buffer<float, 1> &b, int64_t ldb,
-           float beta, sycl::buffer<float, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ssyr2k>(cgh, [=]() {
-            ::cblas_ssyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const float)alpha, accessor_a.GET_MULTI_PTR,
-                           (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb,
-                           (const float)beta, accessor_c.GET_MULTI_PTR, (const int)ldc);
-        });
-    });
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           double alpha, sycl::buffer<double, 1> &a, int64_t lda, sycl::buffer<double, 1> &b,
-           int64_t ldb, double beta, sycl::buffer<double, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dsyr2k>(cgh, [=]() {
-            ::cblas_dsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const double)alpha,
-                           accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                           (const int)ldb, (const double)beta, accessor_c.GET_MULTI_PTR,
-                           (const int)ldc);
-        });
-    });
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_csyr2k>(cgh, [=]() {
-            ::cblas_csyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha,
-                           accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                           (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR,
-                           (const int)ldc);
-        });
-    });
-}
-
-void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_c = c.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_zsyr2k>(cgh, [=]() {
-            ::cblas_zsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha,
-                           accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR,
-                           (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR,
-                           (const int)ldc);
-        });
-    });
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_strmm>(cgh, [=]() {
-            ::cblas_strmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtrmm>(cgh, [=]() {
-            ::cblas_dtrmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctrmm>(cgh, [=]() {
-            ::cblas_ctrmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztrmm>(cgh, [=]() {
-            ::cblas_ztrmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-          sycl::buffer<float, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_strsm>(cgh, [=]() {
-            ::cblas_strsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const float)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-          sycl::buffer<double, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_dtrsm>(cgh, [=]() {
-            ::cblas_dtrsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const double)alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ctrsm>(cgh, [=]() {
-            ::cblas_ctrsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag,
-          int64_t m, int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, int64_t ldb) {
-    queue.submit([&](sycl::handler &cgh) {
-        auto accessor_a = a.get_access<sycl::access::mode::read>(cgh);
-        auto accessor_b = b.get_access<sycl::access::mode::read_write>(cgh);
-        host_task<class netlib_ztrsm>(cgh, [=]() {
-            ::cblas_ztrsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda,
-                          accessor_b.GET_MULTI_PTR, (const int)ldb);
-        });
-    });
-}
-
-// USM APIs
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const float *a, int64_t lda, const float *b, int64_t ldb,
-                 float beta, float *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_sgemm_usm>(cgh, [=]() {
-            ::cblas_sgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const float)alpha, a,
-                          (const int)lda, b, (const int)ldb, (const float)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                 int64_t ldb, double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dgemm_usm>(cgh, [=]() {
-            ::cblas_dgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const double)alpha, a,
-                          (const int)lda, b, (const int)ldb, (const double)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cgemm_usm>(cgh, [=]() {
-            ::cblas_cgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const void *)&alpha, a,
-                          (const int)lda, b, (const int)ldb, (const void *)&beta, c,
-                          (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zgemm_usm>(cgh, [=]() {
-            ::cblas_zgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb),
-                          (const int)m, (const int)n, (const int)k, (const void *)&alpha, a,
-                          (const int)lda, b, (const int)ldb, (const void *)&beta, c,
-                          (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda, const sycl::half *b,
-                 int64_t ldb, sycl::half beta, sycl::half *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm", "for row_major layout");
-#endif
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const sycl::half *a, int64_t lda, const sycl::half *b,
-                 int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm", "for row_major layout");
-#endif
-}
-
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b,
-                 int64_t ldb, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-#ifdef COLUMN_MAJOR
-    throw unimplemented("blas", "gemm", "for column_major layout");
-#endif
-#ifdef ROW_MAJOR
-    throw unimplemented("blas", "gemm", "for row_major layout");
-#endif
-}
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_chemm_usm>(cgh, [=]() {
-            ::cblas_chemm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb,
-                          (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zhemm_usm>(cgh, [=]() {
-            ::cblas_zhemm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb,
-                          (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 float alpha, const std::complex<float> *a, int64_t lda, float beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cherk_usm>(cgh, [=]() {
-            ::cblas_cherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const float)alpha, a, (const int)lda,
-                          (const float)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 double alpha, const std::complex<double> *a, int64_t lda, double beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zherk_usm>(cgh, [=]() {
-            ::cblas_zherk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const double)alpha, a, (const int)lda,
-                          (const double)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                  const std::complex<float> *b, int64_t ldb, float beta, std::complex<float> *c,
-                  int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_cher2k_usm>(cgh, [=]() {
-            ::cblas_cher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, b,
-                           (const int)ldb, (const float)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                  const std::complex<double> *b, int64_t ldb, double beta, std::complex<double> *c,
-                  int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zher2k_usm>(cgh, [=]() {
-            ::cblas_zher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, b,
-                           (const int)ldb, (const double)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
-                 float *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ssymm_usm>(cgh, [=]() {
-            ::cblas_ssymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const float)alpha, a, (const int)lda, b, (const int)ldb,
-                          (const float)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 double alpha, const double *a, int64_t lda, const double *b, int64_t ldb,
-                 double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsymm_usm>(cgh, [=]() {
-            ::cblas_dsymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const double)alpha, a, (const int)lda, b, (const int)ldb,
-                          (const double)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_csymm_usm>(cgh, [=]() {
-            ::cblas_csymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb,
-                          (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zsymm_usm>(cgh, [=]() {
-            ::cblas_zsymm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb,
-                          (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 float alpha, const float *a, int64_t lda, float beta, float *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ssyrk_usm>(cgh, [=]() {
-            ::cblas_ssyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const float)alpha, a, (const int)lda,
-                          (const float)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 double alpha, const double *a, int64_t lda, double beta, double *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsyrk_usm>(cgh, [=]() {
-            ::cblas_dsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const double)alpha, a, (const int)lda,
-                          (const double)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 std::complex<float> beta, std::complex<float> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_csyrk_usm>(cgh, [=]() {
-            ::cblas_csyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda,
-                          (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zsyrk_usm>(cgh, [=]() {
-            ::cblas_zsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                          (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda,
-                          (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta,
-                  float *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ssyr2k_usm>(cgh, [=]() {
-            ::cblas_ssyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const float)alpha, a, (const int)lda, b,
-                           (const int)ldb, (const float)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  double alpha, const double *a, int64_t lda, const double *b, int64_t ldb,
-                  double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dsyr2k_usm>(cgh, [=]() {
-            ::cblas_dsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const double)alpha, a, (const int)lda, b,
-                           (const int)ldb, (const double)beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                  const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_csyr2k_usm>(cgh, [=]() {
-            ::cblas_csyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, b,
-                           (const int)ldb, (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,
-                  std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                  const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                  std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_zsyr2k_usm>(cgh, [=]() {
-            ::cblas_zsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-                           (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, b,
-                           (const int)ldb, (const void *)&beta, c, (const int)ldc);
-        });
-    });
-    return done;
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda,
-                 float *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_strmm_usm>(cgh, [=]() {
-            ::cblas_strmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const float)alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda,
-                 double *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtrmm_usm>(cgh, [=]() {
-            ::cblas_dtrmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const double)alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctrmm_usm>(cgh, [=]() {
-            ::cblas_ctrmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
-
-sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztrmm_usm>(cgh, [=]() {
-            ::cblas_ztrmm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda,
-                 float *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_strsm_usm>(cgh, [=]() {
-            ::cblas_strsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const float)alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda,
-                 double *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_dtrsm_usm>(cgh, [=]() {
-            ::cblas_dtrsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const double)alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ctrsm_usm>(cgh, [=]() {
-            ::cblas_ctrsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
-
-sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa,
-                 diag unit_diag, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        host_task<class netlib_ztrsm_usm>(cgh, [=]() {
-            ::cblas_ztrsm(MAJOR, convert_to_cblas_side(left_right),
-                          convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-                          convert_to_cblas_diag(unit_diag), (const int)m, (const int)n,
-                          (const void *)&alpha, a, (const int)lda, b, (const int)ldb);
-        });
-    });
-    return done;
-}
diff --git a/src/blas/backends/netlib/netlib_wrappers.cpp b/src/blas/backends/netlib/netlib_wrappers.cpp
deleted file mode 100644
index 1a377f647..000000000
--- a/src/blas/backends/netlib/netlib_wrappers.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "blas/function_table.hpp"
-#include "oneapi/mkl/blas/detail/netlib/onemkl_blas_netlib.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT blas_function_table_t mkl_blas_table = {
-    WRAPPER_VERSION,
-#define BACKEND netlib
-#define MAJOR   column_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#define MAJOR row_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#undef BACKEND
-};
diff --git a/src/blas/backends/portblas/CMakeLists.txt b/src/blas/backends/portblas/CMakeLists.txt
deleted file mode 100644
index 03fddbb38..000000000
--- a/src/blas/backends/portblas/CMakeLists.txt
+++ /dev/null
@@ -1,222 +0,0 @@
-#==========================================================================
-#  Copyright (C) Codeplay Software Limited
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  For your convenience, a copy of the License has been included in this
-#  repository.
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-#=========================================================================
-
-set(LIB_NAME onemkl_blas_portblas)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-if(NOT DEFINED PORTBLAS_TUNING_TARGET)
-  option(PORTBLAS_TUNING_TARGET "Set a TUNING_TARGET for portBLAS" "")
-endif()
-
-# Parse compiler flags and return a list of SYCL targets
-# The list is empty if no targets are set
-function(get_sycl_targets FLAGS)
-  string(REGEX MATCH "-fsycl-targets=[^ ]*" SYCL_TARGETS_FLAG "${FLAGS}")
-  string(REPLACE "-fsycl-targets=" "" SYCL_TARGETS "${SYCL_TARGETS_FLAG}")
-  string(REPLACE "," ";" SYCL_TARGETS "${SYCL_TARGETS}")
-  set(SYCL_TARGETS ${SYCL_TARGETS} PARENT_SCOPE)
-endfunction(get_sycl_targets)
-
-# portBLAS supports tuning for some device types, but can only be compiled
-# for one at a time currently. Work out which device to tune for based on the
-# DPC++ target triple specified via -fsycl-targets
-if(TARGET ONEMKL::SYCL::SYCL)
-  get_target_property(ONEMKL_COMPILE_OPTIONS ONEMKL::SYCL::SYCL INTERFACE_COMPILE_OPTIONS)
-endif()
-get_sycl_targets("${ONEMKL_COMPILE_OPTIONS}")
-list(LENGTH SYCL_TARGETS NUM_TARGETS)
-if(NUM_TARGETS EQUAL 0)
-  get_sycl_targets("${CMAKE_CXX_FLAGS}")
-  list(LENGTH SYCL_TARGETS NUM_TARGETS)
-endif()
-
-if(PORTBLAS_TUNING_TARGET)
-  # Allow the user to manually enable a specific device type 
-  # for tuned portBLAS configurations and sets sycl-target.
-  if(PORTBLAS_TUNING_TARGET STREQUAL "INTEL_CPU")
-    set(ENABLE_PORTBLAS_BACKEND_INTEL_CPU "ON" CACHE INTERNAL "")
-    set(PORTBLAS_TUNING_TARGET "")
-    target_compile_options(ONEMKL::SYCL::SYCL INTERFACE
-      -fsycl-targets=spir64_x86_64 -fsycl-unnamed-lambda)
-    target_link_options(ONEMKL::SYCL::SYCL INTERFACE
-      -fsycl-targets=spir64_x86_64)
-  elseif(PORTBLAS_TUNING_TARGET STREQUAL "INTEL_GPU")
-    set(ENABLE_PORTBLAS_BACKEND_INTEL_GPU "ON" CACHE INTERNAL "")
-  elseif(PORTBLAS_TUNING_TARGET STREQUAL "AMD_GPU")
-    set(ENABLE_PORTBLAS_BACKEND_AMD_GPU "ON" CACHE INTERNAL "")
-    if (is_dpcpp)
-      target_compile_options(ONEMKL::SYCL::SYCL INTERFACE
-        -fsycl-targets=amdgcn-amd-amdhsa -fsycl-unnamed-lambda
-        -Xsycl-target-backend --offload-arch=${HIP_TARGETS})
-      target_link_options(ONEMKL::SYCL::SYCL INTERFACE
-        -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${HIP_TARGETS})
-    else()
-      message(WARNING "Compiler is not supported."
-      " Unable to automatically set the required flags for the target '${PORTBLAS_TUNING_TARGET}'."
-      " Compilation may fail.")
-    endif()
-  elseif(PORTBLAS_TUNING_TARGET STREQUAL "NVIDIA_GPU")
-    set(ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU "ON" CACHE INTERNAL "")
-    if (is_dpcpp)
-      target_compile_options(ONEMKL::SYCL::SYCL INTERFACE
-        -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda)
-      target_link_options(ONEMKL::SYCL::SYCL INTERFACE
-        -fsycl-targets=nvptx64-nvidia-cuda)
-      if(DEFINED CUDA_TARGET)
-        target_compile_options(ONEMKL::SYCL::SYCL INTERFACE
-          -Xsycl-target-backend --cuda-gpu-arch=${CUDA_TARGET})
-        target_link_options(ONEMKL::SYCL::SYCL INTERFACE
-          -Xsycl-target-backend --cuda-gpu-arch=${CUDA_TARGET})
-      endif()
-    else()
-      message(WARNING "Compiler is not supported."
-      " Unable to automatically set the required flags for the target '${PORTBLAS_TUNING_TARGET}'."
-      " Compilation may fail.")
-    endif()
-  else()
-    message(FATAL_ERROR "Unsupported PORTBLAS_TUNING_TARGET: '${PORTBLAS_TUNING_TARGET}'")
-  endif()
-elseif(NUM_TARGETS EQUAL 0)
-  # Enable portBLAS backend for all devices types
-  set(ENABLE_PORTBLAS_BACKEND_INTEL_CPU "ON" CACHE INTERNAL "")
-  set(ENABLE_PORTBLAS_BACKEND_INTEL_GPU "ON" CACHE INTERNAL "")
-  set(ENABLE_PORTBLAS_BACKEND_AMD_GPU "ON" CACHE INTERNAL "")
-  set(ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU "ON" CACHE INTERNAL "")
-else()
-  # Try to automatically detect the PORTBLAS_TUNING_TARGET
-  foreach(SYCL_TARGET IN LISTS SYCL_TARGETS)
-    if(SYCL_TARGETS MATCHES "^intel_gpu" OR SYCL_TARGETS MATCHES "^spir64_gen")
-      set(ENABLE_PORTBLAS_BACKEND_INTEL_GPU "ON" CACHE INTERNAL "")
-      set(PORTBLAS_TUNING_TARGET "INTEL_GPU")
-    elseif(SYCL_TARGETS MATCHES "^spir64_x86_64")
-      set(ENABLE_PORTBLAS_BACKEND_INTEL_CPU "ON" CACHE INTERNAL "")
-    elseif(SYCL_TARGETS MATCHES "^spir64")
-      set(ENABLE_PORTBLAS_BACKEND_INTEL_CPU "ON" CACHE INTERNAL "")
-      set(ENABLE_PORTBLAS_BACKEND_INTEL_GPU "ON" CACHE INTERNAL "")
-      set(PORTBLAS_TUNING_TARGET "INTEL_GPU")
-    elseif(SYCL_TARGETS MATCHES "^amd_gpu" OR SYCL_TARGETS MATCHES "-amd-")
-      set(ENABLE_PORTBLAS_BACKEND_AMD_GPU "ON" CACHE INTERNAL "")
-      set(PORTBLAS_TUNING_TARGET "AMD_GPU")
-    elseif(SYCL_TARGETS MATCHES "^nvidia_gpu" OR SYCL_TARGETS MATCHES "-nvidia-")
-      set(ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU "ON" CACHE INTERNAL "")
-      set(PORTBLAS_TUNING_TARGET "NVIDIA_GPU")
-    endif()
-  endforeach()
-  # Currently portBLAS can only be tuned for one type of device.
-  if(NUM_TARGETS GREATER 1)
-    set(PORTBLAS_TUNING_TARGET "")
-  endif()
-endif()
-
-if(PORTBLAS_TUNING_TARGET STREQUAL "INTEL_GPU")
-  message(STATUS "Tuning portBLAS for Intel GPU devices")
-elseif(PORTBLAS_TUNING_TARGET STREQUAL "AMD_GPU")
-  message(STATUS "Tuning portBLAS for AMD GPU devices")
-elseif(PORTBLAS_TUNING_TARGET STREQUAL "NVIDIA_GPU")
-  message(STATUS "Tuning portBLAS for Nvidia GPU devices")
-else()
-  message(STATUS "portBLAS is not tuned for any device which can impact performance")
-endif()
-
-# If find_package doesn't work, download portBLAS from Github. This is
-# intended to make OneMKL easier to use.
-message(STATUS "Looking for portBLAS")
-find_package(PORTBLAS QUIET)
-if (NOT PORTBLAS_FOUND)
-  message(STATUS "Looking for portBLAS - could not find portBLAS with PORTBLAS_DIR")
-  include(FetchContent)
-  set(INSTALL_HEADER_ONLY ON)
-  set(BLAS_BUILD_SAMPLES OFF)
-  set(BLAS_ENABLE_BENCHMARK OFF)
-  set(BLAS_ENABLE_TESTING OFF)
-  set(ENABLE_EXPRESSION_TESTS OFF)
-  if(NOT PORTBLAS_TUNING_TARGET)
-    set(PORTBLAS_TUNING_TARGET "DEFAULT")
-  endif()
-  # Following variable TUNING_TARGET will be used in portBLAS internal configuration
-  set(TUNING_TARGET ${PORTBLAS_TUNING_TARGET})
-  set(BLAS_ENABLE_COMPLEX ON)
-  # Set the policy to forward variables to portBLAS configure step
-  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-  set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/deps")
-  FetchContent_Declare(
-    portBLAS
-    GIT_REPOSITORY https://github.com/codeplaysoftware/portBLAS
-    GIT_TAG        main
-  )
-  FetchContent_MakeAvailable(portblas)
-  message(STATUS "Looking for portBLAS - downloaded")
-
-else()
-  message(STATUS "Looking for portBLAS - found")
-  add_library(portblas ALIAS PORTBLAS::portblas)
-endif()
-
-set(SOURCES
-  portblas_level1_double.cpp portblas_level1_float.cpp
-  portblas_level2_double.cpp portblas_level2_float.cpp
-  portblas_level3_double.cpp portblas_level3_float.cpp
-  portblas_level3_half.cpp portblas_level3_bfloat16.cpp
-  portblas_batch.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: portblas_wrappers.cpp>)
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_blas ${LIB_NAME})
-
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL portblas)
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON)
-
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/blas/backends/portblas/portblas_batch.cpp b/src/blas/backends/portblas/portblas_batch.cpp
deleted file mode 100644
index 65f0cd59e..000000000
--- a/src/blas/backends/portblas/portblas_batch.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "portblas_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-namespace column_major {
-
-#define COLUMN_MAJOR
-constexpr bool is_column_major() {
-    return true;
-}
-#include "portblas_batch.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-constexpr bool is_column_major() {
-    return false;
-}
-#include "portblas_batch.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_batch.cxx b/src/blas/backends/portblas/portblas_batch.cxx
deleted file mode 100644
index 28c7ee5dc..000000000
--- a/src/blas/backends/portblas/portblas_batch.cxx
+++ /dev/null
@@ -1,1017 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "");
-}
-
-void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "");
-}
-
-void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "");
-}
-
-void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "syrk_batch", "");
-}
-
-void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex, float beta,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "");
-}
-
-void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex, double beta,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "");
-}
-
-void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "");
-}
-
-void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "gemv_batch", "");
-}
-
-void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "");
-}
-
-void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "");
-}
-
-void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "");
-}
-
-void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "dgmm_batch", "");
-}
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<float, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                     batch_size);
-}
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<double, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                     batch_size);
-}
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "");
-}
-
-void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "axpy_batch", "");
-}
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "");
-}
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<double, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "");
-}
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "");
-}
-
-void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    throw unimplemented("blas", "copy_batch", "");
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a, lda,
-                     stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a, lda,
-                     stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size);
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "gemm_batch", " for complex");
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "gemm_batch", " for complex");
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "gemm_batch", " for complex");
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "gemm_batch", " for unsupported dtype");
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "gemm_batch", " for unsupported dtype");
-}
-
-void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "gemm_batch", " for unsupported dtype");
-}
-
-void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "");
-}
-
-void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "");
-}
-
-void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "");
-}
-
-void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    throw unimplemented("blas", "trsm_batch", "");
-}
-
-void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b, ldb,
-                     stride_b, batch_size);
-}
-
-void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b, ldb,
-                     stride_b, batch_size);
-}
-
-void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "");
-}
-
-void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("blas", "omatcopy_batch", "");
-}
-
-void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "");
-}
-
-void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "");
-}
-
-void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "");
-}
-
-void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "");
-}
-
-void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                   std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<float, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda, stride_a,
-                     beta, b, ldb, stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                   std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                   std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<double, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    CALL_PORTBLAS_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda, stride_a,
-                     beta, b, ldb, stride_b, c, ldc, stride_c, batch_size);
-}
-
-void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "");
-}
-
-void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                   std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    throw unimplemented("blas", "omatadd_batch", "");
-}
-
-// USM APIs
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                       oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const float **a, std::int64_t *lda, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                       oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k,
-                       double *alpha, const double **a, std::int64_t *lda, double *beta, double **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                       oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower,
-                       oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                       oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha,
-                       const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                       oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                       const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                       oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                       std::int64_t stride_a, std::complex<float> beta, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                       oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                       std::int64_t stride_a, std::complex<double> beta, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                       std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                       std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float beta, float *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                       std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                       std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double beta, double *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                       std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                       std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                       std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<double> beta,
-                       std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m,
-                       std::int64_t *n, float *alpha, const float **a, std::int64_t *lda,
-                       const float **x, std::int64_t *incx, float *beta, float **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m,
-                       std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                       const double **x, std::int64_t *incx, double *beta, double **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m,
-                       std::int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-                       std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                       std::complex<float> *beta, std::complex<float> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m,
-                       std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                       std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                       std::int64_t n, const float *a, std::int64_t lda, std::int64_t stridea,
-                       const float *x, std::int64_t incx, std::int64_t stridex, float *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                       std::int64_t n, const double *a, std::int64_t lda, std::int64_t stridea,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                       std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                       std::int64_t stridea, const std::complex<float> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<float> *c, std::int64_t ldc,
-                       std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m,
-                       std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                       std::int64_t stridea, const std::complex<double> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<double> *c, std::int64_t ldc,
-                       std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m,
-                       std::int64_t *n, const float **a, std::int64_t *lda, const float **x,
-                       std::int64_t *incx, float **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m,
-                       std::int64_t *n, const double **a, std::int64_t *lda, const double **x,
-                       std::int64_t *incx, double **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m,
-                       std::int64_t *n, const std::complex<float> **a, std::int64_t *lda,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m,
-                       std::int64_t *n, const std::complex<double> **a, std::int64_t *lda,
-                       const std::complex<double> **x, std::int64_t *incx, std::complex<double> **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dgmm_batch", " for USM");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha, const float **x,
-                       std::int64_t *incx, float **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", " for USM");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha, const double **x,
-                       std::int64_t *incx, double **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", " for USM");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", " for USM");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", " for USM");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, const float *x,
-                       std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                         batch_size, dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, const double *x,
-                       std::int64_t incx, std::int64_t stridex, double *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                         batch_size, dependencies);
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", " for USM");
-}
-
-sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const float **x, std::int64_t *incx,
-                       float **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x, std::int64_t *incx,
-                       double **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex<float> **x,
-                       std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex<double> **x,
-                       std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex<float> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<float> *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<double> *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, float *alpha, const float **a, std::int64_t *lda,
-                       const float **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, double *alpha, const double **a, std::int64_t *lda,
-                       const double **b, std::int64_t *ldb, double *beta, double **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-                       std::int64_t *lda, const std::complex<float> **b, std::int64_t *ldb,
-                       std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                       std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                       const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, float *alpha, const sycl::half **a, std::int64_t *lda,
-                       const sycl::half **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa,
-                       oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n,
-                       std::int64_t *k, float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                       std::int64_t stride_a, const float *b, std::int64_t ldb,
-                       std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a,
-                         lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size,
-                         dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                       std::int64_t stride_a, const double *b, std::int64_t ldb,
-                       std::int64_t stride_b, double beta, double *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a,
-                         lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size,
-                         dependencies);
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stride_a, const std::complex<float> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::complex<float> beta,
-                       std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stride_a, const std::complex<double> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::complex<double> beta,
-                       std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                       std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                       std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, float alpha, const sycl::half *a, std::int64_t lda,
-                       std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                       std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb,
-                       std::int64_t stride_b, float beta, float *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                       oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                       std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb,
-                       std::int64_t stride_b, float beta, std::int32_t *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                       oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                       oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                       const float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                       oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                       oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                       const double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                       oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                       oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                       std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right,
-                       oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                       oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                       std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                       oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-                       oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n, float *alpha,
-                       const float **a, std::int64_t *lda, float **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                       oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-                       oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n,
-                       double *alpha, const double **a, std::int64_t *lda, double **b,
-                       std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                       oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-                       oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right,
-                       oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-                       oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm_batch", " for USM");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                           std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b,
-                         ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                           std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b,
-                         ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                           std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                           std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", " for USM");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", " for USM");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", " for USM");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, double alpha, double *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", " for USM");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, std::complex<float> alpha, std::complex<float> *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", " for USM");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                           std::int64_t n, std::complex<double> alpha, std::complex<double> *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", " for USM");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                          oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                          float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                          float beta, const float *b, std::int64_t ldb, std::int64_t stride_b,
-                          float *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda,
-                         stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size,
-                         dependencies);
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                          oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                          double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                          double beta, const double *b, std::int64_t ldb, std::int64_t stride_b,
-                          double *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda,
-                         stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size,
-                         dependencies);
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                          oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", " for USM");
-}
-
-sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa,
-                          oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd_batch", " for USM");
-}
diff --git a/src/blas/backends/portblas/portblas_common.hpp b/src/blas/backends/portblas/portblas_common.hpp
deleted file mode 100644
index 1624749e8..000000000
--- a/src/blas/backends/portblas/portblas_common.hpp
+++ /dev/null
@@ -1,239 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _PORTBLAS_COMMON_HPP_
-#define _PORTBLAS_COMMON_HPP_
-
-#include "portblas.hpp"
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-
-#include <tuple>
-#include <utility>
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-
-namespace detail {
-// portBLAS handle type. Constructed with sycl::queue.
-using handle_t = ::blas::SB_Handle;
-
-// portBLAS buffer iterator. Constructed with sycl::buffer<ElemT,1>
-template <typename ElemT>
-using buffer_iterator_t = ::blas::BufferIterator<ElemT>;
-
-// sycl complex data type (experimental)
-template <typename ElemT>
-using sycl_complex_t = sycl::ext::oneapi::experimental::complex<ElemT>;
-
-/** A trait for obtaining equivalent portBLAS API types from oneMKL API
- *  types.
- * 
- *  @tparam InputT is the oneMKL type.
- *  portblas_type<InputT>::type should be the equivalent portBLAS type.
-**/
-template <typename InputT>
-struct portblas_type;
-
-#define DEF_PORTBLAS_TYPE(onemkl_t, portblas_t) \
-    template <>                                 \
-    struct portblas_type<onemkl_t> {            \
-        using type = portblas_t;                \
-    };
-
-DEF_PORTBLAS_TYPE(sycl::queue, handle_t)
-DEF_PORTBLAS_TYPE(int64_t, int64_t)
-DEF_PORTBLAS_TYPE(sycl::half, sycl::half)
-DEF_PORTBLAS_TYPE(float, float)
-DEF_PORTBLAS_TYPE(double, double)
-DEF_PORTBLAS_TYPE(oneapi::mkl::transpose, char)
-DEF_PORTBLAS_TYPE(oneapi::mkl::uplo, char)
-DEF_PORTBLAS_TYPE(oneapi::mkl::side, char)
-DEF_PORTBLAS_TYPE(oneapi::mkl::diag, char)
-DEF_PORTBLAS_TYPE(std::complex<float>, sycl_complex_t<float>)
-DEF_PORTBLAS_TYPE(std::complex<double>, sycl_complex_t<double>)
-// Passthrough of portBLAS arg types for more complex wrapping.
-DEF_PORTBLAS_TYPE(::blas::gemm_batch_type_t, ::blas::gemm_batch_type_t)
-
-#undef DEF_PORTBLAS_TYPE
-
-template <typename ElemT>
-struct portblas_type<sycl::buffer<ElemT, 1>> {
-    using type = buffer_iterator_t<ElemT>;
-};
-
-template <typename ElemT>
-struct portblas_type<ElemT*> {
-    using type = ElemT*;
-};
-
-// USM Complex
-template <typename ElemT>
-struct portblas_type<std::complex<ElemT>*> {
-    using type = sycl_complex_t<ElemT>*;
-};
-
-template <typename ElemT>
-struct portblas_type<const std::complex<ElemT>*> {
-    using type = const sycl_complex_t<ElemT>*;
-};
-
-template <>
-struct portblas_type<std::vector<sycl::event>> {
-    using type = std::vector<sycl::event>;
-};
-
-/** Convert a OneMKL argument to the type required for portBLAS.
- *  
- *  @tparam InputT The OneMKL type.
- *  @param input The value of the oneMKL type.
- *  @return The portBLAS value with appropriate type.
-**/
-template <typename InputT>
-inline typename portblas_type<InputT>::type convert_to_portblas_type(InputT& input) {
-    return typename portblas_type<InputT>::type(input);
-}
-
-template <>
-inline char convert_to_portblas_type<oneapi::mkl::transpose>(oneapi::mkl::transpose& trans) {
-    if (trans == oneapi::mkl::transpose::nontrans) {
-        return 'n';
-    }
-    else if (trans == oneapi::mkl::transpose::trans) {
-        return 't';
-    }
-    else { // trans == oneapi::mkl::transpose::conjtrans
-        return 'c';
-    }
-}
-
-template <>
-inline char convert_to_portblas_type<oneapi::mkl::uplo>(oneapi::mkl::uplo& upper_lower) {
-    if (upper_lower == oneapi::mkl::uplo::upper) {
-        return 'u';
-    }
-    else {
-        return 'l';
-    }
-}
-
-template <>
-inline char convert_to_portblas_type<oneapi::mkl::side>(oneapi::mkl::side& left_right) {
-    if (left_right == oneapi::mkl::side::left) {
-        return 'l';
-    }
-    else {
-        return 'r';
-    }
-}
-
-template <>
-inline char convert_to_portblas_type<oneapi::mkl::diag>(oneapi::mkl::diag& unit_diag) {
-    if (unit_diag == oneapi::mkl::diag::unit) {
-        return 'u';
-    }
-    else {
-        return 'n';
-    }
-}
-
-template <typename... ArgT>
-inline auto convert_to_portblas_type(ArgT... args) {
-    return std::make_tuple(convert_to_portblas_type(args)...);
-}
-
-/** Throw an unsupported_device exception if a certain argument type is found in
- * the argument pack.
- *  
- *  @tparam CheckT is type to look for a template parameter pack.
- *  @tparam AspectVal is the device aspect required to support CheckT.
-**/
-template <typename CheckT, sycl::aspect AspectVal>
-struct throw_if_unsupported_by_device {
-    /** Operator to throw if unsupported.
- * 
- *  @tparam ArgTs The argument types to check.
- *  @param The message to include in the exception.
- *  @param q is the sycl::queue.
- *  @param args are the remaining args to check for CheckT in.
-**/
-    template <typename... ArgTs>
-    void operator()(const std::string& message, sycl::queue q, ArgTs... args) {
-        static constexpr bool checkTypeInPack = (std::is_same_v<CheckT, ArgTs> || ...);
-        if (checkTypeInPack) {
-            if (!q.get_info<sycl::info::queue::device>().has(AspectVal)) {
-                throw mkl::unsupported_device("blas", message,
-                                              q.get_info<sycl::info::queue::device>());
-            }
-        }
-    }
-};
-
-} // namespace detail
-
-#define CALL_PORTBLAS_FN(portBLASFunc, ...)                                                     \
-    if constexpr (is_column_major()) {                                                          \
-        detail::throw_if_unsupported_by_device<sycl::buffer<double>, sycl::aspect::fp64>{}(     \
-            " portBLAS function requiring fp64 support", __VA_ARGS__);                          \
-        detail::throw_if_unsupported_by_device<sycl::buffer<sycl::half>, sycl::aspect::fp16>{}( \
-            " portBLAS function requiring fp16 support", __VA_ARGS__);                          \
-        auto args = detail::convert_to_portblas_type(__VA_ARGS__);                              \
-        auto fn = [](auto&&... targs) {                                                         \
-            portBLASFunc(std::forward<decltype(targs)>(targs)...);                              \
-        };                                                                                      \
-        try {                                                                                   \
-            std::apply(fn, args);                                                               \
-        }                                                                                       \
-        catch (const ::blas::unsupported_exception& e) {                                        \
-            throw unimplemented("blas", e.what());                                              \
-        }                                                                                       \
-    }                                                                                           \
-    else {                                                                                      \
-        throw unimplemented("blas", "portBLAS function");                                       \
-    }
-
-#define CALL_PORTBLAS_USM_FN(portblasFunc, ...)                                   \
-    if constexpr (is_column_major()) {                                            \
-        detail::throw_if_unsupported_by_device<double, sycl::aspect::fp64>{}(     \
-            " portBLAS function requiring fp64 support", __VA_ARGS__);            \
-        detail::throw_if_unsupported_by_device<sycl::half, sycl::aspect::fp16>{}( \
-            " portBLAS function requiring fp16 support", __VA_ARGS__);            \
-        auto args = detail::convert_to_portblas_type(__VA_ARGS__);                \
-        auto fn = [](auto&&... targs) {                                           \
-            return portblasFunc(std::forward<decltype(targs)>(targs)...).back();  \
-        };                                                                        \
-        try {                                                                     \
-            return std::apply(fn, args);                                          \
-        }                                                                         \
-        catch (const ::blas::unsupported_exception& e) {                          \
-            throw unimplemented("blas", e.what());                                \
-        }                                                                         \
-    }                                                                             \
-    else {                                                                        \
-        throw unimplemented("blas", "portBLAS function");                         \
-    }
-
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _PORTBLAS_COMMON_HPP_
diff --git a/src/blas/backends/portblas/portblas_gemm_bias.cxx b/src/blas/backends/portblas/portblas_gemm_bias.cxx
deleted file mode 100644
index 30f638f3e..000000000
--- a/src/blas/backends/portblas/portblas_gemm_bias.cxx
+++ /dev/null
@@ -1,90 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-               oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-               float alpha, sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "");
-}
-
-void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-               oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-               float alpha, sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "");
-}
-
-void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-               oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-               float alpha, sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "");
-}
-
-void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-               oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k,
-               float alpha, sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "");
-}
-
-// USM APIs
-
-sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                      oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m,
-                      std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-                      std::int64_t lda, std::int8_t ao, const std::uint8_t *b, std::int64_t ldb,
-                      std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", " for USM");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                      oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m,
-                      std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-                      std::int64_t lda, std::int8_t ao, const std::int8_t *b, std::int64_t ldb,
-                      std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", " for USM");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                      oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m,
-                      std::int64_t n, std::int64_t k, float alpha, const std::uint8_t *a,
-                      std::int64_t lda, std::uint8_t ao, const std::int8_t *b, std::int64_t ldb,
-                      std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", " for USM");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa,
-                      oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m,
-                      std::int64_t n, std::int64_t k, float alpha, const std::uint8_t *a,
-                      std::int64_t lda, std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb,
-                      std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-                      const std::int32_t *co, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", " for USM");
-}
diff --git a/src/blas/backends/portblas/portblas_level1.cxx b/src/blas/backends/portblas/portblas_level1.cxx
deleted file mode 100644
index e1e1f2f60..000000000
--- a/src/blas/backends/portblas/portblas_level1.cxx
+++ /dev/null
@@ -1,410 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<real_t>, 1> &result) {
-    throw unimplemented("blas", "dotc", "");
-}
-
-void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<real_t>, 1> &result) {
-    throw unimplemented("blas", "dotu", "");
-}
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    CALL_PORTBLAS_FN(::blas::_iamax, queue, n, x, incx, result);
-}
-
-void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    throw unimplemented("blas", "iamax", "");
-}
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    CALL_PORTBLAS_FN(::blas::_iamin, queue, n, x, incx, result);
-}
-
-void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-           std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    throw unimplemented("blas", "iamin", "");
-}
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-          std::int64_t incx, sycl::buffer<real_t, 1> &result) {
-    throw unimplemented("blas", "asum", "");
-}
-
-void asum(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-          sycl::buffer<real_t, 1> &result) {
-    // portBLAS asum implementation requires that result is initialized to zero
-    // before performing the computation.
-    queue.submit([&](sycl::handler &cgh) {
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task([=]() { result_acc[0] = real_t(0); });
-    });
-    CALL_PORTBLAS_FN(::blas::_asum, queue, n, x, incx, result);
-}
-
-void axpy(sycl::queue &queue, std::int64_t n, real_t alpha, sycl::buffer<real_t, 1> &x,
-          std::int64_t incx, sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_axpy, queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(sycl::queue &queue, std::int64_t n, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "axpy", "for complex");
-}
-
-void axpby(sycl::queue &queue, std::int64_t n, real_t alpha, sycl::buffer<real_t, 1> &x,
-           std::int64_t incx, real_t beta, sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "axpby", "");
-}
-
-void axpby(sycl::queue &queue, std::int64_t n, std::complex<real_t> alpha,
-           sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx, std::complex<real_t> beta,
-           sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "axpby", "");
-}
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-          sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_copy, queue, n, x, incx, y, incy);
-}
-
-void copy(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "copy", " for complex.");
-}
-
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-         sycl::buffer<real_t, 1> &y, std::int64_t incy, sycl::buffer<real_t, 1> &result) {
-    // portBLAS dot implementation requires that result is initialized to zero
-    // before performing the computation.
-    queue.submit([&](sycl::handler &cgh) {
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task([=]() { result_acc[0] = real_t(0); });
-    });
-    CALL_PORTBLAS_FN(::blas::_dot, queue, n, x, incx, y, incy, result);
-}
-
-#ifdef ENABLE_MIXED_PRECISION_WITH_DOUBLE
-void dot(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x, std::int64_t incx,
-         sycl::buffer<float, 1> &y, std::int64_t incy, sycl::buffer<double, 1> &result) {
-    throw unimplemented("blas", "dot", " for unmatched return type");
-}
-#endif
-
-void sdsdot(sycl::queue &queue, std::int64_t n, real_t sb, sycl::buffer<real_t, 1> &x,
-            std::int64_t incx, sycl::buffer<real_t, 1> &y, std::int64_t incy,
-            sycl::buffer<real_t, 1> &result) {
-    // portBLAS sdsdot implementation requires that result is initialized to zero
-    // before performing the computation.
-    queue.submit([&](sycl::handler &cgh) {
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task([=]() { result_acc[0] = real_t(0); });
-    });
-    CALL_PORTBLAS_FN(::blas::_sdsdot, queue, n, sb, x, incx, y, incy, result);
-}
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-          std::int64_t incx, sycl::buffer<real_t, 1> &result) {
-    throw unimplemented("blas", "nrm2", " for complex");
-}
-
-void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-          sycl::buffer<real_t, 1> &result) {
-    // portBLAS nrm2 implementation requires that result is initialized to zero
-    // before performing the computation.
-    queue.submit([&](sycl::handler &cgh) {
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task([=]() { result_acc[0] = real_t(0); });
-    });
-    CALL_PORTBLAS_FN(::blas::_nrm2, queue, n, x, incx, result);
-}
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-         std::int64_t incx, sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy, real_t c,
-         real_t s) {
-    throw unimplemented("blas", "rot", " for complex");
-}
-
-void rot(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-         sycl::buffer<real_t, 1> &y, std::int64_t incy, real_t c, real_t s) {
-    CALL_PORTBLAS_FN(::blas::_rot, queue, n, x, incx, y, incy, c, s);
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<real_t, 1> &a, sycl::buffer<real_t, 1> &b,
-          sycl::buffer<real_t, 1> &c, sycl::buffer<real_t, 1> &s) {
-    CALL_PORTBLAS_FN(::blas::_rotg, queue, a, b, c, s);
-}
-
-void rotg(sycl::queue &queue, sycl::buffer<std::complex<real_t>, 1> &a,
-          sycl::buffer<std::complex<real_t>, 1> &b, sycl::buffer<real_t, 1> &c,
-          sycl::buffer<std::complex<real_t>, 1> &s) {
-    throw unimplemented("blas", "rotg", " for complex");
-}
-
-void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-          sycl::buffer<real_t, 1> &y, std::int64_t incy, sycl::buffer<real_t, 1> &param) {
-    CALL_PORTBLAS_FN(::blas::_rotm, queue, n, x, incx, y, incy, param);
-}
-
-void rotmg(sycl::queue &queue, sycl::buffer<real_t, 1> &d1, sycl::buffer<real_t, 1> &d2,
-           sycl::buffer<real_t, 1> &x1, real_t y1, sycl::buffer<real_t, 1> &param) {
-    sycl::buffer<real_t, 1> y1_buffer(&y1, sycl::range<1>{ 1 });
-    CALL_PORTBLAS_FN(::blas::_rotmg, queue, d1, d2, x1, y1_buffer, param);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, real_t alpha, sycl::buffer<real_t, 1> &x,
-          std::int64_t incx) {
-    CALL_PORTBLAS_FN(::blas::_scal, queue, n, alpha, x, incx);
-}
-
-void scal(sycl::queue &queue, std::int64_t n, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "scal", " for complex");
-}
-
-void scal(sycl::queue &queue, std::int64_t n, real_t alpha,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "scal", " for complex");
-}
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<real_t, 1> &x, std::int64_t incx,
-          sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_swap, queue, n, x, incx, y, incy);
-}
-
-void swap(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &x,
-          std::int64_t incx, sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "swap", " for complex");
-}
-
-// USM APIs
-
-sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex<real_t> *x,
-                 std::int64_t incx, const std::complex<real_t> *y, std::int64_t incy,
-                 std::complex<real_t> *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dotc", " for USM");
-}
-
-sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex<real_t> *x,
-                 std::int64_t incx, const std::complex<real_t> *y, std::int64_t incy,
-                 std::complex<real_t> *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dotu", " for USM");
-}
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_iamax, queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex<real_t> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "iamax", " for USM");
-}
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx,
-                  std::int64_t *result, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_iamin, queue, n, x, incx, result, dependencies);
-}
-
-sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex<real_t> *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "iamin", " for USM");
-}
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex<real_t> *x,
-                 std::int64_t incx, real_t *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "asum", " for USM");
-}
-
-sycl::event asum(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx,
-                 real_t *result, const std::vector<sycl::event> &dependencies) {
-    // portBLAS asum implementation requires result to be initializes to zero
-    // before starting the computation.
-    auto init_res_val = queue.submit(
-        [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); });
-    std::vector<sycl::event> new_dependencies = dependencies;
-    new_dependencies.push_back(init_res_val);
-    CALL_PORTBLAS_USM_FN(::blas::_asum, queue, n, x, incx, result, new_dependencies);
-}
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, real_t alpha, const real_t *x,
-                 std::int64_t incx, real_t *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_axpy, queue, n, alpha, x, incx, y, incy, dependencies);
-}
-
-sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex<real_t> alpha,
-                 const std::complex<real_t> *x, std::int64_t incx, std::complex<real_t> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpy", " for USM");
-}
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, real_t alpha, const real_t *x,
-                  std::int64_t incx, const real_t beta, real_t *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", " for USM");
-}
-
-sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex<real_t> alpha,
-                  const std::complex<real_t> *x, std::int64_t incx, const std::complex<real_t> beta,
-                  std::complex<real_t> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", " for USM");
-}
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx, real_t *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_copy, queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex<real_t> *x,
-                 std::int64_t incx, std::complex<real_t> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "copy", " for USM");
-}
-
-sycl::event dot(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx,
-                const real_t *y, std::int64_t incy, real_t *result,
-                const std::vector<sycl::event> &dependencies) {
-    // portBLAS dot implementation requires result to be initializes to zero
-    // before starting the computation.
-    auto init_res_val = queue.submit(
-        [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); });
-    std::vector<sycl::event> new_dependencies = dependencies;
-    new_dependencies.emplace_back(init_res_val);
-    CALL_PORTBLAS_USM_FN(::blas::_dot, queue, n, x, incx, y, incy, result, new_dependencies);
-}
-
-#ifdef ENABLE_MIXED_PRECISION_WITH_DOUBLE
-sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx,
-                const float *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dot", " for USM");
-}
-#endif
-
-sycl::event sdsdot(sycl::queue &queue, std::int64_t n, real_t sb, const real_t *x,
-                   std::int64_t incx, const real_t *y, std::int64_t incy, real_t *result,
-                   const std::vector<sycl::event> &dependencies) {
-    // portBLAS sdsdot implementation requires result to be initializes to zero
-    // before starting the computation.
-    auto init_res_val = queue.submit(
-        [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); });
-    std::vector<sycl::event> new_dependencies = dependencies;
-    new_dependencies.emplace_back(init_res_val);
-    CALL_PORTBLAS_USM_FN(::blas::_sdsdot, queue, n, sb, x, incx, y, incy, result, new_dependencies);
-}
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex<real_t> *x,
-                 std::int64_t incx, real_t *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "nrm2", " for USM");
-}
-
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx,
-                 real_t *result, const std::vector<sycl::event> &dependencies) {
-    // portBLAS nrm2 implementation requires result to be initializes to zero
-    // before starting the computation.
-    auto init_res_val = queue.submit(
-        [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); });
-    std::vector<sycl::event> new_dependencies = dependencies;
-    new_dependencies.push_back(init_res_val);
-    CALL_PORTBLAS_USM_FN(::blas::_nrm2, queue, n, x, incx, result, new_dependencies);
-}
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex<real_t> *x, std::int64_t incx,
-                std::complex<real_t> *y, std::int64_t incy, real_t c, real_t s,
-                const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "rot", " for USM");
-}
-
-sycl::event rot(sycl::queue &queue, std::int64_t n, real_t *x, std::int64_t incx, real_t *y,
-                std::int64_t incy, real_t c, real_t s,
-                const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_rot, queue, n, x, incx, y, incy, c, s, dependencies);
-}
-
-sycl::event rotg(sycl::queue &queue, real_t *a, real_t *b, real_t *c, real_t *s,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_rotg, queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(sycl::queue &queue, std::complex<real_t> *a, std::complex<real_t> *b, real_t *c,
-                 std::complex<real_t> *s, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "rotg", " for USM");
-}
-
-sycl::event rotm(sycl::queue &queue, std::int64_t n, real_t *x, std::int64_t incx, real_t *y,
-                 std::int64_t incy, real_t *param, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_rotm, queue, n, x, incx, y, incy, param, dependencies);
-}
-
-sycl::event rotmg(sycl::queue &queue, real_t *d1, real_t *d2, real_t *x1, real_t y1, real_t *param,
-                  const std::vector<sycl::event> &dependencies) {
-    auto y_d =
-        (real_t *)sycl::malloc_device(sizeof(real_t), queue.get_device(), queue.get_context());
-    auto copy_in_event = queue.memcpy(y_d, &y1, sizeof(real_t), dependencies);
-    auto rotmg_event = std::invoke([&]() -> sycl::event {
-        CALL_PORTBLAS_USM_FN(::blas::_rotmg, queue, d1, d2, x1, y_d, param,
-                             std::vector<sycl::event>{ copy_in_event });
-    });
-    auto free_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(rotmg_event);
-        cgh.host_task([=]() { sycl::free(y_d, queue); });
-    });
-    return free_event;
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, real_t alpha, real_t *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_scal, queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex<real_t> alpha,
-                 std::complex<real_t> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "scal", " for USM");
-}
-
-sycl::event scal(sycl::queue &queue, std::int64_t n, real_t alpha, std::complex<real_t> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "scal", " for USM");
-}
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, real_t *x, std::int64_t incx, real_t *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_swap, queue, n, x, incx, y, incy, dependencies);
-}
-
-sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex<real_t> *x, std::int64_t incx,
-                 std::complex<real_t> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "swap", " for USM");
-}
diff --git a/src/blas/backends/portblas/portblas_level1_double.cpp b/src/blas/backends/portblas/portblas_level1_double.cpp
deleted file mode 100644
index 4c99f98c6..000000000
--- a/src/blas/backends/portblas/portblas_level1_double.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "portblas_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-
-using real_t = double;
-#define ENABLE_MIXED_PRECISION_WITH_DOUBLE
-
-namespace column_major {
-
-#define COLUMN_MAJOR
-constexpr bool is_column_major() {
-    return true;
-}
-#include "portblas_level1.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-constexpr bool is_column_major() {
-    return false;
-}
-#include "portblas_level1.cxx"
-#undef ROW_MAJOR
-
-#undef ENABLE_MIXED_PRECISION_WITH_DOUBLE
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_level1_float.cpp b/src/blas/backends/portblas/portblas_level1_float.cpp
deleted file mode 100644
index 744729f1a..000000000
--- a/src/blas/backends/portblas/portblas_level1_float.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "portblas_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-
-using real_t = float;
-
-namespace column_major {
-
-#define COLUMN_MAJOR
-constexpr bool is_column_major() {
-    return true;
-}
-#include "portblas_level1.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-constexpr bool is_column_major() {
-    return false;
-}
-#include "portblas_level1.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_level2.cxx b/src/blas/backends/portblas/portblas_level2.cxx
deleted file mode 100644
index b3d8b6766..000000000
--- a/src/blas/backends/portblas/portblas_level2.cxx
+++ /dev/null
@@ -1,470 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-          real_t alpha, sycl::buffer<real_t, 1> &a, std::int64_t lda, sycl::buffer<real_t, 1> &x,
-          std::int64_t incx, real_t beta, sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_gemv, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "gemv", " for complex");
-}
-
-void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-          std::int64_t kl, std::int64_t ku, real_t alpha, sycl::buffer<real_t, 1> &a,
-          std::int64_t lda, sycl::buffer<real_t, 1> &x, std::int64_t incx, real_t beta,
-          sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_gbmv, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
-                     incy);
-}
-
-void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-          std::int64_t kl, std::int64_t ku, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "gbmv", " for complex");
-}
-
-void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, real_t alpha,
-         sycl::buffer<real_t, 1> &x, std::int64_t incx, sycl::buffer<real_t, 1> &y,
-         std::int64_t incy, sycl::buffer<real_t, 1> &a, std::int64_t lda) {
-    CALL_PORTBLAS_FN(::blas::_ger, queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda) {
-    throw unimplemented("blas", "gerc", "");
-}
-
-void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda) {
-    throw unimplemented("blas", "geru", "");
-}
-
-void hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "hbmv", "");
-}
-
-void hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "hemv", "");
-}
-
-void her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-         sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda) {
-    throw unimplemented("blas", "her", "");
-}
-
-void her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda) {
-    throw unimplemented("blas", "her2", "");
-}
-
-void hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy) {
-    throw unimplemented("blas", "hpmv", "");
-}
-
-void hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-         sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<real_t>, 1> &a) {
-    throw unimplemented("blas", "hpr", "");
-}
-
-void hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<real_t>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<real_t>, 1> &a) {
-    throw unimplemented("blas", "hpr2", "");
-}
-
-void sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k,
-          real_t alpha, sycl::buffer<real_t, 1> &a, std::int64_t lda, sycl::buffer<real_t, 1> &x,
-          std::int64_t incx, real_t beta, sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_sbmv, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
-                     incy);
-}
-
-void symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-          sycl::buffer<real_t, 1> &a, std::int64_t lda, sycl::buffer<real_t, 1> &x,
-          std::int64_t incx, real_t beta, sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_symv, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-void syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-         sycl::buffer<real_t, 1> &x, std::int64_t incx, sycl::buffer<real_t, 1> &a,
-         std::int64_t lda) {
-    CALL_PORTBLAS_FN(::blas::_syr, queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-          sycl::buffer<real_t, 1> &x, std::int64_t incx, sycl::buffer<real_t, 1> &y,
-          std::int64_t incy, sycl::buffer<real_t, 1> &a, std::int64_t lda) {
-    CALL_PORTBLAS_FN(::blas::_syr2, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-          sycl::buffer<real_t, 1> &a, sycl::buffer<real_t, 1> &x, std::int64_t incx, real_t beta,
-          sycl::buffer<real_t, 1> &y, std::int64_t incy) {
-    CALL_PORTBLAS_FN(::blas::_spmv, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-}
-
-void spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-         sycl::buffer<real_t, 1> &x, std::int64_t incx, sycl::buffer<real_t, 1> &a) {
-    CALL_PORTBLAS_FN(::blas::_spr, queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-          sycl::buffer<real_t, 1> &x, std::int64_t incx, sycl::buffer<real_t, 1> &y,
-          std::int64_t incy, sycl::buffer<real_t, 1> &a) {
-    CALL_PORTBLAS_FN(::blas::_spr2, queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<real_t, 1> &a,
-          std::int64_t lda, sycl::buffer<real_t, 1> &x, std::int64_t incx) {
-    CALL_PORTBLAS_FN(::blas::_tbmv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "tbmv", "");
-}
-
-void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<real_t, 1> &a,
-          std::int64_t lda, sycl::buffer<real_t, 1> &x, std::int64_t incx) {
-    CALL_PORTBLAS_FN(::blas::_tbsv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "tbsv", "");
-}
-
-void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<real_t, 1> &a,
-          sycl::buffer<real_t, 1> &x, std::int64_t incx) {
-    CALL_PORTBLAS_FN(::blas::_tpmv, queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &a,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "tpmv", "");
-}
-
-void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<real_t, 1> &a,
-          sycl::buffer<real_t, 1> &x, std::int64_t incx) {
-    CALL_PORTBLAS_FN(::blas::_tpsv, queue, upper_lower, trans, unit_diag, n, a, x, incx);
-}
-
-void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &a,
-          sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "tpsv", "");
-}
-
-void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<real_t, 1> &a, std::int64_t lda,
-          sycl::buffer<real_t, 1> &x, std::int64_t incx) {
-    CALL_PORTBLAS_FN(::blas::_trmv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "trmv", " for complex");
-}
-
-void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<real_t, 1> &a, std::int64_t lda,
-          sycl::buffer<real_t, 1> &x, std::int64_t incx) {
-    CALL_PORTBLAS_FN(::blas::_trsv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-}
-
-void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer<std::complex<real_t>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<real_t>, 1> &x, std::int64_t incx) {
-    throw unimplemented("blas", "trsv", "");
-}
-
-// USM APIs
-
-sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                 real_t alpha, const real_t *a, std::int64_t lda, const real_t *x,
-                 std::int64_t incx, real_t beta, real_t *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_gemv, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,
-                         dependencies);
-}
-
-sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                 std::complex<real_t> alpha, const std::complex<real_t> *a, std::int64_t lda,
-                 const std::complex<real_t> *x, std::int64_t incx, std::complex<real_t> beta,
-                 std::complex<real_t> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemv", " for USM");
-}
-
-sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, real_t alpha, const real_t *a, std::int64_t lda,
-                 const real_t *x, std::int64_t incx, real_t beta, real_t *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_gbmv, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y,
-                         incy, dependencies);
-}
-
-sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                 std::int64_t kl, std::int64_t ku, std::complex<real_t> alpha,
-                 const std::complex<real_t> *a, std::int64_t lda, const std::complex<real_t> *x,
-                 std::int64_t incx, std::complex<real_t> beta, std::complex<real_t> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gbmv", " for USM");
-}
-
-sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, real_t alpha, const real_t *x,
-                std::int64_t incx, const real_t *y, std::int64_t incy, real_t *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_ger, queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-                 const std::complex<real_t> *x, std::int64_t incx, const std::complex<real_t> *y,
-                 std::int64_t incy, std::complex<real_t> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gerc", " for USM");
-}
-
-sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-                 const std::complex<real_t> *x, std::int64_t incx, const std::complex<real_t> *y,
-                 std::int64_t incy, std::complex<real_t> *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "geru", " for USM");
-}
-
-sycl::event hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k,
-                 std::complex<real_t> alpha, const std::complex<real_t> *a, std::int64_t lda,
-                 const std::complex<real_t> *x, std::int64_t incx, std::complex<real_t> beta,
-                 std::complex<real_t> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hbmv", " for USM");
-}
-
-sycl::event hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                 std::complex<real_t> alpha, const std::complex<real_t> *a, std::int64_t lda,
-                 const std::complex<real_t> *x, std::int64_t incx, std::complex<real_t> beta,
-                 std::complex<real_t> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hemv", " for USM");
-}
-
-sycl::event her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                const std::complex<real_t> *x, std::int64_t incx, std::complex<real_t> *a,
-                std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "her", " for USM");
-}
-
-sycl::event her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                 std::complex<real_t> alpha, const std::complex<real_t> *x, std::int64_t incx,
-                 const std::complex<real_t> *y, std::int64_t incy, std::complex<real_t> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "her2", " for USM");
-}
-
-sycl::event hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                 std::complex<real_t> alpha, const std::complex<real_t> *a,
-                 const std::complex<real_t> *x, std::int64_t incx, std::complex<real_t> beta,
-                 std::complex<real_t> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hpmv", " for USM");
-}
-
-sycl::event hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                const std::complex<real_t> *x, std::int64_t incx, std::complex<real_t> *a,
-                const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hpr", " for USM");
-}
-
-sycl::event hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                 std::complex<real_t> alpha, const std::complex<real_t> *x, std::int64_t incx,
-                 const std::complex<real_t> *y, std::int64_t incy, std::complex<real_t> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hpr2", " for USM");
-}
-
-sycl::event sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k,
-                 real_t alpha, const real_t *a, std::int64_t lda, const real_t *x,
-                 std::int64_t incx, real_t beta, real_t *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_sbmv, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,
-                         incy, dependencies);
-}
-
-sycl::event symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                 const real_t *a, std::int64_t lda, const real_t *x, std::int64_t incx, real_t beta,
-                 real_t *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_symv, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y,
-                         incy, dependencies);
-}
-
-sycl::event syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                const real_t *x, std::int64_t incx, real_t *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_syr, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies);
-}
-
-sycl::event syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                 const real_t *x, std::int64_t incx, const real_t *y, std::int64_t incy, real_t *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_syr2, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,
-                         dependencies);
-}
-
-sycl::event spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                 const real_t *a, const real_t *x, std::int64_t incx, real_t beta, real_t *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_spmv, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,
-                         dependencies);
-}
-
-sycl::event spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                const real_t *x, std::int64_t incx, real_t *a,
-                const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_spr, queue, upper_lower, n, alpha, x, incx, a, dependencies);
-}
-
-sycl::event spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha,
-                 const real_t *x, std::int64_t incx, const real_t *y, std::int64_t incy, real_t *a,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_spr2, queue, upper_lower, n, alpha, x, incx, y, incy, a,
-                         dependencies);
-}
-
-sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, const real_t *a,
-                 std::int64_t lda, real_t *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_tbmv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
-                         dependencies);
-}
-
-sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k,
-                 const std::complex<real_t> *a, std::int64_t lda, std::complex<real_t> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tbmv", " for USM");
-}
-
-sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, const real_t *a,
-                 std::int64_t lda, real_t *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_tbsv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx,
-                         dependencies);
-}
-
-sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k,
-                 const std::complex<real_t> *a, std::int64_t lda, std::complex<real_t> *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tbsv", " for USM");
-}
-
-sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const real_t *a, real_t *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_tpmv, queue, upper_lower, trans, unit_diag, n, a, x, incx,
-                         dependencies);
-}
-
-sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex<real_t> *a,
-                 std::complex<real_t> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tpmv", " for USM");
-}
-
-sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const real_t *a, real_t *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_tpsv, queue, upper_lower, trans, unit_diag, n, a, x, incx,
-                         dependencies);
-}
-
-sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex<real_t> *a,
-                 std::complex<real_t> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "tpsv", " for USM");
-}
-
-sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const real_t *a, std::int64_t lda,
-                 real_t *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_trmv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
-                         dependencies);
-}
-
-sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex<real_t> *a,
-                 std::int64_t lda, std::complex<real_t> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trmv", " for USM");
-}
-
-sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const real_t *a, std::int64_t lda,
-                 real_t *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_trsv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx,
-                         dependencies);
-}
-
-sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex<real_t> *a,
-                 std::int64_t lda, std::complex<real_t> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsv", " for USM");
-}
diff --git a/src/blas/backends/portblas/portblas_level2_double.cpp b/src/blas/backends/portblas/portblas_level2_double.cpp
deleted file mode 100644
index 092aa0c59..000000000
--- a/src/blas/backends/portblas/portblas_level2_double.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "portblas_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-
-using real_t = double;
-
-namespace column_major {
-
-#define COLUMN_MAJOR
-constexpr bool is_column_major() {
-    return true;
-}
-#include "portblas_level2.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-constexpr bool is_column_major() {
-    return false;
-}
-#include "portblas_level2.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_level2_float.cpp b/src/blas/backends/portblas/portblas_level2_float.cpp
deleted file mode 100644
index 7308c05da..000000000
--- a/src/blas/backends/portblas/portblas_level2_float.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "portblas_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-
-using real_t = float;
-
-namespace column_major {
-
-#define COLUMN_MAJOR
-constexpr bool is_column_major() {
-    return true;
-}
-#include "portblas_level2.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-constexpr bool is_column_major() {
-    return false;
-}
-#include "portblas_level2.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_level3.cxx b/src/blas/backends/portblas/portblas_level3.cxx
deleted file mode 100644
index 4eeb1e8f1..000000000
--- a/src/blas/backends/portblas/portblas_level3.cxx
+++ /dev/null
@@ -1,451 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// Buffer APIs
-
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer<real_t, 1> &a,
-          std::int64_t lda, sycl::buffer<real_t, 1> &b, std::int64_t ldb, real_t beta,
-          sycl::buffer<real_t, 1> &c, std::int64_t ldc) {
-    CALL_PORTBLAS_FN(::blas::_gemm, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
-                     ldc);
-}
-
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    using sycl_complex_real_t = sycl::ext::oneapi::experimental::complex<real_t>;
-    if (transa == oneapi::mkl::transpose::conjtrans ||
-        transb == oneapi::mkl::transpose::conjtrans) {
-        throw unimplemented("blas", "gemm", "Conjugate Transpose unsupported yet on portBLAS");
-    }
-    // Intermediate buffers for conversion purposes as portBLAS expects sycl::complex instead of std::complex
-    sycl::buffer<sycl_complex_real_t, 1> a_pb{ sycl::range<1>(a.size()) };
-    sycl::buffer<sycl_complex_real_t, 1> b_pb{ sycl::range<1>(b.size()) };
-    sycl::buffer<sycl_complex_real_t, 1> c_pb{ sycl::range<1>(c.size()) };
-
-    sycl::accessor<std::complex<real_t>, 1, sycl::access::mode::read> a_acc(a);
-    sycl::accessor<sycl_complex_real_t, 1, sycl::access::mode::write> a_pb_acc(a_pb);
-    queue.copy(a_acc, a_pb_acc);
-
-    sycl::accessor<std::complex<real_t>, 1, sycl::access::mode::read> b_acc(b);
-    sycl::accessor<sycl_complex_real_t, 1, sycl::access::mode::write> b_pb_acc(b_pb);
-    queue.copy(b_acc, b_pb_acc);
-
-    sycl::accessor<std::complex<real_t>, 1, sycl::access::mode::read> c_acc(c);
-    sycl::accessor<sycl_complex_real_t, 1, sycl::access::mode::write> c_pb_acc(c_pb);
-    queue.copy(c_acc, c_pb_acc);
-
-    CALL_PORTBLAS_FN(::blas::_gemm, queue, transa, transb, m, n, k, alpha, a_pb, lda, b_pb, ldb,
-                     beta, c_pb, ldc);
-
-    // Copy c_pb back to c
-    sycl::accessor<std::complex<real_t>, 1, sycl::access::mode::write> out_acc(c);
-    sycl::accessor<sycl_complex_real_t, 1, sycl::access::mode::read> out_pb_acc(c_pb);
-    queue.copy(out_pb_acc, out_acc);
-}
-
-void symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-          std::int64_t m, std::int64_t n, real_t alpha, sycl::buffer<real_t, 1> &a,
-          std::int64_t lda, sycl::buffer<real_t, 1> &b, std::int64_t ldb, real_t beta,
-          sycl::buffer<real_t, 1> &c, std::int64_t ldc) {
-    CALL_PORTBLAS_FN(::blas::_symm, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                     beta, c, ldc);
-}
-
-void symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "symm", "");
-}
-
-void hemm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "hemm", "");
-}
-
-void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer<real_t, 1> &a,
-          std::int64_t lda, real_t beta, sycl::buffer<real_t, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "syrk", "");
-}
-
-void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-          sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda, std::complex<real_t> beta,
-          sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "syrk", "");
-}
-
-void herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-          std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer<std::complex<real_t>, 1> &a,
-          std::int64_t lda, real_t beta, sycl::buffer<std::complex<real_t>, 1> &c,
-          std::int64_t ldc) {
-    throw unimplemented("blas", "herk", "");
-}
-
-void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-           std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer<real_t, 1> &a,
-           std::int64_t lda, sycl::buffer<real_t, 1> &b, std::int64_t ldb, real_t beta,
-           sycl::buffer<real_t, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "syr2k", "");
-}
-
-void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-           sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb, std::complex<real_t> beta,
-           sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "syr2k", "");
-}
-
-void her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-           sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb, real_t beta,
-           sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "her2k", "");
-}
-
-void trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-          oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-          real_t alpha, sycl::buffer<real_t, 1> &a, std::int64_t lda, sycl::buffer<real_t, 1> &b,
-          std::int64_t ldb) {
-    throw unimplemented("blas", "trmm", "");
-}
-
-void trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-          oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb) {
-    throw unimplemented("blas", "trmm", "");
-}
-
-void trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-          oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-          real_t alpha, sycl::buffer<real_t, 1> &a, std::int64_t lda, sycl::buffer<real_t, 1> &b,
-          std::int64_t ldb) {
-    CALL_PORTBLAS_FN(::blas::_trsm, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha,
-                     a, lda, b, ldb);
-}
-
-void trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-          oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb) {
-    throw unimplemented("blas", "trsm", " for complex");
-}
-
-void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-           oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, real_t alpha,
-           sycl::buffer<real_t, 1> &a, std::int64_t lda, sycl::buffer<real_t, 1> &b,
-           std::int64_t ldb, real_t beta, sycl::buffer<real_t, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "");
-}
-
-void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-           oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k,
-           std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb, std::complex<real_t> beta,
-           sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "");
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha,
-              sycl::buffer<real_t, 1> &a, std::int64_t lda, sycl::buffer<real_t, 1> &b,
-              std::int64_t ldb) {
-    CALL_PORTBLAS_FN(::blas::_omatcopy, queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-              std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb) {
-    throw unimplemented("blas", "omatcopy", "");
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha,
-               sycl::buffer<real_t, 1> &a, std::int64_t lda, std::int64_t stridea,
-               sycl::buffer<real_t, 1> &b, std::int64_t ldb, std::int64_t strideb) {
-    CALL_PORTBLAS_FN(::blas::_omatcopy2, queue, trans, m, n, alpha, a, lda, stridea, b, ldb,
-                     strideb);
-}
-
-void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-               std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<real_t>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha,
-              sycl::buffer<real_t, 1> &ab, std::int64_t lda, std::int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-              std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "");
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-             real_t alpha, sycl::buffer<real_t, 1> &a, std::int64_t lda, real_t beta,
-             sycl::buffer<real_t, 1> &b, std::int64_t ldb, sycl::buffer<real_t, 1> &c,
-             std::int64_t ldc) {
-    CALL_PORTBLAS_FN(::blas::_omatadd, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c,
-                     ldc);
-}
-
-void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n,
-             std::complex<real_t> alpha, sycl::buffer<std::complex<real_t>, 1> &a, std::int64_t lda,
-             std::complex<real_t> beta, sycl::buffer<std::complex<real_t>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<real_t>, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "omatadd", "");
-}
-
-// USM APIs
-
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, real_t alpha, const real_t *a,
-                 std::int64_t lda, const real_t *b, std::int64_t ldb, real_t beta, real_t *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_gemm, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                         c, ldc, dependencies);
-}
-
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-                 const std::complex<real_t> *a, std::int64_t lda, const std::complex<real_t> *b,
-                 std::int64_t ldb, std::complex<real_t> beta, std::complex<real_t> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    if (transa == oneapi::mkl::transpose::conjtrans ||
-        transb == oneapi::mkl::transpose::conjtrans) {
-        throw unimplemented("blas", "gemm", "Conjugate Transpose unsupported yet on portBLAS");
-    }
-    CALL_PORTBLAS_USM_FN(::blas::_gemm, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,
-                         c, ldc, dependencies);
-}
-
-sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                 std::int64_t m, std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda,
-                 const real_t *b, std::int64_t ldb, real_t beta, real_t *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_symm, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb,
-                         beta, c, ldc, dependencies);
-}
-
-sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-                 const std::complex<real_t> *a, std::int64_t lda, const std::complex<real_t> *b,
-                 std::int64_t ldb, std::complex<real_t> beta, std::complex<real_t> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "symm", " for USM");
-}
-
-sycl::event hemm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<real_t> alpha,
-                 const std::complex<real_t> *a, std::int64_t lda, const std::complex<real_t> *b,
-                 std::int64_t ldb, std::complex<real_t> beta, std::complex<real_t> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "hemm", " for USM");
-}
-
-sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 std::int64_t n, std::int64_t k, real_t alpha, const real_t *a, std::int64_t lda,
-                 real_t beta, real_t *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk", " for USM");
-}
-
-sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-                 const std::complex<real_t> *a, std::int64_t lda, std::complex<real_t> beta,
-                 std::complex<real_t> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syrk", " for USM");
-}
-
-sycl::event herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 std::int64_t n, std::int64_t k, real_t alpha, const std::complex<real_t> *a,
-                 std::int64_t lda, real_t beta, std::complex<real_t> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "herk", " for USM");
-}
-
-sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                  std::int64_t n, std::int64_t k, real_t alpha, const real_t *a, std::int64_t lda,
-                  const real_t *b, std::int64_t ldb, real_t beta, real_t *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syr2k", " for USM");
-}
-
-sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-                  const std::complex<real_t> *a, std::int64_t lda, const std::complex<real_t> *b,
-                  std::int64_t ldb, std::complex<real_t> beta, std::complex<real_t> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "syr2k", " for USM");
-}
-
-sycl::event her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<real_t> alpha,
-                  const std::complex<real_t> *a, std::int64_t lda, const std::complex<real_t> *b,
-                  std::int64_t ldb, real_t beta, std::complex<real_t> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "her2k", " for USM");
-}
-
-sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                 std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda, real_t *b,
-                 std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trmm", " for USM");
-}
-
-sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                 std::int64_t n, std::complex<real_t> alpha, const std::complex<real_t> *a,
-                 std::int64_t lda, std::complex<real_t> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trmm", " for USM");
-}
-
-sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                 std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda, real_t *b,
-                 std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_trsm, queue, left_right, upper_lower, trans, unit_diag, m, n,
-                         alpha, a, lda, b, ldb, dependencies);
-}
-
-sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m,
-                 std::int64_t n, std::complex<real_t> alpha, const std::complex<real_t> *a,
-                 std::int64_t lda, std::complex<real_t> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "trsm", " for USM");
-}
-
-sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-                  oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, real_t alpha,
-                  const real_t *a, std::int64_t lda, const real_t *b, std::int64_t ldb, real_t beta,
-                  real_t *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", " for USM");
-}
-
-sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-                  oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k,
-                  std::complex<real_t> alpha, const std::complex<real_t> *a, std::int64_t lda,
-                  const std::complex<real_t> *b, std::int64_t ldb, std::complex<real_t> beta,
-                  std::complex<real_t> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", " for USM");
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     real_t alpha, const real_t *a, std::int64_t lda, real_t *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_omatcopy, queue, trans, m, n, alpha, a, lda, b, ldb,
-                         dependencies);
-}
-
-sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::complex<real_t> alpha, const std::complex<real_t> *a, std::int64_t lda,
-                     std::complex<real_t> *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy", "for USM");
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                      real_t alpha, const real_t *a, std::int64_t lda, std::int64_t stridea,
-                      real_t *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_omatcopy2, queue, trans, m, n, alpha, a, lda, stridea, b, ldb,
-                         strideb, dependencies);
-}
-
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                      std::complex<real_t> alpha, const std::complex<real_t> *a, std::int64_t lda,
-                      std::int64_t stridea, std::complex<real_t> *b, std::int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "for USM");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     real_t alpha, real_t *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n,
-                     std::complex<real_t> alpha, std::complex<real_t> *ab, std::int64_t lda,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "");
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                    std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda, real_t beta,
-                    const real_t *b, std::int64_t ldb, real_t *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    CALL_PORTBLAS_USM_FN(::blas::_omatadd, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb,
-                         c, ldc, dependencies);
-}
-
-sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m,
-                    std::int64_t n, std::complex<real_t> alpha, const std::complex<real_t> *a,
-                    std::int64_t lda, std::complex<real_t> beta, const std::complex<real_t> *b,
-                    std::int64_t ldb, std::complex<real_t> *c, std::int64_t ldc,
-                    const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatadd", "");
-}
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           real_t *alpha, const real_t **a, int64_t *lda, real_t **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "");
-}
-
-sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<real_t> *alpha, const std::complex<real_t> **a,
-                           int64_t *lda, std::complex<real_t> **b, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy_batch", "");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           real_t *alpha, real_t **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<real_t> *alpha, std::complex<real_t> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "");
-}
diff --git a/src/blas/backends/portblas/portblas_level3_bfloat16.cpp b/src/blas/backends/portblas/portblas_level3_bfloat16.cpp
deleted file mode 100644
index 1684b1b3e..000000000
--- a/src/blas/backends/portblas/portblas_level3_bfloat16.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-namespace column_major {
-
-// BUFFER
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<oneapi::mkl::bfloat16, 1> &a, std::int64_t lda,
-          sycl::buffer<oneapi::mkl::bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemm", " for bfloat16");
-}
-
-// USM
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                 const oneapi::mkl::bfloat16 *a, std::int64_t lda, const oneapi::mkl::bfloat16 *b,
-                 std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", " for USM");
-}
-
-} // namespace column_major
-namespace row_major {
-
-// BUFFER
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<oneapi::mkl::bfloat16, 1> &a, std::int64_t lda,
-          sycl::buffer<oneapi::mkl::bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemm", " for bfloat16");
-}
-
-// USM
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                 const oneapi::mkl::bfloat16 *a, std::int64_t lda, const oneapi::mkl::bfloat16 *b,
-                 std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", " for USM");
-}
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_level3_double.cpp b/src/blas/backends/portblas/portblas_level3_double.cpp
deleted file mode 100644
index 9f9d82d37..000000000
--- a/src/blas/backends/portblas/portblas_level3_double.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "portblas_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-
-using real_t = double;
-
-namespace column_major {
-
-#define COLUMN_MAJOR
-constexpr bool is_column_major() {
-    return true;
-}
-#include "portblas_level3.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-constexpr bool is_column_major() {
-    return false;
-}
-#include "portblas_level3.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_level3_float.cpp b/src/blas/backends/portblas/portblas_level3_float.cpp
deleted file mode 100644
index 53a5a1697..000000000
--- a/src/blas/backends/portblas/portblas_level3_float.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "portblas_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-
-using real_t = float;
-
-namespace column_major {
-
-#define COLUMN_MAJOR
-constexpr bool is_column_major() {
-    return true;
-}
-#include "portblas_level3.cxx"
-#include "portblas_gemm_bias.cxx"
-#undef COLUMN_MAJOR
-
-} // namespace column_major
-namespace row_major {
-
-#define ROW_MAJOR
-constexpr bool is_column_major() {
-    return false;
-}
-#include "portblas_level3.cxx"
-#include "portblas_gemm_bias.cxx"
-#undef ROW_MAJOR
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_level3_half.cpp b/src/blas/backends/portblas/portblas_level3_half.cpp
deleted file mode 100644
index 0e42528fa..000000000
--- a/src/blas/backends/portblas/portblas_level3_half.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace portblas {
-namespace column_major {
-
-// BUFFER
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemm", " half");
-}
-
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemm", " for different argument data types");
-}
-
-// USM
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                 const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                 sycl::half beta, sycl::half *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", " for USM");
-}
-
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                 std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", " for USM");
-}
-} // namespace column_major
-namespace row_major {
-
-// BUFFER
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemm", " half");
-}
-
-void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    throw unimplemented("blas", "gemm", " for different argument data types");
-}
-
-// USM
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                 const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                 sycl::half beta, sycl::half *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", " for USM");
-}
-
-sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                 std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm", " for USM");
-}
-
-} // namespace row_major
-} // namespace portblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/portblas/portblas_wrappers.cpp b/src/blas/backends/portblas/portblas_wrappers.cpp
deleted file mode 100644
index 3f6170bb7..000000000
--- a/src/blas/backends/portblas/portblas_wrappers.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//
-// generated file
-//
-
-#include "blas/function_table.hpp"
-
-#include "oneapi/mkl/blas/detail/portblas/onemkl_blas_portblas.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT blas_function_table_t mkl_blas_table = {
-    WRAPPER_VERSION,
-#define BACKEND portblas
-#define MAJOR   column_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#define MAJOR row_major
-#include "../backend_wrappers.cxx"
-#undef MAJOR
-#undef BACKEND
-};
diff --git a/src/blas/backends/rocblas/CMakeLists.txt b/src/blas/backends/rocblas/CMakeLists.txt
deleted file mode 100644
index 76dc126ad..000000000
--- a/src/blas/backends/rocblas/CMakeLists.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-#==========================================================================
-#  Copyright (C) Codeplay Software Limited
-#  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  For your convenience, a copy of the License has been included in this
-#  repository.
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-#=========================================================================
-
-set(LIB_NAME onemkl_blas_rocblas)
-set(LIB_OBJ ${LIB_NAME}_obj)
-find_package(hip REQUIRED)
-find_package(rocblas REQUIRED)
-find_package(Threads REQUIRED)
-
-set(SOURCES rocblas_level1.cpp 
-                rocblas_level2.cpp 
-                rocblas_level3.cpp 
-                rocblas_batch.cpp 
-                rocblas_extensions.cpp
-                $<$<STREQUAL:${ONEMKL_SYCL_IMPLEMENTATION},dpc++>:rocblas_scope_handle.cpp >
-                $<$<STREQUAL:${ONEMKL_SYCL_IMPLEMENTATION},hipsycl>:rocblas_scope_handle_hipsycl.cpp >
-                $<$<BOOL:${BUILD_SHARED_LIBS}>: rocblas_wrappers.cpp>)
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_blas ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-if(NOT ${ONEMKL_SYCL_IMPLEMENTATION} STREQUAL "hipsycl")
-    target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-    target_compile_options(ONEMKL::SYCL::SYCL INTERFACE
-        -fsycl-targets=amdgcn-amd-amdhsa -fsycl-unnamed-lambda
-        -Xsycl-target-backend --offload-arch=${HIP_TARGETS})
-    target_link_options(ONEMKL::SYCL::SYCL INTERFACE
-        -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend 
-        --offload-arch=${HIP_TARGETS})
-else()
-    target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-    target_compile_options(ONEMKL::SYCL::SYCL INTERFACE)
-    target_link_options(ONEMKL::SYCL::SYCL INTERFACE)
-endif()
-
-target_link_libraries(${LIB_OBJ} PRIVATE roc::rocblas hip::host Threads::Threads)
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL)
-target_compile_features(${LIB_OBJ} PUBLIC cxx_std_17)
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON)
-
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
-
diff --git a/src/blas/backends/rocblas/rocblas_batch.cpp b/src/blas/backends/rocblas/rocblas_batch.cpp
deleted file mode 100644
index 5fa103055..000000000
--- a/src/blas/backends/rocblas/rocblas_batch.cpp
+++ /dev/null
@@ -1,2433 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include "rocblas_helper.hpp"
-#include "rocblas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp"
-
-// Helper Functions
-
-template <typename T>
-static inline void conj_vector(sycl::handler &cgh, sycl::buffer<T> &buf, const int64_t len,
-                               const int64_t inc, const int64_t stride, const int64_t batch_size) {
-    const auto abs_inc = std::abs(inc);
-    const auto abs_stride = std::abs(stride);
-    auto acc = buf.template get_access<sycl::access::mode::read_write>(cgh);
-    cgh.parallel_for(sycl::range{ (std::size_t)batch_size, (std::size_t)len },
-                     [=](sycl::item<2> it) {
-                         const auto index = it.get_id(0) * abs_stride + it.get_id(1) * abs_inc;
-                         acc[index] = std::conj(acc[index]);
-                     });
-}
-template <typename T>
-static inline void conj_vector(sycl::handler &cgh, T *ptr, const int64_t len, const int64_t inc,
-                               const int64_t stride, const int64_t batch_size) {
-    const auto abs_inc = std::abs(inc);
-    const auto abs_stride = std::abs(stride);
-    cgh.parallel_for(sycl::range{ (std::size_t)batch_size, (std::size_t)len },
-                     [=](sycl::item<2> it) {
-                         const auto index = it.get_id(0) * abs_stride + it.get_id(1) * abs_inc;
-                         ptr[index] = std::conj(ptr[index]);
-                     });
-}
-
-template <typename T>
-static inline void conj_vector(sycl::handler &cgh, T **ptr, const int64_t len, const int64_t inc,
-                               const int64_t stride, const int64_t group_size) {
-    const auto abs_inc = std::abs(inc);
-    cgh.parallel_for(sycl::range{ (std::size_t)group_size, (std::size_t)len },
-                     [=](sycl::item<2> it) {
-                         const auto col = it.get_id(0) + stride;
-                         const auto row = it.get_id(1) * abs_inc;
-                         ptr[col][row] = std::conj(ptr[col][row]);
-                     });
-}
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-namespace column_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void copy_batch(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x,
-                       int64_t incx, int64_t stridex, sycl::buffer<T, 1> &y, int64_t incy,
-                       int64_t stridey, int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy, stridex, stridey, batch_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, stridex, y_, incy, stridey,
-                                    batch_size);
-        });
-    });
-}
-
-#define COPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                     \
-    void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx,     \
-                    int64_t stridex, sycl::buffer<TYPE, 1> &y, int64_t incy, int64_t stridey,  \
-                    int64_t batch_size) {                                                      \
-        copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey, batch_size); \
-    }
-
-COPY_STRIDED_BATCH_LAUNCHER(float, rocblas_scopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER(double, rocblas_dcopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_ccopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zcopy_strided_batched)
-
-#undef COPY_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer<T, 1> &x,
-                       int64_t incx, int64_t stridex, sycl::buffer<T, 1> &y, int64_t incy,
-                       int64_t stridey, int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy, stridex, stridey, batch_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, stridex,
-                                    y_, incy, stridey, batch_size);
-        });
-    });
-}
-
-#define AXPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                 \
-    void axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x,   \
-                    int64_t incx, int64_t stridex, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-                    int64_t stridey, int64_t batch_size) {                                 \
-        axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, incy, stridey,   \
-                   batch_size);                                                            \
-    }
-
-AXPY_STRIDED_BATCH_LAUNCHER(float, rocblas_saxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER(double, rocblas_daxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_caxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zaxpy_strided_batched)
-
-#undef AXPY_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                       T alpha, sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea,
-                       sycl::buffer<T, 1> &x, int64_t incx, int64_t stridex, T beta,
-                       sycl::buffer<T, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, incx, incy, stridea, stridex, stridey, batch_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<const rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<const rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n,
-                                    (rocDataType *)&alpha, a_, lda, stridea, x_, incx, stridex,
-                                    (rocDataType *)&beta, y_, incy, stridey, batch_size);
-        });
-    });
-}
-
-#define GEMV_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,         \
-                    sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                        \
-                    sycl::buffer<TYPE, 1> &x, int64_t incx, int64_t stridex, TYPE beta,            \
-                    sycl::buffer<TYPE, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) { \
-        gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex,  \
-                   beta, y, incy, stridey, batch_size);                                            \
-    }
-
-GEMV_STRIDED_BATCH_LAUNCHER(float, rocblas_sgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER(double, rocblas_dgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zgemv_strided_batched)
-
-#undef GEMV_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea, sycl::buffer<T, 1> &x,
-                       int64_t incx, int64_t stridex, sycl::buffer<T, 1> &c, int64_t ldc,
-                       int64_t stridec, int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, incx, stridea, stridex, stridec, batch_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<const rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<const rocDataType *>(x_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), m, n, a_,
-                                    lda, stridea, x_, incx, stridex, c_, ldc, stridec, batch_size);
-        });
-    });
-}
-
-#define DGMM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,                     \
-                    sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                        \
-                    sycl::buffer<TYPE, 1> &x, int64_t incx, int64_t stridex,                       \
-                    sycl::buffer<TYPE, 1> &c, int64_t ldc, int64_t stridec, int64_t batch_size) {  \
-        dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, \
-                   ldc, stridec, batch_size);                                                      \
-    }
-
-DGMM_STRIDED_BATCH_LAUNCHER(float, rocblas_sdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER(double, rocblas_ddgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zdgmm_strided_batched)
-
-#undef DGMM_STRIDED_BATCH_LAUNCHER
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                            int64_t n, int64_t k, Ts alpha, sycl::buffer<Ta, 1> &a, int64_t lda,
-                            int64_t stridea, sycl::buffer<Tb, 1> &b, int64_t ldb, int64_t strideb,
-                            Ts beta, sycl::buffer<Tc, 1> &c, int64_t ldc, int64_t stridec,
-                            int64_t batch_size) {
-    using rocTypeA = typename RocEquivalentType<Ta>::Type;
-    using rocTypeB = typename RocEquivalentType<Tb>::Type;
-    using rocTypeC = typename RocEquivalentType<Tc>::Type;
-    using rocTypeS = typename RocEquivalentType<Ts>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc, stridea, strideb, stridec, batch_size);
-
-    int32_t solution_index = 0;
-    rocblas_gemm_flags flags = rocblas_gemm_flags_none;
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<const rocTypeA *>(a_acc);
-            auto b_ = sc.get_mem<const rocTypeB *>(b_acc);
-            auto c_ = sc.get_mem<rocTypeC *>(c_acc);
-
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(rocblas_gemm_strided_batched_ex, err, handle,
-                                    get_rocblas_operation(transa), get_rocblas_operation(transb), m,
-                                    n, k, &alpha, a_, get_rocblas_datatype<rocTypeA>(), lda,
-                                    stridea, b_, get_rocblas_datatype<rocTypeB>(), ldb, strideb,
-                                    &beta, c_, get_rocblas_datatype<rocTypeC>(), ldc, stridec, c_,
-                                    get_rocblas_datatype<rocTypeC>(), ldc, stridec, batch_size,
-                                    get_rocblas_datatype<rocTypeS>(), rocblas_gemm_algo_standard,
-                                    solution_index, flags);
-        });
-    });
-}
-
-#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                               \
-    void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                    int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,             \
-                    int64_t stridea, sycl::buffer<TYPE_B, 1> &b, int64_t ldb, int64_t strideb,    \
-                    TYPE_S beta, sycl::buffer<TYPE_C, 1> &c, int64_t ldc, int64_t stridec,        \
-                    int64_t batch_size) {                                                         \
-        gemm_batch_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb,  \
-                        beta, c, ldc, stridec, batch_size);                                       \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<float>, std::complex<float>, std::complex<float>,
-                            std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<double>, std::complex<double>, std::complex<double>,
-                            std::complex<double>)
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-
-#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                               \
-    void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                    int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,             \
-                    int64_t stridea, sycl::buffer<TYPE_B, 1> &b, int64_t ldb, int64_t strideb,    \
-                    TYPE_S beta, sycl::buffer<TYPE_C, 1> &c, int64_t ldc, int64_t stridec,        \
-                    int64_t batch_size) {                                                         \
-        throw unimplemented("blas", "gemm_batch",                                                 \
-                            std::string("for dtype unimplemented dtype combination <") +          \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +     \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");     \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                       transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                       sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea, sycl::buffer<T, 1> &b,
-                       int64_t ldb, int64_t strideb, int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, stridea, strideb, batch_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<const rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, stridea, b_, ldb, strideb,
-                                    batch_size);
-        });
-    });
-}
-
-#define TRSM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,        \
-                    diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,    \
-                    int64_t lda, int64_t stridea, sycl::buffer<TYPE, 1> &b, int64_t ldb,           \
-                    int64_t strideb, int64_t batch_size) {                                         \
-        trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \
-                   a, lda, stridea, b, ldb, strideb, batch_size);                                  \
-    }
-
-TRSM_STRIDED_BATCH_LAUNCHER(float, rocblas_strsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER(double, rocblas_dtrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_ctrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_ztrsm_strided_batched)
-
-#undef TRSM_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                       int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea,
-                       T beta, sycl::buffer<T, 1> &c, int64_t ldc, int64_t stridec,
-                       int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldc, stridea, stridec, batch_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<const rocDataType *>(a_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, stridea, (rocDataType *)&beta, c_, ldc, stridec,
-                                    batch_size);
-        });
-    });
-}
-
-#define SYRK_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,   \
-                    TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea, TYPE beta, \
-                    sycl::buffer<TYPE, 1> &c, int64_t ldc, int64_t stridec, int64_t batch_size) {  \
-        syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, stridea, beta, \
-                   c, ldc, stridec, batch_size);                                                   \
-    }
-
-SYRK_STRIDED_BATCH_LAUNCHER(float, rocblas_ssyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER(double, rocblas_dsyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_csyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zsyrk_strided_batched)
-
-#undef SYRK_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           const T alpha, sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea,
-                           sycl::buffer<T, 1> &b, int64_t ldb, int64_t strideb,
-                           int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, stridea, strideb, batch_size);
-
-    const T beta = 0;
-    const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n;
-    const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? n : m;
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<const rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans),
-                                    get_rocblas_operation(trans), new_m, new_n,
-                                    (rocDataType *)&alpha, a_, lda, stridea, (rocDataType *)&beta,
-                                    nullptr, lda, stridea, b_, ldb, strideb, batch_size);
-        });
-    });
-}
-
-#define OMATCOPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                    \
-    void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,                \
-                        const TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea, \
-                        sycl::buffer<TYPE, 1> &b, int64_t ldb, int64_t strideb,                   \
-                        int64_t batch_size) {                                                     \
-        omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb,       \
-                       strideb, batch_size);                                                      \
-    }
-
-OMATCOPY_STRIDED_BATCH_LAUNCHER(float, rocblas_sgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER(double, rocblas_dgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATCOPY_STRIDED_BATCH_LAUNCHER
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline void omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                          int64_t m, int64_t n, const T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                          int64_t stridea, const T beta, sycl::buffer<T, 1> &b, int64_t ldb,
-                          int64_t strideb, sycl::buffer<T, 1> &c, int64_t ldc, int64_t stridec,
-                          int64_t batch_size) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc, stridea, strideb, stridec, batch_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<const rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<const rocDataType *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_,
-                                    lda, stridea, (rocDataType *)&beta, b_, ldb, strideb, c_, ldc,
-                                    stridec, batch_size);
-        });
-    });
-}
-
-#define OMATADD_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                     \
-    void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,         \
-                       int64_t n, const TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda,        \
-                       int64_t stridea, const TYPE beta, sycl::buffer<TYPE, 1> &b, int64_t ldb,   \
-                       int64_t strideb, sycl::buffer<TYPE, 1> &c, int64_t ldc, int64_t stridec,   \
-                       int64_t batch_size) {                                                      \
-        omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, beta, \
-                      b, ldb, strideb, c, ldc, stridec, batch_size);                              \
-    }
-
-OMATADD_STRIDED_BATCH_LAUNCHER(float, rocblas_sgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER(double, rocblas_dgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATADD_STRIDED_BATCH_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t *n, const T **x, int64_t *incx,
-                              T **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(n[i], incx[i], incy[i], group_size[i]);
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            int64_t offset = 0;
-            rocblas_status err;
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **x_ = reinterpret_cast<const rocDataType **>(x);
-                auto **y_ = reinterpret_cast<rocDataType **>(y);
-                ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, (int)n[i], x_ + offset, (int)incx[i],
-                                        y_ + offset, (int)incy[i], (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define COPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                          \
-    sycl::event copy_batch(sycl::queue &queue, int64_t *n, const TYPE **x, int64_t *incx,       \
-                           TYPE **y, int64_t *incy, int64_t group_count, int64_t *group_size,   \
-                           const std::vector<sycl::event> &dependencies) {                      \
-        return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, group_count, group_size, \
-                          dependencies);                                                        \
-    }
-
-COPY_BATCH_LAUNCHER_USM(float, rocblas_scopy_batched)
-COPY_BATCH_LAUNCHER_USM(double, rocblas_dcopy_batched)
-COPY_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ccopy_batched)
-COPY_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zcopy_batched)
-
-#undef COPY_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx,
-                              int64_t stridex, T *y, int64_t incy, int64_t stridey,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy, stridex, stridey, batch_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, stridex, y_, incy, stridey,
-                                    batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define COPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event copy_batch(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx,         \
-                           int64_t stridex, TYPE *y, int64_t incy, int64_t stridey,            \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) { \
-        return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey,       \
-                          batch_size, dependencies);                                           \
-    }
-
-COPY_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_scopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dcopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ccopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zcopy_strided_batched)
-
-#undef COPY_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t *n, T *alpha, const T **x,
-                              int64_t *incx, T **y, int64_t *incy, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(n[i], incx[i], incy[i], group_size[i]);
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            int64_t offset = 0;
-            rocblas_status err;
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **x_ = reinterpret_cast<const rocDataType **>(x);
-                auto **y_ = reinterpret_cast<rocDataType **>(y);
-                ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, (int)n[i], (rocDataType *)&alpha[i],
-                                        x_ + offset, (int)incx[i], y_ + offset, (int)incy[i],
-                                        (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define AXPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                          \
-    sycl::event axpy_batch(sycl::queue &queue, int64_t *n, TYPE *alpha, const TYPE **x,         \
-                           int64_t *incx, TYPE **y, int64_t *incy, int64_t group_count,         \
-                           int64_t *group_size, const std::vector<sycl::event> &dependencies) { \
-        return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, group_count,      \
-                          group_size, dependencies);                                            \
-    }
-
-AXPY_BATCH_LAUNCHER_USM(float, rocblas_saxpy_batched)
-AXPY_BATCH_LAUNCHER_USM(double, rocblas_daxpy_batched)
-AXPY_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_caxpy_batched)
-AXPY_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zaxpy_batched)
-
-#undef AXPY_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x,
-                              int64_t incx, int64_t stridex, T *y, int64_t incy, int64_t stridey,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy, stridex, stridey, batch_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, stridex,
-                                    y_, incy, stridey, batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define AXPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                     \
-    sycl::event axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \
-                           int64_t stridex, TYPE *y, int64_t incy, int64_t stridey,                \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {     \
-        return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, incy, stridey,    \
-                          batch_size, dependencies);                                               \
-    }
-
-AXPY_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_saxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_daxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_caxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zaxpy_strided_batched)
-
-#undef AXPY_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                              T alpha, const T *a, int64_t lda, int64_t stridea, const T *x,
-                              int64_t incx, int64_t stridex, T beta, T *y, int64_t incy,
-                              int64_t stridey, int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy, stridea, stridex, stridey, batch_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n,
-                                    (rocDataType *)&alpha, a_, lda, stridea, x_, incx, stridex,
-                                    (rocDataType *)&beta, y_, incy, stridey, batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define GEMV_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                    \
-    sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
-                           const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x,            \
-                           int64_t incx, int64_t stridex, TYPE beta, TYPE *y, int64_t incy,       \
-                           int64_t stridey, int64_t batch_size,                                   \
-                           const std::vector<sycl::event> &dependencies) {                        \
-        return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx,   \
-                          stridex, beta, y, incy, stridey, batch_size, dependencies);             \
-    }
-
-GEMV_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgemv_strided_batched)
-
-#undef GEMV_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m,
-                              int64_t *n, T *alpha, const T **a, int64_t *lda, const T **x,
-                              int64_t *incx, T *beta, T **y, int64_t *incy, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(m[i], n[i], lda[i], incx[i], incy[i], group_size[i]);
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            int64_t offset = 0;
-            rocblas_status err;
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<const rocDataType **>(a);
-                auto **x_ = reinterpret_cast<const rocDataType **>(x);
-                auto **y_ = reinterpret_cast<rocDataType **>(y);
-                ROCBLAS_ERROR_FUNC_SYNC(
-                    func, err, handle, get_rocblas_operation(trans[i]), (int)m[i], (int)n[i],
-                    (rocDataType *)&alpha[i], a_ + offset, (int)lda[i], x_ + offset, (int)incx[i],
-                    (rocDataType *)&beta[i], y_ + offset, (int)incy[i], (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define GEMV_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event gemv_batch(                                                                        \
-        sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, TYPE *alpha, const TYPE **a, \
-        int64_t *lda, const TYPE **x, int64_t *incx, TYPE *beta, TYPE **y, int64_t *incy,          \
-        int64_t group_count, int64_t *group_size, const std::vector<sycl::event> &dependencies) {  \
-        return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y,    \
-                          incy, group_count, group_size, dependencies);                            \
-    }
-
-GEMV_BATCH_LAUNCHER_USM(float, rocblas_sgemv_batched)
-GEMV_BATCH_LAUNCHER_USM(double, rocblas_dgemv_batched)
-GEMV_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgemv_batched)
-GEMV_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgemv_batched)
-
-#undef GEMV_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                              const T *a, int64_t lda, int64_t stridea, const T *x, int64_t incx,
-                              int64_t stridex, T *c, int64_t ldc, int64_t stridec,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, incx, stridea, stridex, stridec, batch_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), m, n, a_,
-                                    lda, stridea, x_, incx, stridex, c_, ldc, stridec, batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define DGMM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                   \
-    sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,            \
-                           const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x,           \
-                           int64_t incx, int64_t stridex, TYPE *c, int64_t ldc, int64_t stridec, \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {   \
-        return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, stridea, x, incx,    \
-                          stridex, c, ldc, stridec, batch_size, dependencies);                   \
-    }
-
-DGMM_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_ddgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zdgmm_strided_batched)
-
-#undef DGMM_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side *left_right, int64_t *m,
-                              int64_t *n, const T **a, int64_t *lda, const T **x, int64_t *incx,
-                              T **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(m[i], n[i], lda[i], ldc[i], incx[i], group_size[i]);
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            rocblas_status err;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<const rocDataType **>(a);
-                auto **x_ = reinterpret_cast<const rocDataType **>(x);
-                auto **c_ = reinterpret_cast<rocDataType **>(c);
-                ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right[i]),
-                                        (int)m[i], (int)n[i], a_ + offset, (int)lda[i], x_ + offset,
-                                        (int)incx[i], c_ + offset, (int)ldc[i], (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define DGMM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                            \
-    sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,          \
-                           const TYPE **a, int64_t *lda, const TYPE **x, int64_t *incx, TYPE **c, \
-                           int64_t *ldc, int64_t group_count, int64_t *group_size,                \
-                           const std::vector<sycl::event> &dependencies) {                        \
-        return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, x, incx, c, ldc,      \
-                          group_count, group_size, dependencies);                                 \
-    }
-
-DGMM_BATCH_LAUNCHER_USM(float, rocblas_sdgmm_batched)
-DGMM_BATCH_LAUNCHER_USM(double, rocblas_ddgmm_batched)
-DGMM_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cdgmm_batched)
-DGMM_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zdgmm_batched)
-
-#undef DGMM_BATCH_LAUNCHER
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose transa,
-                                               transpose transb, int64_t m, int64_t n, int64_t k,
-                                               Ts alpha, const Ta *a, int64_t lda, int64_t stridea,
-                                               const Tb *b, int64_t ldb, int64_t strideb, Ts beta,
-                                               Tc *c, int64_t ldc, int64_t stridec,
-                                               int64_t batch_size,
-                                               const std::vector<sycl::event> &dependencies) {
-    using rocTypeA = typename RocEquivalentType<Ta>::Type;
-    using rocTypeB = typename RocEquivalentType<Tb>::Type;
-    using rocTypeC = typename RocEquivalentType<Tc>::Type;
-    using rocTypeS = typename RocEquivalentType<Ts>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc, stridea, strideb, stridec, batch_size);
-
-    int32_t solution_index = 0;
-    rocblas_gemm_flags flags = rocblas_gemm_flags_none;
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocTypeA *>(a);
-            auto b_ = reinterpret_cast<const rocTypeB *>(b);
-            auto c_ = reinterpret_cast<rocTypeC *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(rocblas_gemm_strided_batched_ex, err, handle,
-                                    get_rocblas_operation(transa), get_rocblas_operation(transb), m,
-                                    n, k, &alpha, a_, get_rocblas_datatype<rocTypeA>(), lda,
-                                    stridea, b_, get_rocblas_datatype<rocTypeB>(), ldb, strideb,
-                                    &beta, c_, get_rocblas_datatype<rocTypeC>(), ldc, stridec, c_,
-                                    get_rocblas_datatype<rocTypeC>(), ldc, stridec, batch_size,
-                                    get_rocblas_datatype<rocTypeS>(), rocblas_gemm_algo_standard,
-                                    solution_index, flags);
-        });
-    });
-
-    return done;
-}
-
-#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                            \
-    sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,      \
-                           int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda,       \
-                           int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb,         \
-                           TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec,                   \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {     \
-        return gemm_batch_strided_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, \
-                                           b, ldb, strideb, beta, c, ldc, stridec, batch_size,     \
-                                           dependencies);                                          \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                                std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                                std::complex<double>)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER_USM
-
-#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                        \
-    sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,  \
-                           int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda,   \
-                           int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb,     \
-                           TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec,               \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) { \
-        throw unimplemented("blas", "gemm_batch",                                              \
-                            std::string("for dtype unimplemented dtype combination <") +       \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +  \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");  \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, transpose *transb,
-                                       int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a,
-                                       int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c,
-                                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                                       const std::vector<sycl::event> &dependencies) {
-    using rocTypeA = typename RocEquivalentType<Ta>::Type;
-    using rocTypeB = typename RocEquivalentType<Tb>::Type;
-    using rocTypeC = typename RocEquivalentType<Tc>::Type;
-    using rocTypeS = typename RocEquivalentType<Ts>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(m[i], n[i], k[i], lda[i], ldb[i], ldc[i], group_size[i]);
-    }
-
-    int32_t solution_index = 0;
-    rocblas_gemm_flags flags = rocblas_gemm_flags_none;
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            int64_t offset = 0;
-            rocblas_status err;
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<const rocTypeA **>(a);
-                auto **b_ = reinterpret_cast<const rocTypeB **>(b);
-                auto **c_ = reinterpret_cast<rocTypeC **>(c);
-                ROCBLAS_ERROR_FUNC_SYNC(
-                    rocblas_gemm_batched_ex, err, handle, get_rocblas_operation(transa[i]),
-                    get_rocblas_operation(transb[i]), (int)m[i], (int)n[i], (int)k[i], &alpha[i],
-                    a_ + offset, get_rocblas_datatype<rocTypeA>(), (int)lda[i], b_ + offset,
-                    get_rocblas_datatype<rocTypeB>(), (int)ldb[i], &beta[i], c_ + offset,
-                    get_rocblas_datatype<rocTypeC>(), (int)ldc[i], c_ + offset,
-                    get_rocblas_datatype<rocTypeC>(), (int)ldc[i], (int)group_size[i],
-                    get_rocblas_datatype<rocTypeS>(), rocblas_gemm_algo_standard, solution_index,
-                    flags);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                                    \
-    sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,   \
-                           int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda,  \
-                           const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \
-                                   ldc, group_count, group_size, dependencies);                    \
-    }
-
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                        std::complex<float>)
-GEMM_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                        std::complex<double>)
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-
-#undef GEMM_BATCH_LAUNCHER_USM
-
-#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                                    \
-    sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,   \
-                           int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda,  \
-                           const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        throw unimplemented("blas", "gemm_batch",                                                  \
-                            std::string("for dtype unimplemented dtype combination <") +           \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +      \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");      \
-    }
-
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                              const T *a, int64_t lda, int64_t stridea, T *b, int64_t ldb,
-                              int64_t strideb, int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, stridea, strideb, batch_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<rocDataType *>(b);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, stridea, b_, ldb, strideb,
-                                    batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define TRSM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                     \
-    sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \
-                           diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a,        \
-                           int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, int64_t strideb,    \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {     \
-        return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \
-                          alpha, a, lda, stridea, b, ldb, strideb, batch_size, dependencies);      \
-    }
-
-TRSM_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_strsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dtrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ctrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_ztrsm_strided_batched)
-
-#undef TRSM_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm_batch(Func func, sycl::queue &queue, side *left_right, uplo *upper_lower,
-                              transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, T *alpha,
-                              const T **a, int64_t *lda, T **b, int64_t *ldb, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]);
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            rocblas_status err;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<const rocDataType **>(a);
-                auto **b_ = reinterpret_cast<rocDataType **>(b);
-                ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right[i]),
-                                        get_rocblas_fill_mode(upper_lower[i]),
-                                        get_rocblas_operation(trans[i]),
-                                        get_rocblas_diag_type(unit_diag[i]), (int)m[i], (int)n[i],
-                                        (rocDataType *)&alpha[i], a_ + offset, (int)lda[i],
-                                        b_ + offset, (int)ldb[i], (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define TRSM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,                \
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \
-                           const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb,                   \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \
-                          alpha, a, lda, b, ldb, group_count, group_size, dependencies);           \
-    }
-
-TRSM_BATCH_LAUNCHER_USM(float, rocblas_strsm_batched)
-TRSM_BATCH_LAUNCHER_USM(double, rocblas_dtrsm_batched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ctrsm_batched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_ztrsm_batched)
-
-#undef TRSM_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo *upper_lower, transpose *trans,
-                              int64_t *n, int64_t *k, T *alpha, const T **a, int64_t *lda, T *beta,
-                              T **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(n[i], k[i], lda[i], ldc[i], group_size[i]);
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            rocblas_status err;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<const rocDataType **>(a);
-                auto **c_ = reinterpret_cast<rocDataType **>(c);
-                ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower[i]),
-                                        get_rocblas_operation(trans[i]), (int)n[i], (int)k[i],
-                                        (rocDataType *)&alpha[i], a_ + offset, (int)lda[i],
-                                        (rocDataType *)&beta[i], c_ + offset, (int)ldc[i],
-                                        (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define SYRK_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                           \
-    sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,  \
-                           int64_t *k, TYPE *alpha, const TYPE **a, int64_t *lda, TYPE *beta,    \
-                           TYPE **c, int64_t *ldc, int64_t group_count, int64_t *group_size,     \
-                           const std::vector<sycl::event> &dependencies) {                       \
-        return syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, \
-                          c, ldc, group_count, group_size, dependencies);                        \
-    }
-
-SYRK_BATCH_LAUNCHER_USM(float, rocblas_ssyrk_batched)
-SYRK_BATCH_LAUNCHER_USM(double, rocblas_dsyrk_batched)
-SYRK_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_csyrk_batched)
-SYRK_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zsyrk_batched)
-
-#undef SYRK_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                              int64_t n, int64_t k, const T alpha, const T *a, int64_t lda,
-                              int64_t stridea, const T beta, T *c, int64_t ldc, int64_t stridec,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldc, stridea, stridec, batch_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, stridea, (rocDataType *)&beta, c_, ldc, stridec,
-                                    batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define SYRK_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                               \
-    sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, \
-                           int64_t k, const TYPE alpha, const TYPE *a, int64_t lda,          \
-                           int64_t stridea, const TYPE beta, TYPE *c, int64_t ldc,           \
-                           int64_t stridec, int64_t batch_size,                              \
-                           const std::vector<sycl::event> &dependencies) {                   \
-        return syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda,   \
-                          stridea, beta, c, ldc, stridec, batch_size, dependencies);         \
-    }
-
-SYRK_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_ssyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dsyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_csyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zsyrk_strided_batched)
-
-#undef SYRK_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m,
-                                  int64_t n, const T alpha, const T *a, int64_t lda,
-                                  int64_t stridea, T *b, int64_t ldb, int64_t strideb,
-                                  int64_t batch_size,
-                                  const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, stridea, strideb, batch_size);
-
-    const T beta = 0;
-    const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n;
-    const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? n : m;
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<rocDataType *>(b);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans),
-                                    get_rocblas_operation(trans), new_m, new_n,
-                                    (rocDataType *)&alpha, a_, lda, stridea, (rocDataType *)&beta,
-                                    nullptr, lda, stridea, b_, ldb, strideb, batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,          \
-                               const TYPE alpha, const TYPE *a, int64_t lda, int64_t stridea,      \
-                               TYPE *b, int64_t ldb, int64_t strideb, int64_t batch_size,          \
-                               const std::vector<sycl::event> &dependencies) {                     \
-        return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, \
-                              strideb, batch_size, dependencies);                                  \
-    }
-
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATCOPY_STRIDED_BATCH_LAUNCHER_USM
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                                 int64_t m, int64_t n, const T alpha, const T *a, int64_t lda,
-                                 int64_t stridea, const T beta, const T *b, int64_t ldb,
-                                 int64_t strideb, T *c, int64_t ldc, int64_t stridec,
-                                 int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc, stridea, strideb, stridec, batch_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<const rocDataType *>(b);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_,
-                                    lda, stridea, (rocDataType *)&beta, b_, ldb, strideb, c_, ldc,
-                                    stridec, batch_size);
-        });
-    });
-
-    return done;
-}
-
-#define OMATADD_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                  \
-    sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,   \
-                              int64_t n, const TYPE alpha, const TYPE *a, int64_t lda,             \
-                              int64_t stridea, const TYPE beta, const TYPE *b, int64_t ldb,        \
-                              int64_t strideb, TYPE *c, int64_t ldc, int64_t stridec,              \
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {  \
-        return omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, \
-                             beta, b, ldb, strideb, c, ldc, stridec, batch_size, dependencies);    \
-    }
-
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATADD_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m,
-                                  int64_t *n, T *alpha, const T **a, int64_t *lda, T **b,
-                                  int64_t *ldb, int64_t group_count, int64_t *group_size,
-                                  const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]);
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            rocblas_status err;
-
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<const rocDataType **>(a);
-                auto **b_ = reinterpret_cast<rocDataType **>(b);
-
-                const T beta = 0;
-                const auto new_m = trans[i] == oneapi::mkl::transpose::nontrans ? m[i] : n[i];
-                const auto new_n = trans[i] == oneapi::mkl::transpose::nontrans ? n[i] : m[i];
-
-                ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans[i]),
-                                        get_rocblas_operation(trans[i]), (int)new_m, (int)new_n,
-                                        (rocDataType *)&alpha[i], a_ + offset, (int)lda[i],
-                                        (rocDataType *)&beta, nullptr, (int)lda[i], b_ + offset,
-                                        (int)ldb[i], (int)group_size[i]);
-                offset += group_size[i];
-            }
-        });
-    });
-
-    return done;
-}
-
-#define OMATCOPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                        \
-    sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,      \
-                               TYPE *alpha, const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \
-                               int64_t group_count, int64_t *group_size,                          \
-                               const std::vector<sycl::event> &dependencies) {                    \
-        return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb,         \
-                              group_count, group_size, dependencies);                             \
-    }
-
-OMATCOPY_BATCH_LAUNCHER_USM(float, rocblas_sgeam_batched)
-OMATCOPY_BATCH_LAUNCHER_USM(double, rocblas_dgeam_batched)
-OMATCOPY_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgeam_batched)
-OMATCOPY_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgeam_batched)
-
-#undef OMATCOPY_BATCH_LAUNCHER_USM
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           float *alpha, float **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           double *alpha, double **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, std::complex<float> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, std::complex<double> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for column_major layout");
-}
-
-} // namespace column_major
-
-namespace row_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void copy_batch(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x,
-                       int64_t incx, int64_t stridex, sycl::buffer<T, 1> &y, int64_t incy,
-                       int64_t stridey, int64_t batch_size) {
-    column_major::copy_batch(func, queue, n, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-#define COPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                     \
-    void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx,     \
-                    int64_t stridex, sycl::buffer<TYPE, 1> &y, int64_t incy, int64_t stridey,  \
-                    int64_t batch_size) {                                                      \
-        copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey, batch_size); \
-    }
-
-COPY_STRIDED_BATCH_LAUNCHER(float, rocblas_scopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER(double, rocblas_dcopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_ccopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zcopy_strided_batched)
-
-#undef COPY_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer<T, 1> &x,
-                       int64_t incx, int64_t stridex, sycl::buffer<T, 1> &y, int64_t incy,
-                       int64_t stridey, int64_t batch_size) {
-    column_major::axpy_batch(func, queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size);
-}
-
-#define AXPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                 \
-    void axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x,   \
-                    int64_t incx, int64_t stridex, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-                    int64_t stridey, int64_t batch_size) {                                 \
-        axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, incy, stridey,   \
-                   batch_size);                                                            \
-    }
-
-AXPY_STRIDED_BATCH_LAUNCHER(float, rocblas_saxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER(double, rocblas_daxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_caxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zaxpy_strided_batched)
-
-#undef AXPY_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                       std::complex<T> alpha, sycl::buffer<std::complex<T>, 1> &a, int64_t lda,
-                       int64_t stridea, sycl::buffer<std::complex<T>, 1> &x, int64_t incx,
-                       int64_t stridex, std::complex<T> beta, sycl::buffer<std::complex<T>, 1> &y,
-                       int64_t incy, int64_t stridey, int64_t batch_size) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        alpha = std::conj(alpha);
-        beta = std::conj(beta);
-
-        if (m > 0) {
-            queue.submit(
-                [&](sycl::handler &cgh) { conj_vector(cgh, x, m, incx, stridex, batch_size); });
-
-            if (n > 0) {
-                queue.submit(
-                    [&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); });
-            }
-        }
-    }
-
-    column_major::gemv_batch(func, queue, new_trans, n, m, alpha, a, lda, stridea, x, incx, stridex,
-                             beta, y, incy, stridey, batch_size);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit(
-                [&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                       T alpha, sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea,
-                       sycl::buffer<T, 1> &x, int64_t incx, int64_t stridex, T beta,
-                       sycl::buffer<T, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::gemv_batch(func, queue, new_trans, n, m, alpha, a, lda, stridea, x, incx, stridex,
-                             beta, y, incy, stridey, batch_size);
-}
-
-#define GEMV_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,         \
-                    sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                        \
-                    sycl::buffer<TYPE, 1> &x, int64_t incx, int64_t stridex, TYPE beta,            \
-                    sycl::buffer<TYPE, 1> &y, int64_t incy, int64_t stridey, int64_t batch_size) { \
-        gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex,  \
-                   beta, y, incy, stridey, batch_size);                                            \
-    }
-
-GEMV_STRIDED_BATCH_LAUNCHER(float, rocblas_sgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER(double, rocblas_dgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zgemv_strided_batched)
-
-#undef GEMV_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                       sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea, sycl::buffer<T, 1> &x,
-                       int64_t incx, int64_t stridex, sycl::buffer<T, 1> &c, int64_t ldc,
-                       int64_t stridec, int64_t batch_size) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-
-    column_major::dgmm_batch(func, queue, new_side, n, m, a, lda, stridea, x, incx, stridex, c, ldc,
-                             stridec, batch_size);
-}
-
-#define DGMM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,                     \
-                    sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                        \
-                    sycl::buffer<TYPE, 1> &x, int64_t incx, int64_t stridex,                       \
-                    sycl::buffer<TYPE, 1> &c, int64_t ldc, int64_t stridec, int64_t batch_size) {  \
-        dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, \
-                   ldc, stridec, batch_size);                                                      \
-    }
-
-DGMM_STRIDED_BATCH_LAUNCHER(float, rocblas_sdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER(double, rocblas_ddgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zdgmm_strided_batched)
-
-#undef DGMM_STRIDED_BATCH_LAUNCHER
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                            int64_t n, int64_t k, Ts alpha, sycl::buffer<Ta, 1> &a, int64_t lda,
-                            int64_t stridea, sycl::buffer<Tb, 1> &b, int64_t ldb, int64_t strideb,
-                            Ts beta, sycl::buffer<Tc, 1> &c, int64_t ldc, int64_t stridec,
-                            int64_t batch_size) {
-    auto new_transa = transb;
-    auto new_transb = transa;
-
-    column_major::gemm_batch(queue, new_transa, new_transb, n, m, k, alpha, b, ldb, strideb, a, lda,
-                             stridea, beta, c, ldc, stridec, batch_size);
-}
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                               \
-    void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                    int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,             \
-                    int64_t stridea, sycl::buffer<TYPE_B, 1> &b, int64_t ldb, int64_t strideb,    \
-                    TYPE_S beta, sycl::buffer<TYPE_C, 1> &c, int64_t ldc, int64_t stridec,        \
-                    int64_t batch_size) {                                                         \
-        gemm_batch_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb,  \
-                        beta, c, ldc, stridec, batch_size);                                       \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<float>, std::complex<float>, std::complex<float>,
-                            std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER(std::complex<double>, std::complex<double>, std::complex<double>,
-                            std::complex<double>)
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-
-#define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                               \
-    void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                    int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,             \
-                    int64_t stridea, sycl::buffer<TYPE_B, 1> &b, int64_t ldb, int64_t strideb,    \
-                    TYPE_S beta, sycl::buffer<TYPE_C, 1> &c, int64_t ldc, int64_t stridec,        \
-                    int64_t batch_size) {                                                         \
-        throw unimplemented("blas", "gemm_batch",                                                 \
-                            std::string("for dtype unimplemented dtype combination <") +          \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +     \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");     \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                       transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                       sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea, sycl::buffer<T, 1> &b,
-                       int64_t ldb, int64_t strideb, int64_t batch_size) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::trsm_batch(func, queue, new_side, new_uplo, trans, unit_diag, n, m, alpha, a, lda,
-                             stridea, b, ldb, strideb, batch_size);
-}
-
-#define TRSM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,        \
-                    diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,    \
-                    int64_t lda, int64_t stridea, sycl::buffer<TYPE, 1> &b, int64_t ldb,           \
-                    int64_t strideb, int64_t batch_size) {                                         \
-        trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \
-                   a, lda, stridea, b, ldb, strideb, batch_size);                                  \
-    }
-
-TRSM_STRIDED_BATCH_LAUNCHER(float, rocblas_strsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER(double, rocblas_dtrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_ctrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_ztrsm_strided_batched)
-
-#undef TRSM_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                       int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea,
-                       T beta, sycl::buffer<T, 1> &c, int64_t ldc, int64_t stridec,
-                       int64_t batch_size) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::syrk_batch(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, stridea, beta,
-                             c, ldc, stridec, batch_size);
-}
-
-#define SYRK_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,   \
-                    TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea, TYPE beta, \
-                    sycl::buffer<TYPE, 1> &c, int64_t ldc, int64_t stridec, int64_t batch_size) {  \
-        syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, stridea, beta, \
-                   c, ldc, stridec, batch_size);                                                   \
-    }
-
-SYRK_STRIDED_BATCH_LAUNCHER(float, rocblas_ssyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER(double, rocblas_dsyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_csyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zsyrk_strided_batched)
-
-#undef SYRK_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           const T alpha, sycl::buffer<T, 1> &a, int64_t lda, int64_t stridea,
-                           sycl::buffer<T, 1> &b, int64_t ldb, int64_t strideb,
-                           int64_t batch_size) {
-    return column_major::omatcopy_batch(func, queue, trans, n, m, alpha, a, lda, stridea, b, ldb,
-                                        strideb, batch_size);
-}
-
-#define OMATCOPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                    \
-    void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,                \
-                        const TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea, \
-                        sycl::buffer<TYPE, 1> &b, int64_t ldb, int64_t strideb,                   \
-                        int64_t batch_size) {                                                     \
-        omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb,       \
-                       strideb, batch_size);                                                      \
-    }
-
-OMATCOPY_STRIDED_BATCH_LAUNCHER(float, rocblas_sgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER(double, rocblas_dgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATCOPY_STRIDED_BATCH_LAUNCHER
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                    sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                    sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb, int64_t stride,
-                    int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                    std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-                    int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline void omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                          int64_t m, int64_t n, const T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                          int64_t stridea, const T beta, sycl::buffer<T, 1> &b, int64_t ldb,
-                          int64_t strideb, sycl::buffer<T, 1> &c, int64_t ldc, int64_t stridec,
-                          int64_t batch_size) {
-    return column_major::omatadd_batch(func, queue, transa, transb, n, m, alpha, a, lda, stridea,
-                                       beta, b, ldb, strideb, c, ldc, stridec, batch_size);
-}
-
-#define OMATADD_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                     \
-    void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,         \
-                       int64_t n, const TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda,        \
-                       int64_t stridea, const TYPE beta, sycl::buffer<TYPE, 1> &b, int64_t ldb,   \
-                       int64_t strideb, sycl::buffer<TYPE, 1> &c, int64_t ldc, int64_t stridec,   \
-                       int64_t batch_size) {                                                      \
-        omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, beta, \
-                      b, ldb, strideb, c, ldc, stridec, batch_size);                              \
-    }
-
-OMATADD_STRIDED_BATCH_LAUNCHER(float, rocblas_sgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER(double, rocblas_dgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATADD_STRIDED_BATCH_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t *n, const T **x, int64_t *incx,
-                              T **y, int64_t *incy, int64_t group_count, int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies) {
-    return column_major::copy_batch(func, queue, n, x, incx, y, incy, group_count, group_size,
-                                    dependencies);
-}
-
-#define COPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                          \
-    sycl::event copy_batch(sycl::queue &queue, int64_t *n, const TYPE **x, int64_t *incx,       \
-                           TYPE **y, int64_t *incy, int64_t group_count, int64_t *group_size,   \
-                           const std::vector<sycl::event> &dependencies) {                      \
-        return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, group_count, group_size, \
-                          dependencies);                                                        \
-    }
-
-COPY_BATCH_LAUNCHER_USM(float, rocblas_scopy_batched)
-COPY_BATCH_LAUNCHER_USM(double, rocblas_dcopy_batched)
-COPY_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ccopy_batched)
-COPY_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zcopy_batched)
-
-#undef COPY_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx,
-                              int64_t stridex, T *y, int64_t incy, int64_t stridey,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return column_major::copy_batch(func, queue, n, x, incx, stridex, y, incy, stridey, batch_size,
-                                    dependencies);
-}
-
-#define COPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event copy_batch(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx,         \
-                           int64_t stridex, TYPE *y, int64_t incy, int64_t stridey,            \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) { \
-        return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey,       \
-                          batch_size, dependencies);                                           \
-    }
-
-COPY_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_scopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dcopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ccopy_strided_batched)
-COPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zcopy_strided_batched)
-
-#undef COPY_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t *n, T *alpha, const T **x,
-                              int64_t *incx, T **y, int64_t *incy, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return column_major::axpy_batch(func, queue, n, alpha, x, incx, y, incy, group_count,
-                                    group_size, dependencies);
-}
-
-#define AXPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                          \
-    sycl::event axpy_batch(sycl::queue &queue, int64_t *n, TYPE *alpha, const TYPE **x,         \
-                           int64_t *incx, TYPE **y, int64_t *incy, int64_t group_count,         \
-                           int64_t *group_size, const std::vector<sycl::event> &dependencies) { \
-        return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, group_count,      \
-                          group_size, dependencies);                                            \
-    }
-
-AXPY_BATCH_LAUNCHER_USM(float, rocblas_saxpy_batched)
-AXPY_BATCH_LAUNCHER_USM(double, rocblas_daxpy_batched)
-AXPY_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_caxpy_batched)
-AXPY_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zaxpy_batched)
-
-#undef AXPY_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x,
-                              int64_t incx, int64_t stridex, T *y, int64_t incy, int64_t stridey,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return column_major::axpy_batch(func, queue, n, alpha, x, incx, stridex, y, incy, stridey,
-                                    batch_size, dependencies);
-}
-
-#define AXPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                     \
-    sycl::event axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \
-                           int64_t stridex, TYPE *y, int64_t incy, int64_t stridey,                \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {     \
-        return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, incy, stridey,    \
-                          batch_size, dependencies);                                               \
-    }
-
-AXPY_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_saxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_daxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_caxpy_strided_batched)
-AXPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zaxpy_strided_batched)
-
-#undef AXPY_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                              std::complex<T> alpha, const std::complex<T> *a, int64_t lda,
-                              int64_t stridea, const std::complex<T> *x, int64_t incx,
-                              int64_t stridex, std::complex<T> beta, std::complex<T> *y,
-                              int64_t incy, int64_t stridey, int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        alpha = std::conj(alpha);
-        beta = std::conj(beta);
-
-        if (m > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                conj_vector(cgh, (std::complex<T> *)x, m, incx, stridex, batch_size);
-            });
-
-            if (n > 0) {
-                done = queue.submit(
-                    [&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); });
-            }
-        }
-    }
-
-    done.wait_and_throw();
-
-    done = column_major::gemv_batch(func, queue, new_trans, n, m, alpha, a, lda, stridea, x, incx,
-                                    stridex, beta, y, incy, stridey, batch_size, dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, y, n, incy, stridey, batch_size);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                              T alpha, const T *a, int64_t lda, int64_t stridea, const T *x,
-                              int64_t incx, int64_t stridex, T beta, T *y, int64_t incy,
-                              int64_t stridey, int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::gemv_batch(func, queue, new_trans, n, m, alpha, a, lda, stridea, x, incx,
-                                    stridex, beta, y, incy, stridey, batch_size, dependencies);
-}
-
-#define GEMV_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                    \
-    sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \
-                           const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x,            \
-                           int64_t incx, int64_t stridex, TYPE beta, TYPE *y, int64_t incy,       \
-                           int64_t stridey, int64_t batch_size,                                   \
-                           const std::vector<sycl::event> &dependencies) {                        \
-        return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx,   \
-                          stridex, beta, y, incy, stridey, batch_size, dependencies);             \
-    }
-
-GEMV_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgemv_strided_batched)
-GEMV_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgemv_strided_batched)
-
-#undef GEMV_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m,
-                              int64_t *n, std::complex<T> *alpha, const std::complex<T> **a,
-                              int64_t *lda, const std::complex<T> **x, int64_t *incx,
-                              std::complex<T> *beta, std::complex<T> **y, int64_t *incy,
-                              int64_t group_count, int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    int64_t stride = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        if (trans[i] == oneapi::mkl::transpose::conjtrans) {
-            alpha[i] = std::conj(alpha[i]);
-            beta[i] = std::conj(beta[i]);
-
-            if (m[i] > 0) {
-                done = queue.submit([&](sycl::handler &cgh) {
-                    conj_vector(cgh, (std::complex<T> **)x, m[i], incx[i], stride, group_size[i]);
-                });
-
-                if (n[i] > 0) {
-                    done = queue.submit([&](sycl::handler &cgh) {
-                        conj_vector(cgh, y, n[i], incy[i], stride, group_size[i]);
-                    });
-                }
-            }
-        }
-        stride += group_size[i];
-    }
-
-    done.wait_and_throw();
-
-    auto tmp_trans = std::vector<transpose>{ (std::size_t)group_count };
-    for (int64_t i = 0; i < group_count; i++) {
-        const auto new_trans = trans[i] == oneapi::mkl::transpose::nontrans
-                                   ? oneapi::mkl::transpose::trans
-                                   : oneapi::mkl::transpose::nontrans;
-        tmp_trans[i] = trans[i];
-        trans[i] = new_trans;
-    }
-    done = column_major::gemv_batch(func, queue, trans, n, m, alpha, a, lda, x, incx, beta, y, incy,
-                                    group_count, group_size, dependencies);
-    done.wait_and_throw();
-    for (int64_t i = 0; i < group_count; i++) {
-        trans[i] = tmp_trans[i];
-    }
-
-    stride = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        if (trans[i] == oneapi::mkl::transpose::conjtrans) {
-            if (n[i] > 0) {
-                done = queue.submit([&](sycl::handler &cgh) {
-                    conj_vector(cgh, y, n[i], incy[i], stride, group_size[i]);
-                });
-            }
-        }
-        stride += group_size[i];
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m,
-                              int64_t *n, T *alpha, const T **a, int64_t *lda, const T **x,
-                              int64_t *incx, T *beta, T **y, int64_t *incy, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    auto tmp_trans = std::vector<transpose>{ static_cast<std::size_t>(group_count) };
-
-    for (int64_t i = 0; i < group_count; i++) {
-        const auto new_trans = trans[i] == oneapi::mkl::transpose::nontrans
-                                   ? oneapi::mkl::transpose::trans
-                                   : oneapi::mkl::transpose::nontrans;
-        tmp_trans[i] = trans[i];
-        trans[i] = new_trans;
-    }
-    auto done = column_major::gemv_batch(func, queue, trans, n, m, alpha, a, lda, x, incx, beta, y,
-                                         incy, group_count, group_size, dependencies);
-    done.wait_and_throw();
-    for (int64_t i = 0; i < group_count; i++) {
-        trans[i] = tmp_trans[i];
-    }
-
-    return done;
-}
-
-#define GEMV_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event gemv_batch(                                                                        \
-        sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, TYPE *alpha, const TYPE **a, \
-        int64_t *lda, const TYPE **x, int64_t *incx, TYPE *beta, TYPE **y, int64_t *incy,          \
-        int64_t group_count, int64_t *group_size, const std::vector<sycl::event> &dependencies) {  \
-        return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y,    \
-                          incy, group_count, group_size, dependencies);                            \
-    }
-
-GEMV_BATCH_LAUNCHER_USM(float, rocblas_sgemv_batched)
-GEMV_BATCH_LAUNCHER_USM(double, rocblas_dgemv_batched)
-GEMV_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgemv_batched)
-GEMV_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgemv_batched)
-
-#undef GEMV_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m, int64_t n,
-                              const T *a, int64_t lda, int64_t stridea, const T *x, int64_t incx,
-                              int64_t stridex, T *c, int64_t ldc, int64_t stridec,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-
-    return column_major::dgmm_batch(func, queue, new_side, n, m, a, lda, stridea, x, incx, stridex,
-                                    c, ldc, stridec, batch_size, dependencies);
-}
-
-#define DGMM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                   \
-    sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n,            \
-                           const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x,           \
-                           int64_t incx, int64_t stridex, TYPE *c, int64_t ldc, int64_t stridec, \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {   \
-        return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, stridea, x, incx,    \
-                          stridex, c, ldc, stridec, batch_size, dependencies);                   \
-    }
-
-DGMM_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_ddgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cdgmm_strided_batched)
-DGMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zdgmm_strided_batched)
-
-#undef DGMM_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side *left_right, int64_t *m,
-                              int64_t *n, const T **a, int64_t *lda, const T **x, int64_t *incx,
-                              T **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies) {
-    for (int64_t i = 0; i < group_count; i++) {
-        const auto new_side = left_right[i] == oneapi::mkl::side::left ? oneapi::mkl::side::right
-                                                                       : oneapi::mkl::side::left;
-        left_right[i] = new_side;
-    }
-
-    return column_major::dgmm_batch(func, queue, left_right, n, m, a, lda, x, incx, c, ldc,
-                                    group_count, group_size, dependencies);
-}
-
-#define DGMM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                            \
-    sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n,          \
-                           const TYPE **a, int64_t *lda, const TYPE **x, int64_t *incx, TYPE **c, \
-                           int64_t *ldc, int64_t group_count, int64_t *group_size,                \
-                           const std::vector<sycl::event> &dependencies) {                        \
-        return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, x, incx, c, ldc,      \
-                          group_count, group_size, dependencies);                                 \
-    }
-
-DGMM_BATCH_LAUNCHER_USM(float, rocblas_sdgmm_batched)
-DGMM_BATCH_LAUNCHER_USM(double, rocblas_ddgmm_batched)
-DGMM_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cdgmm_batched)
-DGMM_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zdgmm_batched)
-
-#undef DGMM_BATCH_LAUNCHER
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose transa,
-                                               transpose transb, int64_t m, int64_t n, int64_t k,
-                                               Ts alpha, const Ta *a, int64_t lda, int64_t stridea,
-                                               const Tb *b, int64_t ldb, int64_t strideb, Ts beta,
-                                               Tc *c, int64_t ldc, int64_t stridec,
-                                               int64_t batch_size,
-                                               const std::vector<sycl::event> &dependencies) {
-    auto new_transa = transb;
-    auto new_transb = transa;
-
-    return column_major::gemm_batch(queue, new_transa, new_transb, n, m, k, alpha, b, ldb, strideb,
-                                    a, lda, stridea, beta, c, ldc, stridec, batch_size,
-                                    dependencies);
-}
-
-#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                            \
-    sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,      \
-                           int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda,       \
-                           int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb,         \
-                           TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec,                   \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {     \
-        return gemm_batch_strided_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, \
-                                           b, ldb, strideb, beta, c, ldc, stridec, batch_size,     \
-                                           dependencies);                                          \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                                std::complex<float>)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                                std::complex<double>)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER_USM
-
-#define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                        \
-    sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,  \
-                           int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda,   \
-                           int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb,     \
-                           TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec,               \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) { \
-        throw unimplemented("blas", "gemm_batch",                                              \
-                            std::string("for dtype unimplemented dtype combination <") +       \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +  \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");  \
-    }
-
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, transpose *transb,
-                                       int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a,
-                                       int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c,
-                                       int64_t *ldc, int64_t group_count, int64_t *group_size,
-                                       const std::vector<sycl::event> &dependencies) {
-    for (int64_t i = 0; i < group_count; i++) {
-        std::swap(transa[i], transb[i]);
-    }
-
-    return column_major::gemm_batch(queue, transa, transb, n, m, k, alpha, b, ldb, a, lda, beta, c,
-                                    ldc, group_count, group_size, dependencies);
-}
-
-#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                                    \
-    sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,   \
-                           int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda,  \
-                           const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \
-                                   ldc, group_count, group_size, dependencies);                    \
-    }
-
-GEMM_BATCH_LAUNCHER_USM(float, float, float, float)
-GEMM_BATCH_LAUNCHER_USM(double, double, double, double)
-GEMM_BATCH_LAUNCHER_USM(std::complex<float>, std::complex<float>, std::complex<float>,
-                        std::complex<float>)
-GEMM_BATCH_LAUNCHER_USM(std::complex<double>, std::complex<double>, std::complex<double>,
-                        std::complex<double>)
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half)
-GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float)
-
-#undef GEMM_BATCH_LAUNCHER_USM
-
-#define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S)                                    \
-    sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m,   \
-                           int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda,  \
-                           const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        throw unimplemented("blas", "gemm_batch",                                                  \
-                            std::string("for dtype unimplemented dtype combination <") +           \
-                                dtype_string<TYPE_A>() + "," + dtype_string<TYPE_B>() + "," +      \
-                                dtype_string<TYPE_C>() + "," + dtype_string<TYPE_S>() + ">");      \
-    }
-
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, float, float)
-GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float)
-
-#undef GEMM_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                              transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha,
-                              const T *a, int64_t lda, int64_t stridea, T *b, int64_t ldb,
-                              int64_t strideb, int64_t batch_size,
-                              const std::vector<sycl::event> &dependencies) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::trsm_batch(func, queue, new_side, new_uplo, trans, unit_diag, n, m, alpha,
-                                    a, lda, stridea, b, ldb, strideb, batch_size, dependencies);
-}
-
-#define TRSM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                     \
-    sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \
-                           diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a,        \
-                           int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, int64_t strideb,    \
-                           int64_t batch_size, const std::vector<sycl::event> &dependencies) {     \
-        return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \
-                          alpha, a, lda, stridea, b, ldb, strideb, batch_size, dependencies);      \
-    }
-
-TRSM_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_strsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dtrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ctrsm_strided_batched)
-TRSM_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_ztrsm_strided_batched)
-
-#undef TRSM_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm_batch(Func func, sycl::queue &queue, side *left_right, uplo *upper_lower,
-                              transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, T *alpha,
-                              const T **a, int64_t *lda, T **b, int64_t *ldb, int64_t group_count,
-                              int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    for (int64_t i = 0; i < group_count; i++) {
-        const auto new_side = left_right[i] == oneapi::mkl::side::left ? oneapi::mkl::side::right
-                                                                       : oneapi::mkl::side::left;
-        left_right[i] = new_side;
-
-        const auto new_uplo = upper_lower[i] == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                                         : oneapi::mkl::uplo::lower;
-        upper_lower[i] = new_uplo;
-    }
-
-    return column_major::trsm_batch(func, queue, left_right, upper_lower, trans, unit_diag, n, m,
-                                    alpha, a, lda, b, ldb, group_count, group_size, dependencies);
-}
-
-#define TRSM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower,                \
-                           transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \
-                           const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb,                   \
-                           int64_t group_count, int64_t *group_size,                               \
-                           const std::vector<sycl::event> &dependencies) {                         \
-        return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \
-                          alpha, a, lda, b, ldb, group_count, group_size, dependencies);           \
-    }
-
-TRSM_BATCH_LAUNCHER_USM(float, rocblas_strsm_batched)
-TRSM_BATCH_LAUNCHER_USM(double, rocblas_dtrsm_batched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_ctrsm_batched)
-TRSM_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_ztrsm_batched)
-
-#undef TRSM_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo *upper_lower, transpose *trans,
-                              int64_t *n, int64_t *k, T *alpha, const T **a, int64_t *lda, T *beta,
-                              T **c, int64_t *ldc, int64_t group_count, int64_t *group_size,
-                              const std::vector<sycl::event> &dependencies) {
-    for (int64_t i = 0; i < group_count; i++) {
-        const auto new_uplo = upper_lower[i] == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                                         : oneapi::mkl::uplo::lower;
-        upper_lower[i] = new_uplo;
-
-        const auto new_trans = trans[i] == oneapi::mkl::transpose::nontrans
-                                   ? oneapi::mkl::transpose::trans
-                                   : oneapi::mkl::transpose::nontrans;
-        trans[i] = new_trans;
-    }
-
-    return column_major::syrk_batch(func, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c,
-                                    ldc, group_count, group_size, dependencies);
-}
-
-#define SYRK_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                           \
-    sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n,  \
-                           int64_t *k, TYPE *alpha, const TYPE **a, int64_t *lda, TYPE *beta,    \
-                           TYPE **c, int64_t *ldc, int64_t group_count, int64_t *group_size,     \
-                           const std::vector<sycl::event> &dependencies) {                       \
-        return syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, \
-                          c, ldc, group_count, group_size, dependencies);                        \
-    }
-
-SYRK_BATCH_LAUNCHER_USM(float, rocblas_ssyrk_batched)
-SYRK_BATCH_LAUNCHER_USM(double, rocblas_dsyrk_batched)
-SYRK_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_csyrk_batched)
-SYRK_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zsyrk_batched)
-
-#undef SYRK_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                              int64_t n, int64_t k, const T alpha, const T *a, int64_t lda,
-                              int64_t stridea, const T beta, T *c, int64_t ldc, int64_t stridec,
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::syrk_batch(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, stridea,
-                                    beta, c, ldc, stridec, batch_size, dependencies);
-}
-
-#define SYRK_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                               \
-    sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, \
-                           int64_t k, const TYPE alpha, const TYPE *a, int64_t lda,          \
-                           int64_t stridea, const TYPE beta, TYPE *c, int64_t ldc,           \
-                           int64_t stridec, int64_t batch_size,                              \
-                           const std::vector<sycl::event> &dependencies) {                   \
-        return syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda,   \
-                          stridea, beta, c, ldc, stridec, batch_size, dependencies);         \
-    }
-
-SYRK_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_ssyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dsyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_csyrk_strided_batched)
-SYRK_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zsyrk_strided_batched)
-
-#undef SYRK_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m,
-                                  int64_t n, const T alpha, const T *a, int64_t lda,
-                                  int64_t stridea, T *b, int64_t ldb, int64_t strideb,
-                                  int64_t batch_size,
-                                  const std::vector<sycl::event> &dependencies) {
-    return column_major::omatcopy_batch(func, queue, trans, n, m, alpha, a, lda, stridea, b, ldb,
-                                        strideb, batch_size, dependencies);
-}
-
-#define OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,          \
-                               const TYPE alpha, const TYPE *a, int64_t lda, int64_t stridea,      \
-                               TYPE *b, int64_t ldb, int64_t strideb, int64_t batch_size,          \
-                               const std::vector<sycl::event> &dependencies) {                     \
-        return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, \
-                              strideb, batch_size, dependencies);                                  \
-    }
-
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATCOPY_STRIDED_BATCH_LAUNCHER_USM
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                           float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                           double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<float> alpha, std::complex<float> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                           std::complex<double> alpha, std::complex<double> *ab, int64_t lda,
-                           int64_t ldb, int64_t stride, int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                                 int64_t m, int64_t n, const T alpha, const T *a, int64_t lda,
-                                 int64_t stridea, const T beta, const T *b, int64_t ldb,
-                                 int64_t strideb, T *c, int64_t ldc, int64_t stridec,
-                                 int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return column_major::omatadd_batch(func, queue, transa, transb, n, m, alpha, a, lda, stridea,
-                                       beta, b, ldb, strideb, c, ldc, stridec, batch_size,
-                                       dependencies);
-}
-
-#define OMATADD_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                  \
-    sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m,   \
-                              int64_t n, const TYPE alpha, const TYPE *a, int64_t lda,             \
-                              int64_t stridea, const TYPE beta, const TYPE *b, int64_t ldb,        \
-                              int64_t strideb, TYPE *c, int64_t ldc, int64_t stridec,              \
-                              int64_t batch_size, const std::vector<sycl::event> &dependencies) {  \
-        return omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, \
-                             beta, b, ldb, strideb, c, ldc, stridec, batch_size, dependencies);    \
-    }
-
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(float, rocblas_sgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(double, rocblas_dgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgeam_strided_batched)
-OMATADD_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgeam_strided_batched)
-
-#undef OMATADD_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m,
-                                  int64_t *n, T *alpha, const T **a, int64_t *lda, T **b,
-                                  int64_t *ldb, int64_t group_count, int64_t *group_size,
-                                  const std::vector<sycl::event> &dependencies) {
-    return column_major::omatcopy_batch(func, queue, trans, n, m, alpha, a, lda, b, ldb,
-                                        group_count, group_size, dependencies);
-}
-
-#define OMATCOPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                        \
-    sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,      \
-                               TYPE *alpha, const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \
-                               int64_t group_count, int64_t *group_size,                          \
-                               const std::vector<sycl::event> &dependencies) {                    \
-        return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb,         \
-                              group_count, group_size, dependencies);                             \
-    }
-
-OMATCOPY_BATCH_LAUNCHER_USM(float, rocblas_sgeam_batched)
-OMATCOPY_BATCH_LAUNCHER_USM(double, rocblas_dgeam_batched)
-OMATCOPY_BATCH_LAUNCHER_USM(std::complex<float>, rocblas_cgeam_batched)
-OMATCOPY_BATCH_LAUNCHER_USM(std::complex<double>, rocblas_zgeam_batched)
-
-#undef OMATCOPY_BATCH_LAUNCHER_USM
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           float *alpha, float **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           double *alpha, double **ab, int64_t *lda, int64_t *ldb,
-                           int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<float> *alpha, std::complex<float> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n,
-                           std::complex<double> *alpha, std::complex<double> **ab, int64_t *lda,
-                           int64_t *ldb, int64_t group_count, int64_t *group_size,
-                           const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy_batch", "for row_major layout");
-}
-
-} // namespace row_major
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/rocblas/rocblas_extensions.cpp b/src/blas/backends/rocblas/rocblas_extensions.cpp
deleted file mode 100644
index a1fd1df1c..000000000
--- a/src/blas/backends/rocblas/rocblas_extensions.cpp
+++ /dev/null
@@ -1,716 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include "rocblas_helper.hpp"
-#include "rocblas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-namespace column_major {
-
-// Buffer APIs
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline void omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     const T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                     int64_t ldb) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-
-    const T beta = 0;
-    const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n;
-    const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? n : m;
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans),
-                                    get_rocblas_operation(trans), new_m, new_n,
-                                    (rocDataType *)&alpha, a_, lda, (rocDataType *)&beta, nullptr,
-                                    lda, b_, ldb);
-        });
-    });
-}
-
-#define OMATCOPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, const TYPE alpha,    \
-                  sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) { \
-        omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb);                     \
-    }
-
-OMATCOPY_LAUNCHER(float, rocblas_sgeam)
-OMATCOPY_LAUNCHER(double, rocblas_dgeam)
-OMATCOPY_LAUNCHER(std::complex<float>, rocblas_cgeam)
-OMATCOPY_LAUNCHER(std::complex<double>, rocblas_zgeam)
-
-#undef OMATCOPY_LAUNCHER
-
-template <typename Func, typename T>
-void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-               int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<T, 1> &b, int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                \
-    void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,        \
-                   sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                       \
-                   sycl::buffer<TYPE, 1> &b, int64_t ldb, int64_t strideb) {                     \
-        omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, \
-                  b, ldb, strideb);                                                              \
-    }
-
-OMATCOPY2_LAUNCHER(float, "unimplemented")
-OMATCOPY2_LAUNCHER(double, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<double>, "unimplemented")
-
-#undef OMATCOPY2_LAUNCHER
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline void omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                    int64_t n, const T alpha, sycl::buffer<T, 1> &a, int64_t lda, const T beta,
-                    sycl::buffer<T, 1> &b, int64_t ldb, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_,
-                                    lda, (rocDataType *)&beta, b_, ldb, c_, ldc);
-        });
-    });
-}
-
-#define OMATADD_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,   \
-                 const TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, const TYPE beta,       \
-                 sycl::buffer<TYPE, 1> &b, int64_t ldb, sycl::buffer<TYPE, 1> &c, int64_t ldc) { \
-        omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c,    \
-                ldc);                                                                            \
-    }
-
-OMATADD_LAUNCHER(float, rocblas_sgeam)
-OMATADD_LAUNCHER(double, rocblas_dgeam)
-OMATADD_LAUNCHER(std::complex<float>, rocblas_cgeam)
-OMATADD_LAUNCHER(std::complex<double>, rocblas_zgeam)
-
-#undef OMATADD_LAUNCHER
-
-// USM APIs
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
-                  int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                  int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                            const T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                            const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-
-    const T beta = 0;
-    const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n;
-    const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? n : m;
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<rocDataType *>(b);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans),
-                                    get_rocblas_operation(trans), new_m, new_n,
-                                    (rocDataType *)&alpha, a_, lda, (rocDataType *)&beta, nullptr,
-                                    lda, b_, ldb);
-        });
-    });
-
-    return done;
-}
-
-#define OMATCOPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,                \
-                         const TYPE alpha, const TYPE *a, int64_t lda, TYPE *b, int64_t ldb,       \
-                         const std::vector<sycl::event> &dependencies) {                           \
-        return omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); \
-    }
-
-OMATCOPY_LAUNCHER_USM(float, rocblas_sgeam)
-OMATCOPY_LAUNCHER_USM(double, rocblas_dgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<float>, rocblas_cgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<double>, rocblas_zgeam)
-
-#undef OMATCOPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                      int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b,
-                      int64_t ldb, int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                              \
-    sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,   \
-                          const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb,       \
-                          int64_t strideb, const std::vector<sycl::event> &dependencies) {         \
-        return omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \
-                         lda, b, ldb, strideb, dependencies);                                      \
-    }
-
-OMATCOPY2_LAUNCHER_USM(float, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(double, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<double>, "unimplemented")
-
-#undef OMATCOPY2_LAUNCHER_USM
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for column_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                           int64_t m, int64_t n, const T alpha, const T *a, int64_t lda,
-                           const T beta, const T *b, int64_t ldb, T *c, int64_t ldc,
-                           const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<const rocDataType *>(b);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_,
-                                    lda, (rocDataType *)&beta, b_, ldb, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define OMATADD_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m,        \
-                        int64_t n, const TYPE alpha, const TYPE *a, int64_t lda, const TYPE beta, \
-                        const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc,                         \
-                        const std::vector<sycl::event> &dependencies) {                           \
-        return omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, \
-                       c, ldc, dependencies);                                                     \
-    }
-
-OMATADD_LAUNCHER_USM(float, rocblas_sgeam)
-OMATADD_LAUNCHER_USM(double, rocblas_dgeam)
-OMATADD_LAUNCHER_USM(std::complex<float>, rocblas_cgeam)
-OMATADD_LAUNCHER_USM(std::complex<double>, rocblas_zgeam)
-
-#undef OMATADD_LAUNCHER_USM
-
-} // namespace column_major
-
-namespace row_major {
-
-// Buffer APIs
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<int8_t, 1> &a, int64_t lda,
-               int8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<int8_t, 1> &b, int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m,
-               int64_t n, int64_t k, float alpha, sycl::buffer<uint8_t, 1> &a, int64_t lda,
-               uint8_t ao, sycl::buffer<uint8_t, 1> &b, int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, float alpha, sycl::buffer<float, 1> &a, int64_t lda,
-           sycl::buffer<float, 1> &b, int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, double alpha, sycl::buffer<double, 1> &a, int64_t lda,
-           sycl::buffer<double, 1> &b, int64_t ldb, double beta, sycl::buffer<double, 1> &c,
-           int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<float>, 1> &b, int64_t ldb,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n,
-           int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-           int64_t lda, sycl::buffer<std::complex<double>, 1> &b, int64_t ldb,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, int64_t ldc) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline void omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     const T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                     int64_t ldb) {
-    column_major::omatcopy(func, queue, trans, n, m, alpha, a, lda, b, ldb);
-}
-
-#define OMATCOPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, const TYPE alpha,    \
-                  sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) { \
-        omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb);                     \
-    }
-
-OMATCOPY_LAUNCHER(float, rocblas_sgeam)
-OMATCOPY_LAUNCHER(double, rocblas_dgeam)
-OMATCOPY_LAUNCHER(std::complex<float>, rocblas_cgeam)
-OMATCOPY_LAUNCHER(std::complex<double>, rocblas_zgeam)
-
-#undef OMATCOPY_LAUNCHER
-
-template <typename Func, typename T>
-void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m,
-               int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, std::int64_t stridea,
-               sycl::buffer<T, 1> &b, int64_t ldb, std::int64_t strideb) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                \
-    void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,        \
-                   sycl::buffer<TYPE, 1> &a, int64_t lda, int64_t stridea,                       \
-                   sycl::buffer<TYPE, 1> &b, int64_t ldb, int64_t strideb) {                     \
-        omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, \
-                  b, ldb, strideb);                                                              \
-    }
-
-OMATCOPY2_LAUNCHER(float, "unimplemented")
-OMATCOPY2_LAUNCHER(double, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER(std::complex<double>, "unimplemented")
-
-#undef OMATCOPY2_LAUNCHER
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-              sycl::buffer<float, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-              sycl::buffer<double, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<float> alpha,
-              sycl::buffer<std::complex<float>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex<double> alpha,
-              sycl::buffer<std::complex<double>, 1> &ab, int64_t lda, int64_t ldb) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline void omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                    int64_t n, const T alpha, sycl::buffer<T, 1> &a, int64_t lda, const T beta,
-                    sycl::buffer<T, 1> &b, int64_t ldb, sycl::buffer<T, 1> &c, int64_t ldc) {
-    column_major::omatadd(func, queue, transa, transb, n, m, alpha, a, lda, beta, b, ldb, c, ldc);
-}
-
-#define OMATADD_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,   \
-                 const TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, const TYPE beta,       \
-                 sycl::buffer<TYPE, 1> &b, int64_t ldb, sycl::buffer<TYPE, 1> &c, int64_t ldc) { \
-        omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c,    \
-                ldc);                                                                            \
-    }
-
-OMATADD_LAUNCHER(float, rocblas_sgeam)
-OMATADD_LAUNCHER(double, rocblas_dgeam)
-OMATADD_LAUNCHER(std::complex<float>, rocblas_cgeam)
-OMATADD_LAUNCHER(std::complex<double>, rocblas_zgeam)
-
-#undef OMATADD_LAUNCHER
-
-// USM APIs
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda,
-                      int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc,
-                      int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda,
-                      uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c,
-                      int64_t ldc, const int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemm_bias", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b,
-                  int64_t ldb, float beta, float *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                  int64_t ldb, double beta, double *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                  int64_t lda, const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb,
-                  int64_t n, int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                  int64_t lda, const std::complex<double> *b, int64_t ldb,
-                  std::complex<double> beta, std::complex<double> *c, int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "gemmt", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                            const T alpha, const T *a, int64_t lda, T *b, int64_t ldb,
-                            const std::vector<sycl::event> &dependencies) {
-    return column_major::omatcopy(func, queue, trans, n, m, alpha, a, lda, b, ldb, dependencies);
-}
-
-#define OMATCOPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,                \
-                         const TYPE alpha, const TYPE *a, int64_t lda, TYPE *b, int64_t ldb,       \
-                         const std::vector<sycl::event> &dependencies) {                           \
-        return omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); \
-    }
-
-OMATCOPY_LAUNCHER_USM(float, rocblas_sgeam)
-OMATCOPY_LAUNCHER_USM(double, rocblas_dgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<float>, rocblas_cgeam)
-OMATCOPY_LAUNCHER_USM(std::complex<double>, rocblas_zgeam)
-
-#undef OMATCOPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans,
-                      int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b,
-                      int64_t ldb, int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "omatcopy2", "");
-}
-
-#define OMATCOPY2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                              \
-    sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,   \
-                          const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb,       \
-                          int64_t strideb, const std::vector<sycl::event> &dependencies) {         \
-        return omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \
-                         lda, b, ldb, strideb, dependencies);                                      \
-    }
-
-OMATCOPY2_LAUNCHER_USM(float, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(double, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<float>, "unimplemented")
-OMATCOPY2_LAUNCHER_USM(std::complex<double>, "unimplemented")
-
-#undef OMATCOPY2_LAUNCHER_USM
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha,
-                     float *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha,
-                     double *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<float> alpha, std::complex<float> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                     std::complex<double> alpha, std::complex<double> *ab, int64_t lda, int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "imatcopy", "for row_major layout");
-}
-
-template <typename Func, typename T>
-inline sycl::event omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                           int64_t m, int64_t n, const T alpha, const T *a, int64_t lda,
-                           const T beta, const T *b, int64_t ldb, T *c, int64_t ldc,
-                           const std::vector<sycl::event> &dependencies) {
-    return column_major::omatadd(func, queue, transa, transb, n, m, alpha, a, lda, beta, b, ldb, c,
-                                 ldc, dependencies);
-}
-
-#define OMATADD_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m,        \
-                        int64_t n, const TYPE alpha, const TYPE *a, int64_t lda, const TYPE beta, \
-                        const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc,                         \
-                        const std::vector<sycl::event> &dependencies) {                           \
-        return omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, \
-                       c, ldc, dependencies);                                                     \
-    }
-
-OMATADD_LAUNCHER_USM(float, rocblas_sgeam)
-OMATADD_LAUNCHER_USM(double, rocblas_dgeam)
-OMATADD_LAUNCHER_USM(std::complex<float>, rocblas_cgeam)
-OMATADD_LAUNCHER_USM(std::complex<double>, rocblas_zgeam)
-
-#undef OMATADD_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/rocblas/rocblas_handle.hpp b/src/blas/backends/rocblas/rocblas_handle.hpp
deleted file mode 100644
index 7a8dfe91f..000000000
--- a/src/blas/backends/rocblas/rocblas_handle.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************
-*  Copyright 2020-2022 Intel Corporation
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ROCBLAS_HANDLE_HPP_
-#define _ROCBLAS_HANDLE_HPP_
-#include <atomic>
-#include <unordered_map>
-#include "rocblas_helper.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-
-template <typename T>
-struct rocblas_handle_ {
-    using handle_container_t = std::unordered_map<T, std::atomic<rocblas_handle> *>;
-    handle_container_t rocblas_handle_mapper_{};
-    ~rocblas_handle_() noexcept(false) {
-        for (auto &handle_pair : rocblas_handle_mapper_) {
-            rocblas_status err;
-            if (handle_pair.second != nullptr) {
-                auto handle = handle_pair.second->exchange(nullptr);
-                if (handle != nullptr) {
-                    ROCBLAS_ERROR_FUNC(rocblas_destroy_handle, err, handle);
-                    handle = nullptr;
-                }
-                else {
-                    // if the handle is nullptr it means the handle was already
-                    // destroyed by the ContextCallback and we're free to delete the
-                    // atomic object.
-                    delete handle_pair.second;
-                }
-
-                handle_pair.second = nullptr;
-            }
-        }
-        rocblas_handle_mapper_.clear();
-    }
-};
-
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ROCBLAS_HANDLE_HPP_
diff --git a/src/blas/backends/rocblas/rocblas_helper.hpp b/src/blas/backends/rocblas/rocblas_helper.hpp
deleted file mode 100644
index ae6301a7a..000000000
--- a/src/blas/backends/rocblas/rocblas_helper.hpp
+++ /dev/null
@@ -1,293 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-/**
- * @file rocblas*.cpp : contains the implementation of all the routines
- * for rocBLAS backend
- */
-#ifndef _ROCBLAS_HELPER_HPP_
-
-#define _ROCBLAS_HELPER_HPP_
-
-#include <rocblas/rocblas.h>
-#include <complex>
-#include "oneapi/mkl/types.hpp"
-#include <hip/hip_runtime.h>
-#include "dtype_string.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-
-// The static assert to make sure that all index types used in
-// src/oneMKL/backend/rocblas/blas.hpp interface are int64_t
-template <typename... Next>
-struct is_int64 : std::false_type {};
-
-template <typename First>
-struct is_int64<First> : std::is_same<int64_t, First> {};
-
-template <typename First, typename... Next>
-struct is_int64<First, Next...>
-        : std::integral_constant<bool, std::is_same<int64_t, First>::value &&
-                                           is_int64<Next...>::value> {};
-
-template <typename... T>
-struct Overflow {
-    static void inline check(T...) {}
-};
-
-template <typename Index, typename... T>
-struct Overflow<Index, T...> {
-    static void inline check(Index index, T... next) {
-        if (std::abs(index) >= (1LL << 31)) {
-            throw std::runtime_error(
-                "Rocblas index overflow. rocblas does not support 64 bit integer as "
-                "data size. Thus, the data size should not be greater that maximum "
-                "supported size by 32 bit integer.");
-        }
-        Overflow<T...>::check(next...);
-    }
-};
-
-template <typename Index, typename... Next>
-void overflow_check(Index index, Next... indices) {
-    static_assert(is_int64<Index, Next...>::value, "oneMKL index type must be 64 bit integer.");
-    Overflow<Index, Next...>::check(index, indices...);
-}
-
-class rocblas_error : virtual public std::runtime_error {
-protected:
-    inline const char *rocblas_error_map(rocblas_status error) {
-        switch (error) {
-            case rocblas_status_success: return "rocblas_status_success";
-            case rocblas_status_invalid_handle: return "rocblas_status_invalid_handle";
-            case rocblas_status_not_implemented: return "rocblas_status_not_implemented";
-            case rocblas_status_invalid_pointer: return "rocblas_status_invalid_pointer";
-            case rocblas_status_invalid_size: return "rocblas_status_invalid_size";
-            case rocblas_status_memory_error: return "rocblas_status_memory_error";
-            case rocblas_status_internal_error: return "rocblas_status_internal_error";
-            case rocblas_status_perf_degraded: return "rocblas_status_perf_degraded";
-            case rocblas_status_size_query_mismatch: return "rocblas_status_size_query_mismatch";
-            case rocblas_status_size_increased: return "rocblas_status_size_increased";
-            case rocblas_status_size_unchanged: return "rocblas_status_size_unchanged";
-            case rocblas_status_invalid_value: return "rocblas_status_invalid_value";
-            case rocblas_status_continue: return "rocblas_status_continue";
-            case rocblas_status_check_numerics_fail: return "rocblas_status_check_numerics_fail";
-
-            default: return "<unknown>";
-        }
-    }
-
-    int error_number; ///< Error number
-public:
-    /** Constructor (C++ STL string, rocblas_status).
-   *  @param msg The error message
-   *  @param err_num error number
-   */
-    explicit rocblas_error(std::string message, rocblas_status result)
-            : std::runtime_error((message + std::string(rocblas_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~rocblas_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-class hip_error : virtual public std::runtime_error {
-protected:
-    inline const char *hip_error_map(hipError_t result) {
-        return hipGetErrorName(result);
-    }
-    int error_number; ///< error number
-public:
-    /** Constructor (C++ STL string, hipError_t).
-   *  @param msg The error message
-   *  @param err_num Error number
-   */
-    explicit hip_error(std::string message, hipError_t result)
-            : std::runtime_error((message + std::string(hip_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~hip_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-#define HIP_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                           \
-    if (err != HIP_SUCCESS) {                                          \
-        throw hip_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define ROCBLAS_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                               \
-    if (err != rocblas_status_success) {                                   \
-        throw rocblas_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define ROCBLAS_ERROR_FUNC_SYNC(name, err, handle, ...)                    \
-    err = name(handle, __VA_ARGS__);                                       \
-    if (err != rocblas_status_success) {                                   \
-        throw rocblas_error(std::string(#name) + std::string(" : "), err); \
-    }                                                                      \
-    hipStream_t currentStreamId;                                           \
-    ROCBLAS_ERROR_FUNC(rocblas_get_stream, err, handle, &currentStreamId); \
-    hipError_t hip_err;                                                    \
-    HIP_ERROR_FUNC(hipStreamSynchronize, hip_err, currentStreamId);
-
-inline rocblas_operation get_rocblas_operation(oneapi::mkl::transpose trn) {
-    switch (trn) {
-        case oneapi::mkl::transpose::nontrans: return rocblas_operation_none;
-        case oneapi::mkl::transpose::trans: return rocblas_operation_transpose;
-        case oneapi::mkl::transpose::conjtrans: return rocblas_operation_conjugate_transpose;
-        default: throw "Wrong transpose Operation.";
-    }
-}
-
-inline rocblas_fill get_rocblas_fill_mode(oneapi::mkl::uplo ul) {
-    switch (ul) {
-        case oneapi::mkl::uplo::upper: return rocblas_fill_upper;
-        case oneapi::mkl::uplo::lower: return rocblas_fill_lower;
-        default: throw "Wrong fill mode.";
-    }
-}
-
-inline rocblas_diagonal get_rocblas_diag_type(oneapi::mkl::diag un) {
-    switch (un) {
-        case oneapi::mkl::diag::unit: return rocblas_diagonal_unit;
-        case oneapi::mkl::diag::nonunit: return rocblas_diagonal_non_unit;
-        default: throw "Wrong diag type.";
-    }
-}
-
-inline rocblas_side get_rocblas_side_mode(oneapi::mkl::side lr) {
-    switch (lr) {
-        case oneapi::mkl::side::left: return rocblas_side_left;
-        case oneapi::mkl::side::right: return rocblas_side_right;
-        default: throw "Wrong side mode.";
-    }
-}
-
-template <typename T>
-inline rocblas_datatype get_rocblas_datatype() {
-    static_assert(false);
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<rocblas_half>() {
-    return rocblas_datatype_f16_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<float>() {
-    return rocblas_datatype_f32_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<double>() {
-    return rocblas_datatype_f64_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<rocblas_float_complex>() {
-    return rocblas_datatype_f32_c;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<rocblas_double_complex>() {
-    return rocblas_datatype_f64_c;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<std::int8_t>() {
-    return rocblas_datatype_i8_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<std::uint8_t>() {
-    return rocblas_datatype_u8_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<std::int32_t>() {
-    return rocblas_datatype_i32_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<std::uint32_t>() {
-    return rocblas_datatype_u32_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<rocblas_bfloat16>() {
-    return rocblas_datatype_bf16_r;
-}
-
-template <>
-inline rocblas_datatype get_rocblas_datatype<std::complex<rocblas_bfloat16>>() {
-    return rocblas_datatype_bf16_c;
-}
-
-/*converting std::complex<T> to roc_<T>_complex 
-             sycl::half      to rocblas_half*/
-template <typename T>
-struct RocEquivalentType {
-    using Type = T;
-};
-
-template <>
-struct RocEquivalentType<std::complex<float>> {
-    using Type = rocblas_float_complex;
-};
-template <>
-struct RocEquivalentType<std::complex<double>> {
-    using Type = rocblas_double_complex;
-};
-template <>
-struct RocEquivalentType<sycl::half> {
-    using Type = rocblas_half;
-};
-
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif // _ROCBLAS_HELPER_HPP_
diff --git a/src/blas/backends/rocblas/rocblas_level1.cpp b/src/blas/backends/rocblas/rocblas_level1.cpp
deleted file mode 100644
index 3a1eacb38..000000000
--- a/src/blas/backends/rocblas/rocblas_level1.cpp
+++ /dev/null
@@ -1,1782 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include "rocblas_helper.hpp"
-#include "rocblas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-namespace column_major {
-
-// Buffer APIs
-
-template <typename Func, typename T1, typename T2>
-inline void asum(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T1, 1> &x,
-                 const int64_t incx, sycl::buffer<T2, 1> &result) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = sc.get_mem<rocDataType1 *>(x_acc);
-            auto res_ = sc.get_mem<rocDataType2 *>(res_acc);
-            rocblas_status err;
-            // ASUM does not support negative index
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-}
-
-#define ASUM_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                        \
-    void asum(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        asum(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                   \
-    }
-
-ASUM_LAUNCHER(float, float, rocblas_sasum)
-ASUM_LAUNCHER(double, double, rocblas_dasum)
-ASUM_LAUNCHER(std::complex<float>, float, rocblas_scasum)
-ASUM_LAUNCHER(std::complex<double>, double, rocblas_dzasum)
-
-#undef ASUM_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void scal(Func func, sycl::queue &queue, int64_t n, T1 a, sycl::buffer<T2, 1> &x,
-                 int64_t incx) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto x_ = sc.get_mem<rocDataType2 *>(x_acc);
-            rocblas_status err;
-            // SCAL does not support negative incx
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType1 *)&a, x_, std::abs(incx));
-        });
-    });
-}
-
-#define SCAL_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                             \
-    void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer<TYPE2, 1> &x, int64_t incx) { \
-        scal(ROCBLAS_ROUTINE, queue, n, a, x, incx);                                             \
-    }
-
-SCAL_LAUNCHER(float, float, rocblas_sscal)
-SCAL_LAUNCHER(double, double, rocblas_dscal)
-SCAL_LAUNCHER(std::complex<float>, std::complex<float>, rocblas_cscal)
-SCAL_LAUNCHER(std::complex<double>, std::complex<double>, rocblas_zscal)
-SCAL_LAUNCHER(float, std::complex<float>, rocblas_csscal)
-SCAL_LAUNCHER(double, std::complex<double>, rocblas_zdscal)
-
-#undef SCAL_LAUNCHER
-
-template <typename Func, typename T>
-inline void axpy(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer<T, 1> &x,
-                 int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, y_,
-                                    incy);
-        });
-    });
-}
-
-#define AXPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                     \
-    void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy);                                \
-    }
-
-AXPY_LAUNCHER(float, rocblas_saxpy)
-AXPY_LAUNCHER(double, rocblas_daxpy)
-AXPY_LAUNCHER(std::complex<float>, rocblas_caxpy)
-AXPY_LAUNCHER(std::complex<double>, rocblas_zaxpy)
-
-#undef AXPY_LAUNCHER
-
-void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-           float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-           double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline void rotg(Func func, sycl::queue &queue, sycl::buffer<T1, 1> &a, sycl::buffer<T1, 1> &b,
-                 sycl::buffer<T2, 1> &c, sycl::buffer<T1, 1> &s) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        auto s_acc = s.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto a_ = sc.get_mem<rocDataType1 *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType1 *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType2 *>(c_acc);
-            auto s_ = sc.get_mem<rocDataType1 *>(s_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, a_, b_, c_, s_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-}
-
-#define ROTG_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                    \
-    void rotg(sycl::queue &queue, sycl::buffer<TYPE1, 1> &a, sycl::buffer<TYPE1, 1> &b, \
-              sycl::buffer<TYPE2, 1> &c, sycl::buffer<TYPE1, 1> &s) {                   \
-        rotg(ROCBLAS_ROUTINE, queue, a, b, c, s);                                       \
-    }
-
-ROTG_LAUNCHER(float, float, rocblas_srotg)
-ROTG_LAUNCHER(double, double, rocblas_drotg)
-ROTG_LAUNCHER(std::complex<float>, float, rocblas_crotg)
-ROTG_LAUNCHER(std::complex<double>, double, rocblas_zrotg)
-
-#undef ROTG_LAUNCHER
-
-template <typename Func, typename T>
-inline void rotm(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, int64_t incx,
-                 sycl::buffer<T, 1> &y, int64_t incy, sycl::buffer<T, 1> &param) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        auto param_acc = param.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            auto param_ = sc.get_mem<rocDataType *>(param_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, param_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-}
-
-#define ROTM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                          \
-    void rotm(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx,  \
-              sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &param) { \
-        rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param);                     \
-    }
-
-ROTM_LAUNCHER(float, rocblas_srotm)
-ROTM_LAUNCHER(double, rocblas_drotm)
-
-#undef ROTM_LAUNCHER
-
-template <typename Func, typename T>
-inline void copy(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, int64_t incx,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-}
-
-#define COPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void copy(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy);                           \
-    }
-
-COPY_LAUNCHER(float, rocblas_scopy)
-COPY_LAUNCHER(double, rocblas_dcopy)
-COPY_LAUNCHER(std::complex<float>, rocblas_ccopy)
-COPY_LAUNCHER(std::complex<double>, rocblas_zcopy)
-
-#undef COPY_LAUNCHER
-
-template <typename Func, typename T>
-inline void dot(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, const int64_t incx,
-                sycl::buffer<T, 1> &y, int64_t incy, sycl::buffer<T, 1> &result) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            auto res_ = sc.get_mem<rocDataType *>(res_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, res_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-}
-
-#define DOT_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE)                                                 \
-    void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx,   \
-                  sycl::buffer<TYPE, 1> &y, const int64_t incy, sycl::buffer<TYPE, 1> &result) { \
-        dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result);                                \
-    }
-
-DOT_LAUNCHER(, float, rocblas_sdot)
-DOT_LAUNCHER(, double, rocblas_ddot)
-DOT_LAUNCHER(u, std::complex<float>, rocblas_cdotu)
-DOT_LAUNCHER(c, std::complex<float>, rocblas_cdotc)
-DOT_LAUNCHER(u, std::complex<double>, rocblas_zdotu)
-DOT_LAUNCHER(c, std::complex<double>, rocblas_zdotc)
-
-#undef DOT_LAUNCHER
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    throw unimplemented("blas", "dot", "for column_major layout");
-}
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline void rot(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T1, 1> &x,
-                const int64_t incx, sycl::buffer<T1, 1> &y, int64_t incy, T2 c, T3 s) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    using rocDataType3 = typename RocEquivalentType<T3>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            // rocblas_set_pointer_mode(handle, rocblas_set_pointer_mode);
-            auto x_ = sc.get_mem<rocDataType1 *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType1 *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, (rocDataType2 *)&c,
-                                    (rocDataType3 *)&s);
-        });
-    });
-}
-
-#define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE)                                 \
-    void rot(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-             sycl::buffer<TYPE1, 1> &y, int64_t incy, TYPE2 c, TYPE3 s) {                  \
-        rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s);                            \
-    }
-
-ROT_LAUNCHER(float, float, float, rocblas_srot)
-ROT_LAUNCHER(double, double, double, rocblas_drot)
-ROT_LAUNCHER(std::complex<float>, float, float, rocblas_csrot)
-ROT_LAUNCHER(std::complex<double>, double, double, rocblas_zdrot)
-
-#undef ROT_LAUNCHER
-
-void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer<float, 1> &x, int64_t incx,
-            sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result) {
-    overflow_check(n, incx, incy);
-
-    // rocBLAS does not support sdot so we need to mimic sdot.
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = sc.get_mem<float *>(x_acc);
-            auto y_ = sc.get_mem<float *>(y_acc);
-            auto res_ = sc.get_mem<float *>(res_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(rocblas_sdot, err, handle, n, x_, incx, y_, incy, res_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-
-    // Since SB is a host pointer we need to bring the result back to the host and
-    // add sb to it.
-    result.get_access<sycl::access::mode::read_write>()[0] += sb;
-}
-
-template <typename Func, typename T>
-inline void rotmg(Func func, sycl::queue &queue, sycl::buffer<T, 1> &d1, sycl::buffer<T, 1> &d2,
-                  sycl::buffer<T, 1> &x1, T y1, sycl::buffer<T, 1> &param) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    sycl::buffer<T, 1> y1_buff(&y1, sycl::range<1>(1));
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto d1_acc = d1.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d2_acc = d2.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x1_acc = x1.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y1_acc = y1_buff.template get_access<sycl::access::mode::read>(cgh);
-        auto param_acc = param.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto d1_ = sc.get_mem<rocDataType *>(d1_acc);
-            auto d2_ = sc.get_mem<rocDataType *>(d2_acc);
-            auto x1_ = sc.get_mem<rocDataType *>(x1_acc);
-            auto y1_ = sc.get_mem<rocDataType *>(y1_acc);
-            auto param_ = sc.get_mem<rocDataType *>(param_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, d1_, d2_, x1_, y1_, param_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-}
-
-#define ROTMG_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                            \
-    void rotmg(sycl::queue &queue, sycl::buffer<TYPE, 1> &d1, sycl::buffer<TYPE, 1> &d2, \
-               sycl::buffer<TYPE, 1> &x1, TYPE y1, sycl::buffer<TYPE, 1> &param) {       \
-        rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param);                            \
-    }
-
-ROTMG_LAUNCHER(float, rocblas_srotmg)
-ROTMG_LAUNCHER(double, rocblas_drotmg)
-
-#undef ROTMG_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamax(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x,
-                  const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-
-    // rocBLAS does not support int64_t as return type for the data by default. So we need to
-    // mimic iamax. We are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size. Alternatively either we need to write a sycl kernel
-    // to elementwise copy the data between two buffer, or allow reinterpret cast
-    // to convert to different type with different typesize size.
-    sycl::buffer<int, 1> int_res_buff{ sycl::range<1>(1) };
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto int_res_ = sc.get_mem<int *>(int_res_acc);
-            rocblas_status err;
-            // For negative incx, iamax returns 0. This behaviour is similar to that of
-            // reference netlib BLAS.
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, int_res_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::read>(cgh);
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task(
-            [=]() { result_acc[0] = std::max((int64_t)int_res_acc[0] - 1, (int64_t)0); });
-    });
-}
-
-#define IAMAX_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void iamax(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                  \
-    }
-
-IAMAX_LAUNCHER(float, rocblas_isamax)
-IAMAX_LAUNCHER(double, rocblas_idamax)
-IAMAX_LAUNCHER(std::complex<float>, rocblas_icamax)
-IAMAX_LAUNCHER(std::complex<double>, rocblas_izamax)
-
-#undef IAMAX_LAUNCHER
-
-template <typename Func, typename T>
-inline void swap(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, int64_t incx,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-}
-
-#define SWAP_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void swap(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy);                           \
-    }
-
-SWAP_LAUNCHER(float, rocblas_sswap)
-SWAP_LAUNCHER(double, rocblas_dswap)
-SWAP_LAUNCHER(std::complex<float>, rocblas_cswap)
-SWAP_LAUNCHER(std::complex<double>, rocblas_zswap)
-
-#undef SWAP_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x,
-                  const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-
-    // rocBLAS does not support int64_t as return type for the data by default. So we need to
-    // mimic iamin we are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size. Alternatively, either we need to write a sycl kernel
-    // to elementwise copy the data between two buffer, or allow reinterpret cast
-    // to convert to different type with different typesize size.
-    sycl::buffer<int, 1> int_res_buff{ sycl::range<1>(1) };
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto int_res_ = sc.get_mem<int *>(int_res_acc);
-            rocblas_status err;
-            // For negative incx, iamin returns 0. This behaviour is similar to that of
-            // implemented as a reference IAMIN.
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, int_res_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto int_res_acc = int_res_buff.template get_access<sycl::access::mode::read>(cgh);
-        auto result_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        cgh.single_task(
-            [=]() { result_acc[0] = std::max((int64_t)int_res_acc[0] - 1, (int64_t)0); });
-    });
-}
-
-#define IAMIN_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void iamin(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                  \
-    }
-
-IAMIN_LAUNCHER(float, rocblas_isamin)
-IAMIN_LAUNCHER(double, rocblas_idamin)
-IAMIN_LAUNCHER(std::complex<float>, rocblas_icamin)
-IAMIN_LAUNCHER(std::complex<double>, rocblas_izamin)
-
-#undef IAMIN_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void nrm2(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T1, 1> &x,
-                 const int64_t incx, sycl::buffer<T2, 1> &result) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto res_acc = result.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            // By default the pointer mode is the rocblas_pointer_mode_host
-            // when the data is on buffer, it must be set to
-            // rocblas_set_pointer_mode mode otherwise it causes the segmentation
-            // fault. When it is set to device it is users responsibility to
-            // synchronise as the function is completely asynchronous.
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = sc.get_mem<rocDataType1 *>(x_acc);
-            auto res_ = sc.get_mem<rocDataType2 *>(res_acc);
-            rocblas_status err;
-            // NRM2 does not support negative index
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_);
-            // Higher level BLAS functions expect rocblas_pointer_mode_host
-            // to be set, therfore we need to reset this to the default value
-            // in order to avoid invalid memory accesses
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-}
-
-#define NRM2_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                        \
-    void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                   \
-    }
-
-NRM2_LAUNCHER(float, float, rocblas_snrm2)
-NRM2_LAUNCHER(double, double, rocblas_dnrm2)
-NRM2_LAUNCHER(std::complex<float>, float, rocblas_scnrm2)
-NRM2_LAUNCHER(std::complex<double>, double, rocblas_dznrm2)
-
-#undef NRM2_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event asum(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx,
-                        T2 *result, const std::vector<sycl::event> &dependencies) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-
-            auto x_ = reinterpret_cast<const rocDataType1 *>(x);
-            auto res_ = reinterpret_cast<rocDataType2 *>(result);
-            rocblas_status err;
-            // ASUM does not support negative index
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-
-    return done;
-}
-
-#define ASUM_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                \
-    sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {     \
-        return asum(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);          \
-    }
-
-ASUM_LAUNCHER_USM(float, float, rocblas_sasum)
-ASUM_LAUNCHER_USM(double, double, rocblas_dasum)
-ASUM_LAUNCHER_USM(std::complex<float>, float, rocblas_scasum)
-ASUM_LAUNCHER_USM(std::complex<double>, double, rocblas_dzasum)
-
-#undef ASUM_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event scal(Func func, sycl::queue &queue, int64_t n, T1 a, T2 *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<rocDataType2 *>(x);
-            rocblas_status err;
-            // SCAL does not support negative incx
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType1 *)&a, x_, std::abs(incx));
-        });
-    });
-
-    return done;
-}
-
-#define SCAL_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                             \
-    sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \
-                     const std::vector<sycl::event> &dependencies) {                 \
-        return scal(ROCBLAS_ROUTINE, queue, n, a, x, incx, dependencies);            \
-    }
-
-SCAL_LAUNCHER_USM(float, float, rocblas_sscal)
-SCAL_LAUNCHER_USM(double, double, rocblas_dscal)
-SCAL_LAUNCHER_USM(std::complex<float>, std::complex<float>, rocblas_cscal)
-SCAL_LAUNCHER_USM(std::complex<double>, std::complex<double>, rocblas_zscal)
-SCAL_LAUNCHER_USM(float, std::complex<float>, rocblas_csscal)
-SCAL_LAUNCHER_USM(double, std::complex<double>, rocblas_zdscal)
-
-#undef SCAL_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x, int64_t incx,
-                        T *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, y_,
-                                    incy);
-        });
-    });
-
-    return done;
-}
-
-#define AXPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \
-                     TYPE *y, int64_t incy, const std::vector<sycl::event> &dependencies) {  \
-        return axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, dependencies);       \
-    }
-
-AXPY_LAUNCHER_USM(float, rocblas_saxpy)
-AXPY_LAUNCHER_USM(double, rocblas_daxpy)
-AXPY_LAUNCHER_USM(std::complex<float>, rocblas_caxpy)
-AXPY_LAUNCHER_USM(std::complex<double>, rocblas_zaxpy)
-
-#undef AXPY_LAUNCHER_USM
-
-sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                  float beta, float *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                  double beta, double *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                  std::complex<float> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                  const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                  std::complex<double> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for column_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event rotg(Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c, T1 *s,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType1 *>(a);
-            auto b_ = reinterpret_cast<rocDataType1 *>(b);
-            auto c_ = reinterpret_cast<rocDataType2 *>(c);
-            auto s_ = reinterpret_cast<rocDataType1 *>(s);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, a_, b_, c_, s_);
-        });
-    });
-
-    return done;
-}
-
-#define ROTG_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                         \
-    sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \
-                     const std::vector<sycl::event> &dependencies) {             \
-        return rotg(ROCBLAS_ROUTINE, queue, a, b, c, s, dependencies);           \
-    }
-
-ROTG_LAUNCHER_USM(float, float, rocblas_srotg)
-ROTG_LAUNCHER_USM(double, double, rocblas_drotg)
-ROTG_LAUNCHER_USM(std::complex<float>, float, rocblas_crotg)
-ROTG_LAUNCHER_USM(std::complex<double>, double, rocblas_zrotg)
-
-#undef ROTG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event rotm(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y,
-                        int64_t incy, T *param, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            auto param_ = reinterpret_cast<rocDataType *>(param);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, param_);
-        });
-    });
-
-    return done;
-}
-
-#define ROTM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     TYPE *param, const std::vector<sycl::event> &dependencies) {                 \
-        return rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param, dependencies);            \
-    }
-
-ROTM_LAUNCHER_USM(float, rocblas_srotm)
-ROTM_LAUNCHER_USM(double, rocblas_drotm)
-
-#undef ROTM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event copy(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define COPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                          \
-    sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {        \
-        return copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);           \
-    }
-
-COPY_LAUNCHER_USM(float, rocblas_scopy)
-COPY_LAUNCHER_USM(double, rocblas_dcopy)
-COPY_LAUNCHER_USM(std::complex<float>, rocblas_ccopy)
-COPY_LAUNCHER_USM(std::complex<double>, rocblas_zcopy)
-
-#undef COPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dot(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx,
-                       const T *y, int64_t incy, T *result,
-                       const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<const rocDataType *>(y);
-            auto res_ = reinterpret_cast<rocDataType *>(result);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, res_);
-        });
-    });
-
-    return done;
-}
-
-#define DOT_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE)                                       \
-    sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                         const TYPE *y, const int64_t incy, TYPE *result,                  \
-                         const std::vector<sycl::event> &dependencies) {                   \
-        return dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result, dependencies);     \
-    }
-
-DOT_LAUNCHER_USM(, float, rocblas_sdot)
-DOT_LAUNCHER_USM(, double, rocblas_ddot)
-DOT_LAUNCHER_USM(u, std::complex<float>, rocblas_cdotu)
-DOT_LAUNCHER_USM(c, std::complex<float>, rocblas_cdotc)
-DOT_LAUNCHER_USM(u, std::complex<double>, rocblas_zdotu)
-DOT_LAUNCHER_USM(c, std::complex<double>, rocblas_zdotc)
-
-#undef DOT_LAUNCHER_USM
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dot", "for column_major layout");
-}
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline sycl::event rot(Func func, sycl::queue &queue, int64_t n, T1 *x, const int64_t incx, T1 *y,
-                       int64_t incy, T2 c, T3 s, const std::vector<sycl::event> &dependencies) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    using rocDataType3 = typename RocEquivalentType<T3>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<rocDataType1 *>(x);
-            auto y_ = reinterpret_cast<rocDataType1 *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, (rocDataType2 *)&c,
-                                    (rocDataType3 *)&s);
-        });
-    });
-
-    return done;
-}
-
-#define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE)                             \
-    sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \
-                    int64_t incy, TYPE2 c, TYPE3 s,                                        \
-                    const std::vector<sycl::event> &dependencies) {                        \
-        return rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, dependencies);       \
-    }
-
-ROT_LAUNCHER_USM(float, float, float, rocblas_srot)
-ROT_LAUNCHER_USM(double, double, double, rocblas_drot)
-ROT_LAUNCHER_USM(std::complex<float>, float, float, rocblas_csrot)
-ROT_LAUNCHER_USM(std::complex<double>, double, double, rocblas_zdrot)
-
-#undef ROT_LAUNCHER_USM
-
-sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
-                   const float *y, int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies) {
-    overflow_check(n, incx, incy);
-
-    // rocBLAS does not support sdot so we need to mimic sdot.
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<const float *>(x);
-            auto y_ = reinterpret_cast<const float *>(y);
-            auto res_ = reinterpret_cast<float *>(result);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(rocblas_sdot, err, handle, n, x_, incx, y_, incy, res_);
-        });
-    });
-
-    done.wait_and_throw();
-    result[0] = result[0] + sb;
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event rotmg(Func func, sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto d1_ = reinterpret_cast<rocDataType *>(d1);
-            auto d2_ = reinterpret_cast<rocDataType *>(d2);
-            auto x1_ = reinterpret_cast<rocDataType *>(x1);
-            auto y1_ = reinterpret_cast<const rocDataType *>(&y1);
-            auto param_ = reinterpret_cast<rocDataType *>(param);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, d1_, d2_, x1_, y1_, param_);
-        });
-    });
-
-    return done;
-}
-
-#define ROTMG_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param, \
-                      const std::vector<sycl::event> &dependencies) {                         \
-        return rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies);            \
-    }
-
-ROTMG_LAUNCHER_USM(float, rocblas_srotmg)
-ROTMG_LAUNCHER_USM(double, rocblas_drotmg)
-
-#undef ROTMG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamax(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx,
-                         int64_t *result, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    // rocBLAS does not support int64_t as return type for the data by default. So we need to
-    // mimic iamax. We are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size.
-    auto int_res_p = (int *)sycl::aligned_alloc_shared(64, sizeof(rocblas_int), queue.get_device(),
-                                                       queue.get_context());
-    *int_res_p = 0;
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto int_res_p_ = reinterpret_cast<int *>(int_res_p);
-            rocblas_status err;
-            // For negative incx, iamax returns 0. This behaviour is similar to that of
-            // reference iamax.
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, int_res_p_);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-
-    done.wait_and_throw();
-    result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
-    return done;
-}
-
-#define IAMAX_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                       \
-    sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {  \
-        return iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);         \
-    }
-
-IAMAX_LAUNCHER_USM(float, rocblas_isamax)
-IAMAX_LAUNCHER_USM(double, rocblas_idamax)
-IAMAX_LAUNCHER_USM(std::complex<float>, rocblas_icamax)
-IAMAX_LAUNCHER_USM(std::complex<double>, rocblas_izamax)
-
-#undef IAMAX_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event swap(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define SWAP_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);                   \
-    }
-
-SWAP_LAUNCHER_USM(float, rocblas_sswap)
-SWAP_LAUNCHER_USM(double, rocblas_dswap)
-SWAP_LAUNCHER_USM(std::complex<float>, rocblas_cswap)
-SWAP_LAUNCHER_USM(std::complex<double>, rocblas_zswap)
-
-#undef SWAP_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamin(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx,
-                         int64_t *result, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    // rocBLAS does not support int64_t as return type for the data by default. So we need to
-    // mimic iamin. We are converting the result to be the int and then we convert
-    // it back to the actual data on the host.
-    // This change may cause failure as the result of integer overflow
-    // based on the size.
-    auto int_res_p = (int *)sycl::aligned_alloc_shared(64, sizeof(rocblas_int), queue.get_device(),
-                                                       queue.get_context());
-    *int_res_p = 0;
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto int_res_p_ = reinterpret_cast<int *>(int_res_p);
-            rocblas_status err;
-            // For negative incx, iamin returns 0. This behaviour is similar to that of
-            // implemented iamin.
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, int_res_p_);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-
-    done.wait_and_throw();
-    result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
-    return done;
-}
-
-#define IAMIN_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                       \
-    sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {  \
-        return iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);         \
-    }
-
-IAMIN_LAUNCHER_USM(float, rocblas_isamin)
-IAMIN_LAUNCHER_USM(double, rocblas_idamin)
-IAMIN_LAUNCHER_USM(std::complex<float>, rocblas_icamin)
-IAMIN_LAUNCHER_USM(std::complex<double>, rocblas_izamin)
-
-#undef IAMIN_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event nrm2(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx,
-                        T2 *result, const std::vector<sycl::event> &dependencies) {
-    using rocDataType1 = typename RocEquivalentType<T1>::Type;
-    using rocDataType2 = typename RocEquivalentType<T2>::Type;
-    overflow_check(n, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device);
-
-            auto x_ = reinterpret_cast<const rocDataType1 *>(x);
-            auto res_ = reinterpret_cast<rocDataType2 *>(result);
-            rocblas_status err;
-            // NRM2 does not support negative index
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_);
-            rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
-        });
-    });
-
-    return done;
-}
-
-#define NRM2_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                \
-    sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {     \
-        return nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);          \
-    }
-
-NRM2_LAUNCHER_USM(float, float, rocblas_snrm2)
-NRM2_LAUNCHER_USM(double, double, rocblas_dnrm2)
-NRM2_LAUNCHER_USM(std::complex<float>, float, rocblas_scnrm2)
-NRM2_LAUNCHER_USM(std::complex<double>, double, rocblas_dznrm2)
-
-#undef NRM2_LAUNCHER_USM
-
-} // namespace column_major
-namespace row_major {
-
-// Buffer APIs
-
-template <typename Func, typename T1, typename T2>
-inline void asum(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T1, 1> &x,
-                 const int64_t incx, sycl::buffer<T2, 1> &result) {
-    column_major::asum(func, queue, n, x, incx, result);
-}
-
-#define ASUM_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                        \
-    void asum(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        asum(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                   \
-    }
-
-ASUM_LAUNCHER(float, float, rocblas_sasum)
-ASUM_LAUNCHER(double, double, rocblas_dasum)
-ASUM_LAUNCHER(std::complex<float>, float, rocblas_scasum)
-ASUM_LAUNCHER(std::complex<double>, double, rocblas_dzasum)
-
-#undef ASUM_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void scal(Func func, sycl::queue &queue, int64_t n, T1 a, sycl::buffer<T2, 1> &x,
-                 int64_t incx) {
-    column_major::scal(func, queue, n, a, x, incx);
-}
-
-#define SCAL_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                             \
-    void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer<TYPE2, 1> &x, int64_t incx) { \
-        scal(ROCBLAS_ROUTINE, queue, n, a, x, incx);                                             \
-    }
-
-SCAL_LAUNCHER(float, float, rocblas_sscal)
-SCAL_LAUNCHER(double, double, rocblas_dscal)
-SCAL_LAUNCHER(std::complex<float>, std::complex<float>, rocblas_cscal)
-SCAL_LAUNCHER(std::complex<double>, std::complex<double>, rocblas_zscal)
-SCAL_LAUNCHER(float, std::complex<float>, rocblas_csscal)
-SCAL_LAUNCHER(double, std::complex<double>, rocblas_zdscal)
-
-#undef SCAL_LAUNCHER
-
-template <typename Func, typename T>
-inline void axpy(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer<T, 1> &x,
-                 int64_t incx, sycl::buffer<T, 1> &y, int64_t incy) {
-    column_major::axpy(func, queue, n, alpha, x, incx, y, incy);
-}
-
-#define AXPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                     \
-    void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                          \
-        axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy);                                \
-    }
-
-AXPY_LAUNCHER(float, rocblas_saxpy)
-AXPY_LAUNCHER(double, rocblas_daxpy)
-AXPY_LAUNCHER(std::complex<float>, rocblas_caxpy)
-AXPY_LAUNCHER(std::complex<double>, rocblas_zaxpy)
-
-#undef AXPY_LAUNCHER
-
-void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer<float, 1> &x, int64_t incx,
-           float beta, sycl::buffer<float, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer<double, 1> &x, int64_t incx,
-           double beta, sycl::buffer<double, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &x, int64_t incx, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-void axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &x, int64_t incx, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &y, int64_t incy) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline void rotg(Func func, sycl::queue &queue, sycl::buffer<T1, 1> &a, sycl::buffer<T1, 1> &b,
-                 sycl::buffer<T2, 1> &c, sycl::buffer<T1, 1> &s) {
-    column_major::rotg(func, queue, a, b, c, s);
-}
-
-#define ROTG_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                    \
-    void rotg(sycl::queue &queue, sycl::buffer<TYPE1, 1> &a, sycl::buffer<TYPE1, 1> &b, \
-              sycl::buffer<TYPE2, 1> &c, sycl::buffer<TYPE1, 1> &s) {                   \
-        rotg(ROCBLAS_ROUTINE, queue, a, b, c, s);                                       \
-    }
-
-ROTG_LAUNCHER(float, float, rocblas_srotg)
-ROTG_LAUNCHER(double, double, rocblas_drotg)
-ROTG_LAUNCHER(std::complex<float>, float, rocblas_crotg)
-ROTG_LAUNCHER(std::complex<double>, double, rocblas_zrotg)
-
-#undef ROTG_LAUNCHER
-
-template <typename Func, typename T>
-inline void rotm(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, int64_t incx,
-                 sycl::buffer<T, 1> &y, int64_t incy, sycl::buffer<T, 1> &param) {
-    column_major::rotm(func, queue, n, x, incx, y, incy, param);
-}
-
-#define ROTM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                          \
-    void rotm(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx,  \
-              sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &param) { \
-        rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param);                     \
-    }
-
-ROTM_LAUNCHER(float, rocblas_srotm)
-ROTM_LAUNCHER(double, rocblas_drotm)
-
-#undef ROTM_LAUNCHER
-
-template <typename Func, typename T>
-inline void copy(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, int64_t incx,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    column_major::copy(func, queue, n, x, incx, y, incy);
-}
-
-#define COPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void copy(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy);                           \
-    }
-
-COPY_LAUNCHER(float, rocblas_scopy)
-COPY_LAUNCHER(double, rocblas_dcopy)
-COPY_LAUNCHER(std::complex<float>, rocblas_ccopy)
-COPY_LAUNCHER(std::complex<double>, rocblas_zcopy)
-
-#undef COPY_LAUNCHER
-
-template <typename Func, typename T>
-inline void dot(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, const int64_t incx,
-                sycl::buffer<T, 1> &y, int64_t incy, sycl::buffer<T, 1> &result) {
-    column_major::dot(func, queue, n, x, incx, y, incy, result);
-}
-
-#define DOT_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE)                                                 \
-    void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx,   \
-                  sycl::buffer<TYPE, 1> &y, const int64_t incy, sycl::buffer<TYPE, 1> &result) { \
-        dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result);                                \
-    }
-
-DOT_LAUNCHER(, float, rocblas_sdot)
-DOT_LAUNCHER(, double, rocblas_ddot)
-DOT_LAUNCHER(u, std::complex<float>, rocblas_cdotu)
-DOT_LAUNCHER(c, std::complex<float>, rocblas_cdotc)
-DOT_LAUNCHER(u, std::complex<double>, rocblas_zdotu)
-DOT_LAUNCHER(c, std::complex<double>, rocblas_zdotc)
-
-#undef DOT_LAUNCHER
-
-void dot(sycl::queue &queue, int64_t n, sycl::buffer<float, 1> &x, int64_t incx,
-         sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<double, 1> &result) {
-    throw unimplemented("blas", "dot", "for row_major layout");
-}
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline void rot(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T1, 1> &x,
-                const int64_t incx, sycl::buffer<T1, 1> &y, int64_t incy, T2 c, T3 s) {
-    column_major::rot(func, queue, n, x, incx, y, incy, c, s);
-}
-
-#define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE)                                 \
-    void rot(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-             sycl::buffer<TYPE1, 1> &y, int64_t incy, TYPE2 c, TYPE3 s) {                  \
-        rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s);                            \
-    }
-
-ROT_LAUNCHER(float, float, float, rocblas_srot)
-ROT_LAUNCHER(double, double, double, rocblas_drot)
-ROT_LAUNCHER(std::complex<float>, float, float, rocblas_csrot)
-ROT_LAUNCHER(std::complex<double>, double, double, rocblas_zdrot)
-
-#undef ROT_LAUNCHER
-
-void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer<float, 1> &x, int64_t incx,
-            sycl::buffer<float, 1> &y, int64_t incy, sycl::buffer<float, 1> &result) {
-    column_major::sdsdot(queue, n, sb, x, incx, y, incy, result);
-}
-
-template <typename Func, typename T>
-inline void rotmg(Func func, sycl::queue &queue, sycl::buffer<T, 1> &d1, sycl::buffer<T, 1> &d2,
-                  sycl::buffer<T, 1> &x1, T y1, sycl::buffer<T, 1> &param) {
-    column_major::rotmg(func, queue, d1, d2, x1, y1, param);
-}
-
-#define ROTMG_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                            \
-    void rotmg(sycl::queue &queue, sycl::buffer<TYPE, 1> &d1, sycl::buffer<TYPE, 1> &d2, \
-               sycl::buffer<TYPE, 1> &x1, TYPE y1, sycl::buffer<TYPE, 1> &param) {       \
-        rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param);                            \
-    }
-
-ROTMG_LAUNCHER(float, rocblas_srotmg)
-ROTMG_LAUNCHER(double, rocblas_drotmg)
-
-#undef ROTMG_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamax(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x,
-                  const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    column_major::iamax(func, queue, n, x, incx, result);
-}
-
-#define IAMAX_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void iamax(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                  \
-    }
-
-IAMAX_LAUNCHER(float, rocblas_isamax)
-IAMAX_LAUNCHER(double, rocblas_idamax)
-IAMAX_LAUNCHER(std::complex<float>, rocblas_icamax)
-IAMAX_LAUNCHER(std::complex<double>, rocblas_izamax)
-
-#undef IAMAX_LAUNCHER
-
-template <typename Func, typename T>
-inline void swap(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x, int64_t incx,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    column_major::swap(func, queue, n, x, incx, y, incy);
-}
-
-#define SWAP_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                         \
-    void swap(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy);                           \
-    }
-
-SWAP_LAUNCHER(float, rocblas_sswap)
-SWAP_LAUNCHER(double, rocblas_dswap)
-SWAP_LAUNCHER(std::complex<float>, rocblas_cswap)
-SWAP_LAUNCHER(std::complex<double>, rocblas_zswap)
-
-#undef SWAP_LAUNCHER
-
-template <typename Func, typename T>
-inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T, 1> &x,
-                  const int64_t incx, sycl::buffer<int64_t, 1> &result) {
-    column_major::iamin(func, queue, n, x, incx, result);
-}
-
-#define IAMIN_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void iamin(sycl::queue &queue, int64_t n, sycl::buffer<TYPE, 1> &x, const int64_t incx, \
-               sycl::buffer<int64_t, 1> &result) {                                          \
-        iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                  \
-    }
-
-IAMIN_LAUNCHER(float, rocblas_isamin)
-IAMIN_LAUNCHER(double, rocblas_idamin)
-IAMIN_LAUNCHER(std::complex<float>, rocblas_icamin)
-IAMIN_LAUNCHER(std::complex<double>, rocblas_izamin)
-
-#undef IAMIN_LAUNCHER
-
-template <typename Func, typename T1, typename T2>
-inline void nrm2(Func func, sycl::queue &queue, int64_t n, sycl::buffer<T1, 1> &x,
-                 const int64_t incx, sycl::buffer<T2, 1> &result) {
-    column_major::nrm2(func, queue, n, x, incx, result);
-}
-
-#define NRM2_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                        \
-    void nrm2(sycl::queue &queue, int64_t n, sycl::buffer<TYPE1, 1> &x, const int64_t incx, \
-              sycl::buffer<TYPE2, 1> &result) {                                             \
-        nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result);                                   \
-    }
-
-NRM2_LAUNCHER(float, float, rocblas_snrm2)
-NRM2_LAUNCHER(double, double, rocblas_dnrm2)
-NRM2_LAUNCHER(std::complex<float>, float, rocblas_scnrm2)
-NRM2_LAUNCHER(std::complex<double>, double, rocblas_dznrm2)
-
-#undef NRM2_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event asum(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx,
-                        T2 *result, const std::vector<sycl::event> &dependencies) {
-    return column_major::asum(func, queue, n, x, incx, result, dependencies);
-}
-
-#define ASUM_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                \
-    sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {     \
-        return asum(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);          \
-    }
-
-ASUM_LAUNCHER_USM(float, float, rocblas_sasum)
-ASUM_LAUNCHER_USM(double, double, rocblas_dasum)
-ASUM_LAUNCHER_USM(std::complex<float>, float, rocblas_scasum)
-ASUM_LAUNCHER_USM(std::complex<double>, double, rocblas_dzasum)
-
-#undef ASUM_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event scal(Func func, sycl::queue &queue, int64_t n, T1 a, T2 *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    return column_major::scal(func, queue, n, a, x, incx, dependencies);
-}
-
-#define SCAL_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                             \
-    sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \
-                     const std::vector<sycl::event> &dependencies) {                 \
-        return scal(ROCBLAS_ROUTINE, queue, n, a, x, incx, dependencies);            \
-    }
-
-SCAL_LAUNCHER_USM(float, float, rocblas_sscal)
-SCAL_LAUNCHER_USM(double, double, rocblas_dscal)
-SCAL_LAUNCHER_USM(std::complex<float>, std::complex<float>, rocblas_cscal)
-SCAL_LAUNCHER_USM(std::complex<double>, std::complex<double>, rocblas_zscal)
-SCAL_LAUNCHER_USM(float, std::complex<float>, rocblas_csscal)
-SCAL_LAUNCHER_USM(double, std::complex<double>, rocblas_zdscal)
-
-#undef SCAL_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event axpy(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x, int64_t incx,
-                        T *y, int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return column_major::axpy(func, queue, n, alpha, x, incx, y, incy, dependencies);
-}
-
-#define AXPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \
-                     TYPE *y, int64_t incy, const std::vector<sycl::event> &dependencies) {  \
-        return axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, dependencies);       \
-    }
-
-AXPY_LAUNCHER_USM(float, rocblas_saxpy)
-AXPY_LAUNCHER_USM(double, rocblas_daxpy)
-AXPY_LAUNCHER_USM(std::complex<float>, rocblas_caxpy)
-AXPY_LAUNCHER_USM(std::complex<double>, rocblas_zaxpy)
-
-#undef AXPY_LAUNCHER_USM
-
-sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx,
-                  float beta, float *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx,
-                  double beta, double *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<float> alpha,
-                  const std::complex<float> *x, int64_t incx, std::complex<float> beta,
-                  std::complex<float> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-sycl::event axpby(sycl::queue &queue, int64_t n, std::complex<double> alpha,
-                  const std::complex<double> *x, int64_t incx, std::complex<double> beta,
-                  std::complex<double> *y, int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "axpby", "for row_major layout");
-}
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event rotg(Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c, T1 *s,
-                        const std::vector<sycl::event> &dependencies) {
-    return column_major::rotg(func, queue, a, b, c, s, dependencies);
-}
-
-#define ROTG_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                         \
-    sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \
-                     const std::vector<sycl::event> &dependencies) {             \
-        return rotg(ROCBLAS_ROUTINE, queue, a, b, c, s, dependencies);           \
-    }
-
-ROTG_LAUNCHER_USM(float, float, rocblas_srotg)
-ROTG_LAUNCHER_USM(double, double, rocblas_drotg)
-ROTG_LAUNCHER_USM(std::complex<float>, float, rocblas_crotg)
-ROTG_LAUNCHER_USM(std::complex<double>, double, rocblas_zrotg)
-
-#undef ROTG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event rotm(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y,
-                        int64_t incy, T *param, const std::vector<sycl::event> &dependencies) {
-    return column_major::rotm(func, queue, n, x, incx, y, incy, param, dependencies);
-}
-
-#define ROTM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     TYPE *param, const std::vector<sycl::event> &dependencies) {                 \
-        return rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param, dependencies);            \
-    }
-
-ROTM_LAUNCHER_USM(float, rocblas_srotm)
-ROTM_LAUNCHER_USM(double, rocblas_drotm)
-
-#undef ROTM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event copy(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return column_major::copy(func, queue, n, x, incx, y, incy, dependencies);
-}
-
-#define COPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                          \
-    sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {        \
-        return copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);           \
-    }
-
-COPY_LAUNCHER_USM(float, rocblas_scopy)
-COPY_LAUNCHER_USM(double, rocblas_dcopy)
-COPY_LAUNCHER_USM(std::complex<float>, rocblas_ccopy)
-COPY_LAUNCHER_USM(std::complex<double>, rocblas_zcopy)
-
-#undef COPY_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event dot(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx,
-                       const T *y, int64_t incy, T *result,
-                       const std::vector<sycl::event> &dependencies) {
-    return column_major::dot(func, queue, n, x, incx, y, incy, result, dependencies);
-}
-
-#define DOT_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE)                                       \
-    sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                         const TYPE *y, const int64_t incy, TYPE *result,                  \
-                         const std::vector<sycl::event> &dependencies) {                   \
-        return dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result, dependencies);     \
-    }
-
-DOT_LAUNCHER_USM(, float, rocblas_sdot)
-DOT_LAUNCHER_USM(, double, rocblas_ddot)
-DOT_LAUNCHER_USM(u, std::complex<float>, rocblas_cdotu)
-DOT_LAUNCHER_USM(c, std::complex<float>, rocblas_cdotc)
-DOT_LAUNCHER_USM(u, std::complex<double>, rocblas_zdotu)
-DOT_LAUNCHER_USM(c, std::complex<double>, rocblas_zdotc)
-
-#undef DOT_LAUNCHER_USM
-
-sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
-                int64_t incy, double *result, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("blas", "dot", "for row_major layout");
-}
-
-template <typename Func, typename T1, typename T2, typename T3>
-inline sycl::event rot(Func func, sycl::queue &queue, int64_t n, T1 *x, const int64_t incx, T1 *y,
-                       int64_t incy, T2 c, T3 s, const std::vector<sycl::event> &dependencies) {
-    return column_major::rot(func, queue, n, x, incx, y, incy, c, s, dependencies);
-}
-
-#define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE)                             \
-    sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \
-                    int64_t incy, TYPE2 c, TYPE3 s,                                        \
-                    const std::vector<sycl::event> &dependencies) {                        \
-        return rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, dependencies);       \
-    }
-
-ROT_LAUNCHER_USM(float, float, float, rocblas_srot)
-ROT_LAUNCHER_USM(double, double, double, rocblas_drot)
-ROT_LAUNCHER_USM(std::complex<float>, float, float, rocblas_csrot)
-ROT_LAUNCHER_USM(std::complex<double>, double, double, rocblas_zdrot)
-
-#undef ROT_LAUNCHER_USM
-
-sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx,
-                   const float *y, int64_t incy, float *result,
-                   const std::vector<sycl::event> &dependencies) {
-    return column_major::sdsdot(queue, n, sb, x, incx, y, incy, result);
-}
-
-template <typename Func, typename T>
-inline sycl::event rotmg(Func func, sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param,
-                         const std::vector<sycl::event> &dependencies) {
-    return column_major::rotmg(func, queue, d1, d2, x1, y1, param, dependencies);
-}
-
-#define ROTMG_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param, \
-                      const std::vector<sycl::event> &dependencies) {                         \
-        return rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies);            \
-    }
-
-ROTMG_LAUNCHER_USM(float, rocblas_srotmg)
-ROTMG_LAUNCHER_USM(double, rocblas_drotmg)
-
-#undef ROTMG_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamax(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx,
-                         int64_t *result, const std::vector<sycl::event> &dependencies) {
-    return column_major::iamax(func, queue, n, x, incx, result, dependencies);
-}
-
-#define IAMAX_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                       \
-    sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {  \
-        return iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);         \
-    }
-
-IAMAX_LAUNCHER_USM(float, rocblas_isamax)
-IAMAX_LAUNCHER_USM(double, rocblas_idamax)
-IAMAX_LAUNCHER_USM(std::complex<float>, rocblas_icamax)
-IAMAX_LAUNCHER_USM(std::complex<double>, rocblas_izamax)
-
-#undef IAMAX_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event swap(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return column_major::swap(func, queue, n, x, incx, y, incy, dependencies);
-}
-
-#define SWAP_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies);                   \
-    }
-
-SWAP_LAUNCHER_USM(float, rocblas_sswap)
-SWAP_LAUNCHER_USM(double, rocblas_dswap)
-SWAP_LAUNCHER_USM(std::complex<float>, rocblas_cswap)
-SWAP_LAUNCHER_USM(std::complex<double>, rocblas_zswap)
-
-#undef SWAP_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event iamin(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx,
-                         int64_t *result, const std::vector<sycl::event> &dependencies) {
-    return column_major::iamin(func, queue, n, x, incx, result, dependencies);
-}
-
-#define IAMIN_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                       \
-    sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \
-                      int64_t *result, const std::vector<sycl::event> &dependencies) {  \
-        return iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);         \
-    }
-
-IAMIN_LAUNCHER_USM(float, rocblas_isamin)
-IAMIN_LAUNCHER_USM(double, rocblas_idamin)
-IAMIN_LAUNCHER_USM(std::complex<float>, rocblas_icamin)
-IAMIN_LAUNCHER_USM(std::complex<double>, rocblas_izamin)
-
-#undef IAMIN_LAUNCHER_USM
-
-template <typename Func, typename T1, typename T2>
-inline sycl::event nrm2(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx,
-                        T2 *result, const std::vector<sycl::event> &dependencies) {
-    return column_major::nrm2(func, queue, n, x, incx, result, dependencies);
-}
-
-#define NRM2_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE)                                \
-    sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \
-                     TYPE2 *result, const std::vector<sycl::event> &dependencies) {     \
-        return nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies);          \
-    }
-
-NRM2_LAUNCHER_USM(float, float, rocblas_snrm2)
-NRM2_LAUNCHER_USM(double, double, rocblas_dnrm2)
-NRM2_LAUNCHER_USM(std::complex<float>, float, rocblas_scnrm2)
-NRM2_LAUNCHER_USM(std::complex<double>, double, rocblas_dznrm2)
-
-#undef NRM2_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/rocblas/rocblas_level2.cpp b/src/blas/backends/rocblas/rocblas_level2.cpp
deleted file mode 100644
index 882f7ff1c..000000000
--- a/src/blas/backends/rocblas/rocblas_level2.cpp
+++ /dev/null
@@ -1,3575 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include "rocblas_helper.hpp"
-#include "rocblas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp"
-
-// Helper Functions
-
-template <typename T>
-static inline void conj_vector(sycl::handler &cgh, sycl::buffer<T> &buf, const int64_t len,
-                               const int64_t inc) {
-    const auto abs_inc = std::abs(inc);
-    auto acc = buf.template get_access<sycl::access::mode::read_write>(cgh);
-    cgh.parallel_for(sycl::range{ (std::size_t)len }, [=](sycl::id<1> id) {
-        const auto index = id * abs_inc;
-        acc[index] = std::conj(acc[index]);
-    });
-}
-template <typename T>
-static inline void conj_vector(sycl::handler &cgh, T *ptr, const int64_t len, const int64_t inc) {
-    const auto abs_inc = std::abs(inc);
-    cgh.parallel_for(sycl::range{ (std::size_t)len }, [=](sycl::id<1> id) {
-        const auto index = id * abs_inc;
-        ptr[index] = std::conj(ptr[index]);
-    });
-}
-
-template <typename T>
-static inline void conj_vector(sycl::handler &cgh, sycl::buffer<T> &buf_a, sycl::buffer<T> &buf_b,
-                               const int64_t len, const int64_t inc_a, const int64_t inc_b) {
-    const auto abs_inc_a = std::abs(inc_a);
-    const auto abs_inc_b = std::abs(inc_b);
-    auto acc_a = buf_a.template get_access<sycl::access::mode::read_write>(cgh);
-    auto acc_b = buf_b.template get_access<sycl::access::mode::read_write>(cgh);
-    cgh.parallel_for(sycl::range{ (std::size_t)len }, [=](sycl::id<1> id) {
-        const auto index_a = id * abs_inc_a;
-        const auto index_b = id * abs_inc_b;
-        acc_a[index_a] = std::conj(acc_a[index_a]);
-        acc_b[index_b] = std::conj(acc_b[index_b]);
-    });
-}
-template <typename T>
-static inline void conj_vector(sycl::handler &cgh, T *ptr_a, T *ptr_b, const int64_t len,
-                               const int64_t inc_a, const int64_t inc_b) {
-    const auto abs_inc_a = std::abs(inc_a);
-    const auto abs_inc_b = std::abs(inc_b);
-    cgh.parallel_for(sycl::range{ (std::size_t)len }, [=](sycl::id<1> id) {
-        const auto index_a = id * abs_inc_a;
-        const auto index_b = id * abs_inc_b;
-        ptr_a[index_a] = std::conj(ptr_a[index_a]);
-        ptr_b[index_b] = std::conj(ptr_b[index_b]);
-    });
-}
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-namespace column_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-}
-
-#define GEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,         \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                           \
-        gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);    \
-    }
-
-GEMV_LAUNCHER(float, rocblas_sgemv)
-GEMV_LAUNCHER(double, rocblas_dgemv)
-GEMV_LAUNCHER(std::complex<float>, rocblas_cgemv)
-GEMV_LAUNCHER(std::complex<double>, rocblas_zgemv)
-
-#undef GEMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,
-                 int64_t ku, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, m, lda, kl, ku, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, kl, ku,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-}
-
-#define GBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                      \
-    void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,  \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,        \
-              int64_t incx, TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                  \
-        gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-GBMV_LAUNCHER(float, rocblas_sgbmv)
-GBMV_LAUNCHER(double, rocblas_dgbmv)
-GBMV_LAUNCHER(std::complex<float>, rocblas_cgbmv)
-GBMV_LAUNCHER(std::complex<double>, rocblas_zgbmv)
-
-#undef GBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &x,
-                int64_t incx, sycl::buffer<T, 1> &y, int64_t incy, sycl::buffer<T, 1> &a,
-                int64_t lda) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, m, n, (rocDataType *)&alpha, x_, incx, y_,
-                                    incy, a_, lda);
-        });
-    });
-}
-
-#define GER_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE)                                                  \
-    void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, \
-                  int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &a, \
-                  int64_t lda) {                                                                  \
-        ger(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda);                       \
-    }
-
-GER_LAUNCHER(, float, rocblas_sger)
-GER_LAUNCHER(, double, rocblas_dger)
-GER_LAUNCHER(u, std::complex<float>, rocblas_cgeru)
-GER_LAUNCHER(u, std::complex<double>, rocblas_zgeru)
-GER_LAUNCHER(c, std::complex<float>, rocblas_cgerc)
-GER_LAUNCHER(c, std::complex<double>, rocblas_zgerc)
-
-#undef GER_LAUNCHER
-
-template <typename Func, typename T>
-inline void hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-}
-
-#define HBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-HBMV_LAUNCHER(std::complex<float>, rocblas_chbmv)
-HBMV_LAUNCHER(std::complex<double>, rocblas_zhbmv)
-
-#undef HBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-}
-
-#define HEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                           \
-        hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-HEMV_LAUNCHER(std::complex<float>, rocblas_chemv)
-HEMV_LAUNCHER(std::complex<double>, rocblas_zhemv)
-
-#undef HEMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha,
-                sycl::buffer<DataType, 1> &x, int64_t incx, sycl::buffer<DataType, 1> &a,
-                int64_t lda) {
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    overflow_check(n, lda, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocScalarType *)&alpha, x_, incx, a_, lda);
-        });
-    });
-}
-
-#define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                            \
-    void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,         \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a, \
-             int64_t lda) {                                                              \
-        her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda);             \
-    }
-
-HER_LAUNCHER(float, std::complex<float>, rocblas_cher)
-HER_LAUNCHER(double, std::complex<double>, rocblas_zher)
-
-#undef HER_LAUNCHER
-
-template <typename Func, typename T>
-inline void her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda);
-        });
-    });
-}
-
-#define HER2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                        \
-        her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);        \
-    }
-
-HER2_LAUNCHER(std::complex<float>, rocblas_cher2)
-HER2_LAUNCHER(std::complex<double>, rocblas_zher2)
-
-#undef HER2_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_,
-                                    incy);
-        });
-    });
-}
-
-#define HPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                 \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                    \
-        hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);    \
-    }
-
-HPMV_LAUNCHER(std::complex<float>, rocblas_chpmv)
-HPMV_LAUNCHER(std::complex<double>, rocblas_zhpmv)
-
-#undef HPMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha,
-                sycl::buffer<DataType, 1> &x, int64_t incx, sycl::buffer<DataType, 1> &a) {
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocScalarType *)&alpha, x_, incx, a_);
-        });
-    });
-}
-
-#define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                              \
-    void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,           \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a) { \
-        hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a);                    \
-    }
-
-HPR_LAUNCHER(float, std::complex<float>, rocblas_chpr)
-HPR_LAUNCHER(double, std::complex<double>, rocblas_zhpr)
-
-#undef HPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_);
-        });
-    });
-}
-
-#define HPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a) {                                                     \
-        hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a);             \
-    }
-
-HPR2_LAUNCHER(std::complex<float>, rocblas_chpr2)
-HPR2_LAUNCHER(std::complex<double>, rocblas_zhpr2)
-
-#undef HPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-}
-
-#define SBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-SBMV_LAUNCHER(float, rocblas_ssbmv)
-SBMV_LAUNCHER(double, rocblas_dsbmv)
-
-#undef SBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-}
-
-#define SYMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                           \
-        symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-SYMV_LAUNCHER(float, rocblas_ssymv)
-SYMV_LAUNCHER(double, rocblas_dsymv)
-
-#undef SYMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a, int64_t lda) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, a_, lda);
-        });
-    });
-}
-
-#define SYR_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                   \
-    void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                     \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a, int64_t lda) { \
-        syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda);                  \
-    }
-
-SYR_LAUNCHER(float, rocblas_ssyr)
-SYR_LAUNCHER(double, rocblas_dsyr)
-// Intel does not support the following two
-SYR_LAUNCHER(std::complex<float>, rocblas_csyr)
-SYR_LAUNCHER(std::complex<double>, rocblas_zsyr)
-
-#undef SYR_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda);
-        });
-    });
-}
-
-#define SYR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                        \
-        syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);        \
-    }
-
-SYR2_LAUNCHER(float, rocblas_ssyr2)
-SYR2_LAUNCHER(double, rocblas_dsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER(std::complex<float>, rocblas_csyr2)
-SYR2_LAUNCHER(std::complex<double>, rocblas_zsyr2)
-
-#undef SYR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_,
-                                    incy);
-        });
-    });
-}
-
-#define SPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                 \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                    \
-        spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);    \
-    }
-
-SPMV_LAUNCHER(float, rocblas_sspmv)
-SPMV_LAUNCHER(double, rocblas_dspmv)
-
-#undef SPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, a_);
-        });
-    });
-}
-
-#define SPR_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                      \
-    void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,        \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a) { \
-        spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a);          \
-    }
-
-SPR_LAUNCHER(float, rocblas_sspr)
-SPR_LAUNCHER(double, rocblas_dspr)
-
-#undef SPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read>(cgh);
-        auto y_acc = y.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            auto y_ = sc.get_mem<rocDataType *>(y_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_);
-        });
-    });
-}
-
-#define SPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a) {                                                     \
-        spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a);             \
-    }
-
-SPR2_LAUNCHER(float, rocblas_sspr2)
-SPR2_LAUNCHER(double, rocblas_dspr2)
-
-#undef SPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, int64_t k, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, k, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,       \
-              int64_t incx) {                                                                   \
-        tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);     \
-    }
-
-TBMV_LAUNCHER(float, rocblas_stbmv)
-TBMV_LAUNCHER(double, rocblas_dtbmv)
-TBMV_LAUNCHER(std::complex<float>, rocblas_ctbmv)
-TBMV_LAUNCHER(std::complex<double>, rocblas_ztbmv)
-
-#undef TBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, int64_t k, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, k, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TBSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,       \
-              int64_t incx) {                                                                   \
-        tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);     \
-    }
-
-TBSV_LAUNCHER(float, rocblas_stbsv)
-TBSV_LAUNCHER(double, rocblas_dtbsv)
-TBSV_LAUNCHER(std::complex<float>, rocblas_ctbsv)
-TBSV_LAUNCHER(std::complex<double>, rocblas_ztbsv)
-
-#undef TBSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, x_, incx);
-        });
-    });
-}
-
-#define TPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx);             \
-    }
-
-TPMV_LAUNCHER(float, rocblas_stpmv)
-TPMV_LAUNCHER(double, rocblas_dtpmv)
-TPMV_LAUNCHER(std::complex<float>, rocblas_ctpmv)
-TPMV_LAUNCHER(std::complex<double>, rocblas_ztpmv)
-
-#undef TPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, x_, incx);
-        });
-    });
-}
-
-#define TPSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx);             \
-    }
-
-TPSV_LAUNCHER(float, rocblas_stpsv)
-TPSV_LAUNCHER(double, rocblas_dtpsv)
-TPSV_LAUNCHER(std::complex<float>, rocblas_ctpsv)
-TPSV_LAUNCHER(std::complex<double>, rocblas_ztpsv)
-
-#undef TPSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TRMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {  \
-        trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);        \
-    }
-
-TRMV_LAUNCHER(float, rocblas_strmv)
-TRMV_LAUNCHER(double, rocblas_dtrmv)
-TRMV_LAUNCHER(std::complex<float>, rocblas_ctrmv)
-TRMV_LAUNCHER(std::complex<double>, rocblas_ztrmv)
-
-#undef TRMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto x_acc = x.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto x_ = sc.get_mem<rocDataType *>(x_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, lda, x_, incx);
-        });
-    });
-}
-
-#define TRSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {  \
-        trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);        \
-    }
-
-TRSV_LAUNCHER(float, rocblas_strsv)
-TRSV_LAUNCHER(double, rocblas_dtrsv)
-TRSV_LAUNCHER(std::complex<float>, rocblas_ctrsv)
-TRSV_LAUNCHER(std::complex<double>, rocblas_ztrsv)
-
-#undef TRSV_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                        T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define GEMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,       \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,   \
-                    dependencies);                                                                \
-    }
-
-GEMV_LAUNCHER_USM(float, rocblas_sgemv)
-GEMV_LAUNCHER_USM(double, rocblas_dgemv)
-GEMV_LAUNCHER_USM(std::complex<float>, rocblas_cgemv)
-GEMV_LAUNCHER_USM(std::complex<double>, rocblas_zgemv)
-
-#undef GEMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                        int64_t kl, int64_t ku, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, m, lda, kl, ku, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, kl, ku,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define GBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,       \
-                     int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x,           \
-                     int64_t incx, TYPE beta, TYPE *y, int64_t incy,                              \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, \
-                    incy, dependencies);                                                          \
-    }
-
-GBMV_LAUNCHER_USM(float, rocblas_sgbmv)
-GBMV_LAUNCHER_USM(double, rocblas_dgbmv)
-GBMV_LAUNCHER_USM(std::complex<float>, rocblas_cgbmv)
-GBMV_LAUNCHER_USM(std::complex<double>, rocblas_zgbmv)
-
-#undef GBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, const T *x,
-                       int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
-                       const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, m, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<const rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, m, n, (rocDataType *)&alpha, x_, incx, y_,
-                                    incy, a_, lda);
-        });
-    });
-
-    return done;
-}
-
-#define GER_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE)                                             \
-    sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x,    \
-                         int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,        \
-                         const std::vector<sycl::event> &dependencies) {                         \
-        return ger(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); \
-    }
-
-GER_LAUNCHER_USM(, float, rocblas_sger)
-GER_LAUNCHER_USM(, double, rocblas_dger)
-GER_LAUNCHER_USM(u, std::complex<float>, rocblas_cgeru)
-GER_LAUNCHER_USM(u, std::complex<double>, rocblas_zgeru)
-GER_LAUNCHER_USM(c, std::complex<float>, rocblas_cgerc)
-GER_LAUNCHER_USM(c, std::complex<double>, rocblas_zgerc)
-
-#undef GER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                        T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define HBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,   \
-                    incy, dependencies);                                                          \
-    }
-
-HBMV_LAUNCHER_USM(std::complex<float>, rocblas_chbmv)
-HBMV_LAUNCHER_USM(std::complex<double>, rocblas_zhbmv)
-
-#undef HBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define HEMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a,   \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,   \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \
-                    dependencies);                                                                 \
-    }
-
-HEMV_LAUNCHER_USM(std::complex<float>, rocblas_chemv)
-HEMV_LAUNCHER_USM(std::complex<double>, rocblas_zhemv)
-
-#undef HEMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                       const ScalarType alpha, const DataType *x, int64_t incx, DataType *a,
-                       int64_t lda, const std::vector<sycl::event> &dependencies) {
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    overflow_check(n, lda, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocScalarType *)&alpha, x_, incx, a_, lda);
-        });
-    });
-
-    return done;
-}
-
-#define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha,     \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda,                  \
-                    const std::vector<sycl::event> &dependencies) {                               \
-        return her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \
-    }
-
-HER_LAUNCHER_USM(float, std::complex<float>, rocblas_cher)
-HER_LAUNCHER_USM(double, std::complex<double>, rocblas_zher)
-
-#undef HER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<const rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda);
-        });
-    });
-
-    return done;
-}
-
-#define HER2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,     \
-                    dependencies);                                                               \
-    }
-
-HER2_LAUNCHER_USM(std::complex<float>, rocblas_cher2)
-HER2_LAUNCHER_USM(std::complex<double>, rocblas_zher2)
-
-#undef HER2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_,
-                                    incy);
-        });
-    });
-
-    return done;
-}
-
-#define HPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,    \
-                    dependencies);                                                               \
-    }
-
-HPMV_LAUNCHER_USM(std::complex<float>, rocblas_chpmv)
-HPMV_LAUNCHER_USM(std::complex<double>, rocblas_zhpmv)
-
-#undef HPMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                       const ScalarType alpha, const DataType *x, int64_t incx, DataType *a,
-                       const std::vector<sycl::event> &dependencies) {
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    overflow_check(n, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocScalarType *)&alpha, x_, incx, a_);
-        });
-    });
-
-    return done;
-}
-
-#define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                             \
-    sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a,                           \
-                    const std::vector<sycl::event> &dependencies) {                           \
-        return hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies);  \
-    }
-
-HPR_LAUNCHER_USM(float, std::complex<float>, rocblas_chpr)
-HPR_LAUNCHER_USM(double, std::complex<double>, rocblas_zhpr)
-
-#undef HPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<const rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_);
-        });
-    });
-
-    return done;
-}
-
-#define HPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a,          \
-                    dependencies);                                                               \
-    }
-
-HPR2_LAUNCHER_USM(std::complex<float>, rocblas_chpr2)
-HPR2_LAUNCHER_USM(std::complex<double>, rocblas_zhpr2)
-
-#undef HPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                        T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define SBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,   \
-                    incy, dependencies);                                                          \
-    }
-
-SBMV_LAUNCHER_USM(float, rocblas_ssbmv)
-SBMV_LAUNCHER_USM(double, rocblas_dsbmv)
-
-#undef SBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta,
-                                    y_, incy);
-        });
-    });
-
-    return done;
-}
-
-#define SYMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a,   \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,   \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \
-                    dependencies);                                                                 \
-    }
-
-SYMV_LAUNCHER_USM(float, rocblas_ssymv)
-SYMV_LAUNCHER_USM(double, rocblas_dsymv)
-
-#undef SYMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                       const T *x, int64_t incx, T *a, int64_t lda,
-                       const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, a_, lda);
-        });
-    });
-
-    return done;
-}
-
-#define SYR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x,   \
-                    int64_t incx, TYPE *a, int64_t lda,                                           \
-                    const std::vector<sycl::event> &dependencies) {                               \
-        return syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \
-    }
-
-SYR_LAUNCHER_USM(float, rocblas_ssyr)
-SYR_LAUNCHER_USM(double, rocblas_dsyr)
-// Intel does not support the following two
-SYR_LAUNCHER_USM(std::complex<float>, rocblas_csyr)
-SYR_LAUNCHER_USM(std::complex<double>, rocblas_zsyr)
-
-#undef SYR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<const rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda);
-        });
-    });
-
-    return done;
-}
-
-#define SYR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,     \
-                    dependencies);                                                               \
-    }
-
-SYR2_LAUNCHER_USM(float, rocblas_ssyr2)
-SYR2_LAUNCHER_USM(double, rocblas_dsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER_USM(std::complex<float>, rocblas_csyr2)
-SYR2_LAUNCHER_USM(std::complex<double>, rocblas_zsyr2)
-
-#undef SYR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_,
-                                    incy);
-        });
-    });
-
-    return done;
-}
-
-#define SPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,    \
-                    dependencies);                                                               \
-    }
-
-SPMV_LAUNCHER_USM(float, rocblas_sspmv)
-SPMV_LAUNCHER_USM(double, rocblas_dspmv)
-
-#undef SPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                       const T *x, int64_t incx, T *a,
-                       const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, a_);
-        });
-    });
-
-    return done;
-}
-
-#define SPR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                    int64_t incx, TYPE *a, const std::vector<sycl::event> &dependencies) {      \
-        return spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies);    \
-    }
-
-SPR_LAUNCHER_USM(float, rocblas_sspr)
-SPR_LAUNCHER_USM(double, rocblas_dspr)
-
-#undef SPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx, incy);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<rocDataType *>(a);
-            auto x_ = reinterpret_cast<const rocDataType *>(x);
-            auto y_ = reinterpret_cast<const rocDataType *>(y);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n,
-                                    (rocDataType *)&alpha, x_, incx, y_, incy, a_);
-        });
-    });
-
-    return done;
-}
-
-#define SPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a,          \
-                    dependencies);                                                               \
-    }
-
-SPR2_LAUNCHER_USM(float, rocblas_sspr2)
-SPR2_LAUNCHER_USM(double, rocblas_dspr2)
-
-#undef SPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, k, a_, lda, x_, incx);
-        });
-    });
-
-    return done;
-}
-
-#define TBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,       \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,     \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \
-                    dependencies);                                                                \
-    }
-
-TBMV_LAUNCHER_USM(float, rocblas_stbmv)
-TBMV_LAUNCHER_USM(double, rocblas_dtbmv)
-TBMV_LAUNCHER_USM(std::complex<float>, rocblas_ctbmv)
-TBMV_LAUNCHER_USM(std::complex<double>, rocblas_ztbmv)
-
-#undef TBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, k, a_, lda, x_, incx);
-        });
-    });
-
-    return done;
-}
-
-#define TBSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,       \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,     \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \
-                    dependencies);                                                                \
-    }
-
-TBSV_LAUNCHER_USM(float, rocblas_stbsv)
-TBSV_LAUNCHER_USM(double, rocblas_dtbsv)
-TBSV_LAUNCHER_USM(std::complex<float>, rocblas_ctbsv)
-TBSV_LAUNCHER_USM(std::complex<double>, rocblas_ztbsv)
-
-#undef TBSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, x_, incx);
-        });
-    });
-
-    return done;
-}
-
-#define TPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                            \
-    sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                       \
-                     const std::vector<sycl::event> &dependencies) {                        \
-        return tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx,   \
-                    dependencies);                                                          \
-    }
-
-TPMV_LAUNCHER_USM(float, rocblas_stpmv)
-TPMV_LAUNCHER_USM(double, rocblas_dtpmv)
-TPMV_LAUNCHER_USM(std::complex<float>, rocblas_ctpmv)
-TPMV_LAUNCHER_USM(std::complex<double>, rocblas_ztpmv)
-
-#undef TPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, x_, incx);
-        });
-    });
-
-    return done;
-}
-
-#define TPSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                            \
-    sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                       \
-                     const std::vector<sycl::event> &dependencies) {                        \
-        return tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx,   \
-                    dependencies);                                                          \
-    }
-
-TPSV_LAUNCHER_USM(float, rocblas_stpsv)
-TPSV_LAUNCHER_USM(double, rocblas_dtpsv)
-TPSV_LAUNCHER_USM(std::complex<float>, rocblas_ctpsv)
-TPSV_LAUNCHER_USM(std::complex<double>, rocblas_ztpsv)
-
-#undef TPSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, lda, x_, incx);
-        });
-    });
-
-    return done;
-}
-
-#define TRMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,    \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,             \
-                     const std::vector<sycl::event> &dependencies) {                           \
-        return trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \
-                    dependencies);                                                             \
-    }
-
-TRMV_LAUNCHER_USM(float, rocblas_strmv)
-TRMV_LAUNCHER_USM(double, rocblas_dtrmv)
-TRMV_LAUNCHER_USM(std::complex<float>, rocblas_ctrmv)
-TRMV_LAUNCHER_USM(std::complex<double>, rocblas_ztrmv)
-
-#undef TRMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, lda, incx);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto x_ = reinterpret_cast<rocDataType *>(x);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    n, a_, lda, x_, incx);
-        });
-    });
-
-    return done;
-}
-
-#define TRSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,    \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,             \
-                     const std::vector<sycl::event> &dependencies) {                           \
-        return trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \
-                    dependencies);                                                             \
-    }
-
-TRSV_LAUNCHER_USM(float, rocblas_strsv)
-TRSV_LAUNCHER_USM(double, rocblas_dtrsv)
-TRSV_LAUNCHER_USM(std::complex<float>, rocblas_ctrsv)
-TRSV_LAUNCHER_USM(std::complex<double>, rocblas_ztrsv)
-
-#undef TRSV_LAUNCHER_USM
-
-} // namespace column_major
-
-namespace row_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                 std::complex<T> alpha, sycl::buffer<std::complex<T>, 1> &a, int64_t lda,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx, std::complex<T> beta,
-                 sycl::buffer<std::complex<T>, 1> &y, int64_t incy) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        alpha = std::conj(alpha);
-        beta = std::conj(beta);
-
-        if (m > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, m, incx); });
-
-            if (n > 0) {
-                queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-            }
-        }
-    }
-
-    column_major::gemv(func, queue, new_trans, n, m, alpha, a, lda, x, incx, beta, y, incy);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::gemv(func, queue, new_trans, n, m, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-#define GEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,         \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                           \
-        gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);    \
-    }
-
-GEMV_LAUNCHER(float, rocblas_sgemv)
-GEMV_LAUNCHER(double, rocblas_dgemv)
-GEMV_LAUNCHER(std::complex<float>, rocblas_cgemv)
-GEMV_LAUNCHER(std::complex<double>, rocblas_zgemv)
-
-#undef GEMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,
-                 int64_t ku, std::complex<T> alpha, sycl::buffer<std::complex<T>, 1> &a,
-                 int64_t lda, sycl::buffer<std::complex<T>, 1> &x, int64_t incx,
-                 std::complex<T> beta, sycl::buffer<std::complex<T>, 1> &y, int64_t incy) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        alpha = std::conj(alpha);
-        beta = std::conj(beta);
-
-        if (m > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, m, incx); });
-
-            if (n > 0) {
-                queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-            }
-        }
-    }
-
-    column_major::gbmv(func, queue, new_trans, n, m, ku, kl, alpha, a, lda, x, incx, beta, y, incy);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,
-                 int64_t ku, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx, T beta, sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::gbmv(func, queue, new_trans, n, m, ku, kl, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-#define GBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                      \
-    void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku,  \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,        \
-              int64_t incx, TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                  \
-        gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-GBMV_LAUNCHER(float, rocblas_sgbmv)
-GBMV_LAUNCHER(double, rocblas_dgbmv)
-GBMV_LAUNCHER(std::complex<float>, rocblas_cgbmv)
-GBMV_LAUNCHER(std::complex<double>, rocblas_zgbmv)
-
-#undef GBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void gerc(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex<T> alpha,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx,
-                 sycl::buffer<std::complex<T>, 1> &y, int64_t incy,
-                 sycl::buffer<std::complex<T>, 1> &a, int64_t lda) {
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-    }
-
-    column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda);
-}
-
-template <typename Func, typename T>
-inline void geru(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex<T> alpha,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx,
-                 sycl::buffer<std::complex<T>, 1> &y, int64_t incy,
-                 sycl::buffer<std::complex<T>, 1> &a, int64_t lda) {
-    column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda);
-}
-
-template <typename Func, typename T>
-inline void ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &x,
-                int64_t incx, sycl::buffer<T, 1> &y, int64_t incy, sycl::buffer<T, 1> &a,
-                int64_t lda) {
-    column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda);
-}
-
-#define GER_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE)                                                  \
-    void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &x, \
-                  int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, sycl::buffer<TYPE, 1> &a, \
-                  int64_t lda) {                                                                  \
-        ger##EXT(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda);                  \
-    }
-
-GER_LAUNCHER(, float, rocblas_sger)
-GER_LAUNCHER(, double, rocblas_dger)
-GER_LAUNCHER(u, std::complex<float>, rocblas_cgeru)
-GER_LAUNCHER(u, std::complex<double>, rocblas_zgeru)
-GER_LAUNCHER(c, std::complex<float>, rocblas_cgeru)
-GER_LAUNCHER(c, std::complex<double>, rocblas_zgeru)
-
-#undef GER_LAUNCHER
-
-template <typename Func, typename T>
-inline void hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_alpha = std::conj(alpha);
-    auto new_beta = std::conj(beta);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); });
-    }
-
-    column_major::hbmv(func, queue, new_uplo, n, k, new_alpha, a, lda, x, incx, new_beta, y, incy);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-    }
-}
-
-#define HBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-HBMV_LAUNCHER(std::complex<float>, rocblas_chbmv)
-HBMV_LAUNCHER(std::complex<double>, rocblas_zhbmv)
-
-#undef HBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_alpha = std::conj(alpha);
-    auto new_beta = std::conj(beta);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); });
-    }
-
-    column_major::hemv(func, queue, new_uplo, n, new_alpha, a, lda, x, incx, new_beta, y, incy);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-    }
-}
-
-#define HEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                           \
-        hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-HEMV_LAUNCHER(std::complex<float>, rocblas_chemv)
-HEMV_LAUNCHER(std::complex<double>, rocblas_zhemv)
-
-#undef HEMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha,
-                sycl::buffer<DataType, 1> &x, int64_t incx, sycl::buffer<DataType, 1> &a,
-                int64_t lda) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-    }
-
-    column_major::her(func, queue, new_uplo, n, alpha, x, incx, a, lda);
-}
-
-#define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                            \
-    void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,         \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a, \
-             int64_t lda) {                                                              \
-        her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda);             \
-    }
-
-HER_LAUNCHER(float, std::complex<float>, rocblas_cher)
-HER_LAUNCHER(double, std::complex<double>, rocblas_zher)
-
-#undef HER_LAUNCHER
-
-template <typename Func, typename T>
-inline void her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); });
-    }
-
-    column_major::her2(func, queue, new_uplo, n, alpha, y, incy, x, incx, a, lda);
-}
-
-#define HER2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                        \
-        her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);        \
-    }
-
-HER2_LAUNCHER(std::complex<float>, rocblas_cher2)
-HER2_LAUNCHER(std::complex<double>, rocblas_zher2)
-
-#undef HER2_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_alpha = std::conj(alpha);
-    auto new_beta = std::conj(beta);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); });
-    }
-
-    column_major::hpmv(func, queue, new_uplo, n, new_alpha, a, x, incx, new_beta, y, incy);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-    }
-}
-
-#define HPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                 \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                    \
-        hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);    \
-    }
-
-HPMV_LAUNCHER(std::complex<float>, rocblas_chpmv)
-HPMV_LAUNCHER(std::complex<double>, rocblas_zhpmv)
-
-#undef HPMV_LAUNCHER
-
-template <typename Func, typename ScalarType, typename DataType>
-inline void hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha,
-                sycl::buffer<DataType, 1> &x, int64_t incx, sycl::buffer<DataType, 1> &a) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-    }
-
-    column_major::hpr(func, queue, new_uplo, n, alpha, x, incx, a);
-}
-
-#define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                              \
-    void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha,           \
-             sycl::buffer<DATA_TYPE, 1> &x, int64_t incx, sycl::buffer<DATA_TYPE, 1> &a) { \
-        hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a);                    \
-    }
-
-HPR_LAUNCHER(float, std::complex<float>, rocblas_chpr)
-HPR_LAUNCHER(double, std::complex<double>, rocblas_zhpr)
-
-#undef HPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); });
-    }
-
-    column_major::hpr2(func, queue, new_uplo, n, alpha, y, incy, x, incx, a);
-}
-
-#define HPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a) {                                                     \
-        hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a);             \
-    }
-
-HPR2_LAUNCHER(std::complex<float>, rocblas_chpr2)
-HPR2_LAUNCHER(std::complex<double>, rocblas_zhpr2)
-
-#undef HPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::sbmv(func, queue, new_uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-#define SBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,           \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx,    \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                              \
-        sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-SBMV_LAUNCHER(float, rocblas_ssbmv)
-SBMV_LAUNCHER(double, rocblas_dsbmv)
-
-#undef SBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::symv(func, queue, new_uplo, n, alpha, a, lda, x, incx, beta, y, incy);
-}
-
-#define SYMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                   \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx, \
-              TYPE beta, sycl::buffer<TYPE, 1> &y, int64_t incy) {                           \
-        symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \
-    }
-
-SYMV_LAUNCHER(float, rocblas_ssymv)
-SYMV_LAUNCHER(double, rocblas_dsymv)
-
-#undef SYMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a, int64_t lda) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::syr(func, queue, new_uplo, n, alpha, x, incx, a, lda);
-}
-
-#define SYR_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                   \
-    void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                     \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a, int64_t lda) { \
-        syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda);                  \
-    }
-
-SYR_LAUNCHER(float, rocblas_ssyr)
-SYR_LAUNCHER(double, rocblas_dsyr)
-// Intel does not support the following two
-SYR_LAUNCHER(std::complex<float>, rocblas_csyr)
-SYR_LAUNCHER(std::complex<double>, rocblas_zsyr)
-
-#undef SYR_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a, int64_t lda) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::syr2(func, queue, new_uplo, n, alpha, x, incx, y, incy, a, lda);
-}
-
-#define SYR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda) {                                        \
-        syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda);        \
-    }
-
-SYR2_LAUNCHER(float, rocblas_ssyr2)
-SYR2_LAUNCHER(double, rocblas_dsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER(std::complex<float>, rocblas_csyr2)
-SYR2_LAUNCHER(std::complex<double>, rocblas_zsyr2)
-
-#undef SYR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx, T beta,
-                 sycl::buffer<T, 1> &y, int64_t incy) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::spmv(func, queue, new_uplo, n, alpha, a, x, incx, beta, y, incy);
-}
-
-#define SPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                               \
-    void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                 \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx, TYPE beta, \
-              sycl::buffer<TYPE, 1> &y, int64_t incy) {                                    \
-        spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy);    \
-    }
-
-SPMV_LAUNCHER(float, rocblas_sspmv)
-SPMV_LAUNCHER(double, rocblas_dspmv)
-
-#undef SPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &a) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::spr(func, queue, new_uplo, n, alpha, x, incx, a);
-}
-
-#define SPR_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                      \
-    void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,        \
-             sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &a) { \
-        spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a);          \
-    }
-
-SPR_LAUNCHER(float, rocblas_sspr)
-SPR_LAUNCHER(double, rocblas_dspr)
-
-#undef SPR_LAUNCHER
-
-template <typename Func, typename T>
-inline void spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                 sycl::buffer<T, 1> &x, int64_t incx, sycl::buffer<T, 1> &y, int64_t incy,
-                 sycl::buffer<T, 1> &a) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::spr2(func, queue, new_uplo, n, alpha, x, incx, y, incy, a);
-}
-
-#define SPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha,                    \
-              sycl::buffer<TYPE, 1> &x, int64_t incx, sycl::buffer<TYPE, 1> &y, int64_t incy, \
-              sycl::buffer<TYPE, 1> &a) {                                                     \
-        spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a);             \
-    }
-
-SPR2_LAUNCHER(float, rocblas_sspr2)
-SPR2_LAUNCHER(double, rocblas_dspr2)
-
-#undef SPR2_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, int64_t k, sycl::buffer<std::complex<T>, 1> &a, int64_t lda,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-
-    column_major::tbmv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, int64_t k, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::tbmv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-#define TBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,       \
-              int64_t incx) {                                                                   \
-        tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);     \
-    }
-
-TBMV_LAUNCHER(float, rocblas_stbmv)
-TBMV_LAUNCHER(double, rocblas_dtbmv)
-TBMV_LAUNCHER(std::complex<float>, rocblas_ctbmv)
-TBMV_LAUNCHER(std::complex<double>, rocblas_ztbmv)
-
-#undef TBMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, int64_t k, sycl::buffer<std::complex<T>, 1> &a, int64_t lda,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-
-    column_major::tbsv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, int64_t k, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::tbsv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx);
-}
-
-#define TBSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              int64_t k, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x,       \
-              int64_t incx) {                                                                   \
-        tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);     \
-    }
-
-TBSV_LAUNCHER(float, rocblas_stbsv)
-TBSV_LAUNCHER(double, rocblas_dtbsv)
-TBSV_LAUNCHER(std::complex<float>, rocblas_ctbsv)
-TBSV_LAUNCHER(std::complex<double>, rocblas_ztbsv)
-
-#undef TBSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<std::complex<T>, 1> &a,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-
-    column_major::tpmv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::tpmv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx);
-}
-
-#define TPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx);             \
-    }
-
-TPMV_LAUNCHER(float, rocblas_stpmv)
-TPMV_LAUNCHER(double, rocblas_dtpmv)
-TPMV_LAUNCHER(std::complex<float>, rocblas_ctpmv)
-TPMV_LAUNCHER(std::complex<double>, rocblas_ztpmv)
-
-#undef TPMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<std::complex<T>, 1> &a,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-
-    column_major::tpsv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, sycl::buffer<T, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::tpsv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx);
-}
-
-#define TPSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, sycl::buffer<TYPE, 1> &x, int64_t incx) {               \
-        tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx);             \
-    }
-
-TPSV_LAUNCHER(float, rocblas_stpsv)
-TPSV_LAUNCHER(double, rocblas_dtpsv)
-TPSV_LAUNCHER(std::complex<float>, rocblas_ctpsv)
-TPSV_LAUNCHER(std::complex<double>, rocblas_ztpsv)
-
-#undef TPSV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<std::complex<T>, 1> &a, int64_t lda,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-
-    column_major::trmv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::trmv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx);
-}
-
-#define TRMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {  \
-        trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);        \
-    }
-
-TRMV_LAUNCHER(float, rocblas_strmv)
-TRMV_LAUNCHER(double, rocblas_dtrmv)
-TRMV_LAUNCHER(std::complex<float>, rocblas_ctrmv)
-TRMV_LAUNCHER(std::complex<double>, rocblas_ztrmv)
-
-#undef TRMV_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<std::complex<T>, 1> &a, int64_t lda,
-                 sycl::buffer<std::complex<T>, 1> &x, int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-
-    column_major::trsv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); });
-        }
-    }
-}
-
-template <typename Func, typename T>
-inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,
-                 int64_t n, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &x,
-                 int64_t incx) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::trsv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx);
-}
-
-#define TRSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \
-              sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &x, int64_t incx) {  \
-        trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx);        \
-    }
-
-TRSV_LAUNCHER(float, rocblas_strsv)
-TRSV_LAUNCHER(double, rocblas_dtrsv)
-TRSV_LAUNCHER(std::complex<float>, rocblas_ctrsv)
-TRSV_LAUNCHER(std::complex<double>, rocblas_ztrsv)
-
-#undef TRSV_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                        std::complex<T> alpha, const std::complex<T> *a, int64_t lda,
-                        const std::complex<T> *x, int64_t incx, std::complex<T> beta,
-                        std::complex<T> *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        alpha = std::conj(alpha);
-        beta = std::conj(beta);
-
-        if (m > 0) {
-            done = queue.submit(
-                [&](sycl::handler &cgh) { conj_vector(cgh, (std::complex<T> *)x, m, incx); });
-
-            if (n > 0) {
-                done = queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-            }
-        }
-    }
-
-    done.wait_and_throw();
-
-    done = column_major::gemv(func, queue, new_trans, n, m, alpha, a, lda, x, incx, beta, y, incy,
-                              dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, y, n, incy);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                        T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::gemv(func, queue, new_trans, n, m, alpha, a, lda, x, incx, beta, y, incy,
-                              dependencies);
-}
-
-#define GEMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha,       \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy,   \
-                    dependencies);                                                                \
-    }
-
-GEMV_LAUNCHER_USM(float, rocblas_sgemv)
-GEMV_LAUNCHER_USM(double, rocblas_dgemv)
-GEMV_LAUNCHER_USM(std::complex<float>, rocblas_cgemv)
-GEMV_LAUNCHER_USM(std::complex<double>, rocblas_zgemv)
-
-#undef GEMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                        int64_t kl, int64_t ku, std::complex<T> alpha, const std::complex<T> *a,
-                        int64_t lda, const std::complex<T> *x, int64_t incx, std::complex<T> beta,
-                        std::complex<T> *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        alpha = std::conj(alpha);
-        beta = std::conj(beta);
-
-        if (m > 0) {
-            done = queue.submit(
-                [&](sycl::handler &cgh) { conj_vector(cgh, (std::complex<T> *)x, m, incx); });
-
-            if (n > 0) {
-                done = queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); });
-            }
-        }
-    }
-
-    done.wait_and_throw();
-
-    done = column_major::gbmv(func, queue, new_trans, n, m, ku, kl, alpha, a, lda, x, incx, beta, y,
-                              incy, dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, y, n, incy);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n,
-                        int64_t kl, int64_t ku, T alpha, const T *a, int64_t lda, const T *x,
-                        int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::gbmv(func, queue, new_trans, n, m, ku, kl, alpha, a, lda, x, incx, beta, y,
-                              incy, dependencies);
-}
-
-#define GBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl,       \
-                     int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x,           \
-                     int64_t incx, TYPE beta, TYPE *y, int64_t incy,                              \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, \
-                    incy, dependencies);                                                          \
-    }
-
-GBMV_LAUNCHER_USM(float, rocblas_sgbmv)
-GBMV_LAUNCHER_USM(double, rocblas_dgbmv)
-GBMV_LAUNCHER_USM(std::complex<float>, rocblas_cgbmv)
-GBMV_LAUNCHER_USM(std::complex<double>, rocblas_zgbmv)
-
-#undef GBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event gerc(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex<T> alpha,
-                        const std::complex<T> *x, int64_t incx, const std::complex<T> *y,
-                        int64_t incy, std::complex<T> *a, int64_t lda,
-                        const std::vector<sycl::event> &dependencies) {
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (std::complex<T> *)y, n, incy); })
-            .wait_and_throw();
-    }
-
-    return column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda, dependencies);
-}
-
-template <typename Func, typename T>
-inline sycl::event geru(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex<T> alpha,
-                        const std::complex<T> *x, int64_t incx, const std::complex<T> *y,
-                        int64_t incy, std::complex<T> *a, int64_t lda,
-                        const std::vector<sycl::event> &dependencies) {
-    return column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda, dependencies);
-}
-
-template <typename Func, typename T>
-inline sycl::event ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, const T *x,
-                       int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
-                       const std::vector<sycl::event> &dependencies) {
-    return column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda, dependencies);
-}
-
-#define GER_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE)                                          \
-    sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x, \
-                         int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,     \
-                         const std::vector<sycl::event> &dependencies) {                      \
-        return ger##EXT(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda,        \
-                        dependencies);                                                        \
-    }
-
-GER_LAUNCHER_USM(, float, rocblas_sger)
-GER_LAUNCHER_USM(, double, rocblas_dger)
-GER_LAUNCHER_USM(u, std::complex<float>, rocblas_cgeru)
-GER_LAUNCHER_USM(u, std::complex<double>, rocblas_zgeru)
-GER_LAUNCHER_USM(c, std::complex<float>, rocblas_cgeru)
-GER_LAUNCHER_USM(c, std::complex<double>, rocblas_zgeru)
-
-#undef GER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                        T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_alpha = std::conj(alpha);
-    auto new_beta = std::conj(beta);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, y, n, incx, incy); })
-            .wait_and_throw();
-    }
-
-    done = column_major::hbmv(func, queue, new_uplo, n, k, new_alpha, a, lda, x, incx, new_beta, y,
-                              incy, dependencies);
-
-    if (n > 0) {
-        done = queue.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(done);
-            conj_vector(cgh, y, n, incy);
-        });
-    }
-
-    return done;
-}
-
-#define HBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,   \
-                    incy, dependencies);                                                          \
-    }
-
-HBMV_LAUNCHER_USM(std::complex<float>, rocblas_chbmv)
-HBMV_LAUNCHER_USM(std::complex<double>, rocblas_zhbmv)
-
-#undef HBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_alpha = std::conj(alpha);
-    auto new_beta = std::conj(beta);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, y, n, incx, incy); })
-            .wait_and_throw();
-    }
-
-    done = column_major::hemv(func, queue, new_uplo, n, new_alpha, a, lda, x, incx, new_beta, y,
-                              incy, dependencies);
-
-    if (n > 0) {
-        done = queue.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(done);
-            conj_vector(cgh, y, n, incy);
-        });
-    }
-
-    return done;
-}
-
-#define HEMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a,   \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,   \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \
-                    dependencies);                                                                 \
-    }
-
-HEMV_LAUNCHER_USM(std::complex<float>, rocblas_chemv)
-HEMV_LAUNCHER_USM(std::complex<double>, rocblas_zhemv)
-
-#undef HEMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                       const ScalarType alpha, const DataType *x, int64_t incx, DataType *a,
-                       int64_t lda, const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (DataType *)x, n, incx); })
-            .wait_and_throw();
-    }
-
-    return column_major::her(func, queue, new_uplo, n, alpha, x, incx, a, lda, dependencies);
-}
-
-#define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha,     \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda,                  \
-                    const std::vector<sycl::event> &dependencies) {                               \
-        return her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \
-    }
-
-HER_LAUNCHER_USM(float, std::complex<float>, rocblas_cher)
-HER_LAUNCHER_USM(double, std::complex<double>, rocblas_zher)
-
-#undef HER_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, (T *)y, n, incx, incy); })
-            .wait_and_throw();
-    }
-
-    return column_major::her2(func, queue, new_uplo, n, alpha, y, incy, x, incx, a, lda,
-                              dependencies);
-}
-
-#define HER2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,     \
-                    dependencies);                                                               \
-    }
-
-HER2_LAUNCHER_USM(std::complex<float>, rocblas_cher2)
-HER2_LAUNCHER_USM(std::complex<double>, rocblas_zher2)
-
-#undef HER2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_alpha = std::conj(alpha);
-    auto new_beta = std::conj(beta);
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, y, n, incx, incy); })
-            .wait_and_throw();
-    }
-
-    done = column_major::hpmv(func, queue, new_uplo, n, new_alpha, a, x, incx, new_beta, y, incy,
-                              dependencies);
-
-    if (n > 0) {
-        done = queue.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(done);
-            conj_vector(cgh, y, n, incy);
-        });
-    }
-
-    return done;
-}
-
-#define HPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,    \
-                    dependencies);                                                               \
-    }
-
-HPMV_LAUNCHER_USM(std::complex<float>, rocblas_chpmv)
-HPMV_LAUNCHER_USM(std::complex<double>, rocblas_zhpmv)
-
-#undef HPMV_LAUNCHER_USM
-
-template <typename Func, typename ScalarType, typename DataType>
-inline sycl::event hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n,
-                       const ScalarType alpha, const DataType *x, int64_t incx, DataType *a,
-                       const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (DataType *)x, n, incx); })
-            .wait_and_throw();
-    }
-
-    return column_major::hpr(func, queue, new_uplo, n, alpha, x, incx, a, dependencies);
-}
-
-#define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE)                             \
-    sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \
-                    const DATA_TYPE *x, int64_t incx, DATA_TYPE *a,                           \
-                    const std::vector<sycl::event> &dependencies) {                           \
-        return hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies);  \
-    }
-
-HPR_LAUNCHER_USM(float, std::complex<float>, rocblas_chpr)
-HPR_LAUNCHER_USM(double, std::complex<double>, rocblas_zhpr)
-
-#undef HPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    if (n > 0) {
-        queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, (T *)y, n, incx, incy); })
-            .wait_and_throw();
-    }
-
-    return column_major::hpr2(func, queue, new_uplo, n, alpha, y, incy, x, incx, a, dependencies);
-}
-
-#define HPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a,          \
-                    dependencies);                                                               \
-    }
-
-HPR2_LAUNCHER_USM(std::complex<float>, rocblas_chpr2)
-HPR2_LAUNCHER_USM(std::complex<double>, rocblas_zhpr2)
-
-#undef HPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k,
-                        T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::sbmv(func, queue, new_uplo, n, k, alpha, a, lda, x, incx, beta, y, incy,
-                              dependencies);
-}
-
-#define SBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha,      \
-                     const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \
-                     int64_t incy, const std::vector<sycl::event> &dependencies) {                \
-        return sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y,   \
-                    incy, dependencies);                                                          \
-    }
-
-SBMV_LAUNCHER_USM(float, rocblas_ssbmv)
-SBMV_LAUNCHER_USM(double, rocblas_dsbmv)
-
-#undef SBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y,
-                        int64_t incy, const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::symv(func, queue, new_uplo, n, alpha, a, lda, x, incx, beta, y, incy,
-                              dependencies);
-}
-
-#define SYMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a,   \
-                     int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,   \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \
-                    dependencies);                                                                 \
-    }
-
-SYMV_LAUNCHER_USM(float, rocblas_ssymv)
-SYMV_LAUNCHER_USM(double, rocblas_dsymv)
-
-#undef SYMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                       const T *x, int64_t incx, T *a, int64_t lda,
-                       const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::syr(func, queue, new_uplo, n, alpha, x, incx, a, lda, dependencies);
-}
-
-#define SYR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x,   \
-                    int64_t incx, TYPE *a, int64_t lda,                                           \
-                    const std::vector<sycl::event> &dependencies) {                               \
-        return syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \
-    }
-
-SYR_LAUNCHER_USM(float, rocblas_ssyr)
-SYR_LAUNCHER_USM(double, rocblas_dsyr)
-// Intel does not support the following two
-SYR_LAUNCHER_USM(std::complex<float>, rocblas_csyr)
-SYR_LAUNCHER_USM(std::complex<double>, rocblas_zsyr)
-
-#undef SYR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::syr2(func, queue, new_uplo, n, alpha, x, incx, y, incy, a, lda,
-                              dependencies);
-}
-
-#define SYR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda,            \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda,     \
-                    dependencies);                                                               \
-    }
-
-SYR2_LAUNCHER_USM(float, rocblas_ssyr2)
-SYR2_LAUNCHER_USM(double, rocblas_dsyr2)
-// Intel does not support the following two
-SYR2_LAUNCHER_USM(std::complex<float>, rocblas_csyr2)
-SYR2_LAUNCHER_USM(std::complex<double>, rocblas_zsyr2)
-
-#undef SYR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::spmv(func, queue, new_uplo, n, alpha, a, x, incx, beta, y, incy,
-                              dependencies);
-}
-
-#define SPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \
-                     const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy,              \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy,    \
-                    dependencies);                                                               \
-    }
-
-SPMV_LAUNCHER_USM(float, rocblas_sspmv)
-SPMV_LAUNCHER_USM(double, rocblas_dspmv)
-
-#undef SPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                       const T *x, int64_t incx, T *a,
-                       const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::spr(func, queue, new_uplo, n, alpha, x, incx, a, dependencies);
-}
-
-#define SPR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                    int64_t incx, TYPE *a, const std::vector<sycl::event> &dependencies) {      \
-        return spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies);    \
-    }
-
-SPR_LAUNCHER_USM(float, rocblas_sspr)
-SPR_LAUNCHER_USM(double, rocblas_dspr)
-
-#undef SPR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha,
-                        const T *x, int64_t incx, const T *y, int64_t incy, T *a,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::spr2(func, queue, new_uplo, n, alpha, x, incx, y, incy, a, dependencies);
-}
-
-#define SPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                 \
-    sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \
-                     int64_t incx, const TYPE *y, int64_t incy, TYPE *a,                         \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        return spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a,          \
-                    dependencies);                                                               \
-    }
-
-SPR2_LAUNCHER_USM(float, rocblas_sspr2)
-SPR2_LAUNCHER_USM(double, rocblas_dspr2)
-
-#undef SPR2_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, int64_t k, const std::complex<T> *a, int64_t lda,
-                        std::complex<T> *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); })
-                .wait_and_throw();
-        }
-    }
-
-    done = column_major::tbmv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx,
-                              dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, x, n, incx);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::tbmv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx,
-                              dependencies);
-}
-
-#define TBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,       \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,     \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \
-                    dependencies);                                                                \
-    }
-
-TBMV_LAUNCHER_USM(float, rocblas_stbmv)
-TBMV_LAUNCHER_USM(double, rocblas_dtbmv)
-TBMV_LAUNCHER_USM(std::complex<float>, rocblas_ctbmv)
-TBMV_LAUNCHER_USM(std::complex<double>, rocblas_ztbmv)
-
-#undef TBMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, int64_t k, const std::complex<T> *a, int64_t lda,
-                        std::complex<T> *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); })
-                .wait_and_throw();
-        }
-    }
-
-    done = column_major::tbsv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx,
-                              dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, x, n, incx);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::tbsv(func, queue, new_uplo, new_trans, unit_diag, n, k, a, lda, x, incx,
-                              dependencies);
-}
-
-#define TBSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,       \
-                     int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,     \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \
-                    dependencies);                                                                \
-    }
-
-TBSV_LAUNCHER_USM(float, rocblas_stbsv)
-TBSV_LAUNCHER_USM(double, rocblas_dtbsv)
-TBSV_LAUNCHER_USM(std::complex<float>, rocblas_ctbsv)
-TBSV_LAUNCHER_USM(std::complex<double>, rocblas_ztbsv)
-
-#undef TBSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const std::complex<T> *a, std::complex<T> *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); })
-                .wait_and_throw();
-        }
-    }
-
-    done = column_major::tpmv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx,
-                              dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            incx = std::abs(incx);
-
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, x, n, incx);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::tpmv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx,
-                              dependencies);
-}
-
-#define TPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                            \
-    sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                       \
-                     const std::vector<sycl::event> &dependencies) {                        \
-        return tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx,   \
-                    dependencies);                                                          \
-    }
-
-TPMV_LAUNCHER_USM(float, rocblas_stpmv)
-TPMV_LAUNCHER_USM(double, rocblas_dtpmv)
-TPMV_LAUNCHER_USM(std::complex<float>, rocblas_ctpmv)
-TPMV_LAUNCHER_USM(std::complex<double>, rocblas_ztpmv)
-
-#undef TPMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const std::complex<T> *a, std::complex<T> *x,
-                        int64_t incx, const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); })
-                .wait_and_throw();
-        }
-    }
-
-    done = column_major::tpsv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx,
-                              dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            incx = std::abs(incx);
-
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, x, n, incx);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::tpsv(func, queue, new_uplo, new_trans, unit_diag, n, a, x, incx,
-                              dependencies);
-}
-
-#define TPSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                            \
-    sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \
-                     int64_t n, const TYPE *a, TYPE *x, int64_t incx,                       \
-                     const std::vector<sycl::event> &dependencies) {                        \
-        return tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx,   \
-                    dependencies);                                                          \
-    }
-
-TPSV_LAUNCHER_USM(float, rocblas_stpsv)
-TPSV_LAUNCHER_USM(double, rocblas_dtpsv)
-TPSV_LAUNCHER_USM(std::complex<float>, rocblas_ctpsv)
-TPSV_LAUNCHER_USM(std::complex<double>, rocblas_ztpsv)
-
-#undef TPSV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const std::complex<T> *a, int64_t lda,
-                        std::complex<T> *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); })
-                .wait_and_throw();
-        }
-    }
-
-    done = column_major::trmv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx,
-                              dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, x, n, incx);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::trmv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx,
-                              dependencies);
-}
-
-#define TRMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,    \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,             \
-                     const std::vector<sycl::event> &dependencies) {                           \
-        return trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \
-                    dependencies);                                                             \
-    }
-
-TRMV_LAUNCHER_USM(float, rocblas_strmv)
-TRMV_LAUNCHER_USM(double, rocblas_dtrmv)
-TRMV_LAUNCHER_USM(std::complex<float>, rocblas_ctrmv)
-TRMV_LAUNCHER_USM(std::complex<double>, rocblas_ztrmv)
-
-#undef TRMV_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const std::complex<T> *a, int64_t lda,
-                        std::complex<T> *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    sycl::event done;
-
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); })
-                .wait_and_throw();
-        }
-    }
-
-    done = column_major::trsv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx,
-                              dependencies);
-
-    if (trans == oneapi::mkl::transpose::conjtrans) {
-        if (n > 0) {
-            done = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                conj_vector(cgh, x, n, incx);
-            });
-        }
-    }
-
-    return done;
-}
-
-template <typename Func, typename T>
-inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                        diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::trsv(func, queue, new_uplo, new_trans, unit_diag, n, a, lda, x, incx,
-                              dependencies);
-}
-
-#define TRSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                               \
-    sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag,    \
-                     int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx,             \
-                     const std::vector<sycl::event> &dependencies) {                           \
-        return trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \
-                    dependencies);                                                             \
-    }
-
-TRSV_LAUNCHER_USM(float, rocblas_strsv)
-TRSV_LAUNCHER_USM(double, rocblas_dtrsv)
-TRSV_LAUNCHER_USM(std::complex<float>, rocblas_ctrsv)
-TRSV_LAUNCHER_USM(std::complex<double>, rocblas_ztrsv)
-
-#undef TRSV_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/rocblas/rocblas_level3.cpp b/src/blas/backends/rocblas/rocblas_level3.cpp
deleted file mode 100644
index ef739a88b..000000000
--- a/src/blas/backends/rocblas/rocblas_level3.cpp
+++ /dev/null
@@ -1,1482 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include "rocblas_helper.hpp"
-#include "rocblas_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-namespace column_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemm(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                 int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, k, (rocDataType *)&alpha,
-                                    a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define GEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,   \
-              int64_t k, TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda,                   \
-              sycl::buffer<TYPE, 1> &b, int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c,     \
-              int64_t ldc) {                                                                  \
-        gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \
-             ldc);                                                                            \
-    }
-
-GEMM_LAUNCHER(float, rocblas_sgemm)
-GEMM_LAUNCHER(double, rocblas_dgemm)
-GEMM_LAUNCHER(std::complex<float>, rocblas_cgemm)
-GEMM_LAUNCHER(std::complex<double>, rocblas_zgemm)
-
-#undef GEMM_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B, typename T_C, typename T_S,
-          typename DATATYPE_A, typename DATATYPE_B, typename DATATYPE_C, typename COMPUTETYPE>
-inline void gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, COMPUTETYPE CT,
-                    sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    int64_t k, T_S alpha, sycl::buffer<T_A, 1> &a, int64_t lda,
-                    sycl::buffer<T_B, 1> &b, int64_t ldb, T_S beta, sycl::buffer<T_C, 1> &c,
-                    int64_t ldc) {
-    using rocDataType_A = typename RocEquivalentType<T_A>::Type;
-    using rocDataType_B = typename RocEquivalentType<T_B>::Type;
-    using rocDataType_C = typename RocEquivalentType<T_C>::Type;
-    using rocDataType_S = typename RocEquivalentType<T_S>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType_A *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType_B *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType_C *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, k, (rocDataType_S *)&alpha,
-                                    a_, DT_A, lda, b_, DT_B, ldb, (rocDataType_S *)&beta, c_, DT_C,
-                                    ldc, c_, DT_C, ldc, CT, rocblas_gemm_algo_standard, 0, 0);
-        });
-    });
-}
-
-#define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A,         \
-                         ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE)                         \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,       \
-              int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,                   \
-              sycl::buffer<TYPE_B, 1> &b, int64_t ldb, TYPE_S beta, sycl::buffer<TYPE_C, 1> &c,   \
-              int64_t ldc) {                                                                      \
-        gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE, \
-                queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);             \
-    }
-
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, float, float, rocblas_gemm_ex, rocblas_datatype_f16_r,
-                 rocblas_datatype_f16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half, rocblas_gemm_ex,
-                 rocblas_datatype_f16_r, rocblas_datatype_f16_r, rocblas_datatype_f16_r,
-                 rocblas_datatype_f16_r)
-
-GEMM_EX_LAUNCHER(bfloat16, bfloat16, float, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                 rocblas_datatype_bf16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER(bfloat16, bfloat16, bfloat16, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                 rocblas_datatype_bf16_r, rocblas_datatype_bf16_r, rocblas_datatype_f32_r)
-
-#undef GEMM_EX_LAUNCHER
-
-template <typename Func, typename T>
-inline void symm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                 int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                 int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha,
-                                    a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define SYMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                     \
-    void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,       \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,       \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                   \
-        symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \
-             c, ldc);                                                                            \
-    }
-
-SYMM_LAUNCHER(float, rocblas_ssymm)
-SYMM_LAUNCHER(double, rocblas_dsymm)
-SYMM_LAUNCHER(std::complex<float>, rocblas_csymm)
-SYMM_LAUNCHER(std::complex<double>, rocblas_zsymm)
-
-#undef SYMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                 int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                 int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha,
-                                    a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define HEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                     \
-    void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,       \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,       \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                   \
-        hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \
-             c, ldc);                                                                            \
-    }
-
-HEMM_LAUNCHER(std::complex<float>, rocblas_chemm)
-HEMM_LAUNCHER(std::complex<double>, rocblas_zhemm)
-
-#undef HEMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                 int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, T beta,
-                 sycl::buffer<T, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define SYRK_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,   \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta,                  \
-              sycl::buffer<TYPE, 1> &c, int64_t ldc) {                                       \
-        syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); \
-    }
-
-SYRK_LAUNCHER(float, rocblas_ssyrk)
-SYRK_LAUNCHER(double, rocblas_dsyrk)
-SYRK_LAUNCHER(std::complex<float>, rocblas_csyrk)
-SYRK_LAUNCHER(std::complex<double>, rocblas_zsyrk)
-
-#undef SYRK_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                 int64_t k, ScalarType alpha, sycl::buffer<DataType, 1> &a, int64_t lda,
-                 ScalarType beta, sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocScalarType *)&alpha, a_,
-                                    lda, (rocScalarType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                 \
-    void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-              SCALAR_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda, SCALAR_TYPE beta, \
-              sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                    \
-        herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);   \
-    }
-
-HERK_LAUNCHER(std::complex<float>, float, rocblas_cherk)
-HERK_LAUNCHER(std::complex<double>, double, rocblas_zherk)
-
-#undef HERK_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                  int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                  int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define SYR2K_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                   \
-    void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-               TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,     \
-               int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                 \
-        syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \
-              ldc);                                                                             \
-    }
-
-SYR2K_LAUNCHER(float, rocblas_ssyr2k)
-SYR2K_LAUNCHER(double, rocblas_dsyr2k)
-SYR2K_LAUNCHER(std::complex<float>, rocblas_csyr2k)
-SYR2K_LAUNCHER(std::complex<double>, rocblas_zsyr2k)
-
-#undef SYR2K_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                  int64_t k, DataType alpha, sycl::buffer<DataType, 1> &a, int64_t lda,
-                  sycl::buffer<DataType, 1> &b, int64_t ldb, ScalarType beta,
-                  sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            auto c_ = sc.get_mem<rocDataType *>(c_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, b_, ldb, (rocScalarType *)&beta, c_, ldc);
-        });
-    });
-}
-
-#define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                 \
-    void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-               DATA_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda,                     \
-               sycl::buffer<DATA_TYPE, 1> &b, int64_t ldb, SCALAR_TYPE beta,                    \
-               sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                    \
-        her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \
-              ldc);                                                                             \
-    }
-
-HER2K_LAUNCHER(std::complex<float>, float, rocblas_cher2k)
-HER2K_LAUNCHER(std::complex<double>, double, rocblas_zher2k)
-
-#undef HER2K_LAUNCHER
-
-// NOTE: In rocblas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline void trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &b, int64_t ldb) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            rocblas_status err;
-#if ROCBLAS_VERSION_MAJOR >= 4
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, b_, ldb, b_, ldb);
-#else
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, b_, ldb);
-#endif
-        });
-    });
-}
-
-#define TRMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,           \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,       \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                             \
-        trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \
-             lda, b, ldb);                                                                      \
-    }
-
-TRMM_LAUNCHER(float, rocblas_strmm)
-TRMM_LAUNCHER(double, rocblas_dtrmm)
-TRMM_LAUNCHER(std::complex<float>, rocblas_ctrmm)
-TRMM_LAUNCHER(std::complex<double>, rocblas_ztrmm)
-
-#undef TRMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &b, int64_t ldb) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = sc.get_mem<rocDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocDataType *>(b_acc);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, b_, ldb);
-        });
-    });
-}
-
-#define TRSM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,           \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,       \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                             \
-        trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \
-             lda, b, ldb);                                                                      \
-    }
-
-TRSM_LAUNCHER(float, rocblas_strsm)
-TRSM_LAUNCHER(double, rocblas_dtrsm)
-TRSM_LAUNCHER(std::complex<float>, rocblas_ctrsm)
-TRSM_LAUNCHER(std::complex<double>, rocblas_ztrsm)
-
-#undef TRSM_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemm(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                        int64_t m, int64_t n, int64_t k, T alpha, const T *a, int64_t lda,
-                        const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<const rocDataType *>(b);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, k, (rocDataType *)&alpha,
-                                    a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define GEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b,             \
-                     int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,                                 \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,  \
-                    c, ldc, dependencies);                                                         \
-    }
-
-GEMM_LAUNCHER_USM(float, rocblas_sgemm)
-GEMM_LAUNCHER_USM(double, rocblas_dgemm)
-GEMM_LAUNCHER_USM(std::complex<float>, rocblas_cgemm)
-GEMM_LAUNCHER_USM(std::complex<double>, rocblas_zgemm)
-
-#undef GEMM_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B, typename T_C, typename T_S,
-          typename DATATYPE_A, typename DATATYPE_B, typename DATATYPE_C, typename COMPUTETYPE>
-inline sycl::event gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C,
-                           COMPUTETYPE CT, sycl::queue &queue, transpose transa, transpose transb,
-                           int64_t m, int64_t n, int64_t k, T_S alpha, const T_A *a, int64_t lda,
-                           const T_B *b, int64_t ldb, T_S beta, T_C *c, int64_t ldc,
-                           const std::vector<sycl::event> &dependencies) {
-    using rocDataType_A = typename RocEquivalentType<T_A>::Type;
-    using rocDataType_B = typename RocEquivalentType<T_B>::Type;
-    using rocDataType_C = typename RocEquivalentType<T_C>::Type;
-    using rocDataType_S = typename RocEquivalentType<T_S>::Type;
-    overflow_check(m, n, k, lda, ldb, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType_A *>(a);
-            auto b_ = reinterpret_cast<const rocDataType_B *>(b);
-            auto c_ = reinterpret_cast<rocDataType_C *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa),
-                                    get_rocblas_operation(transb), m, n, k, (rocDataType_S *)&alpha,
-                                    a_, DT_A, lda, b_, DT_B, ldb, (rocDataType_S *)&beta, c_, DT_C,
-                                    ldc, c_, DT_C, ldc, CT, rocblas_gemm_algo_standard, 0, 0);
-        });
-    });
-
-    return done;
-}
-
-#define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A,      \
-                             ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE)                      \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b,       \
-                     int64_t ldb, TYPE_S beta, TYPE_C *c, int64_t ldc,                             \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C,            \
-                       ROCMCOMPUTETYPE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,     \
-                       beta, c, ldc, dependencies);                                                \
-    }
-
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, float, float, rocblas_gemm_ex, rocblas_datatype_f16_r,
-                     rocblas_datatype_f16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half, rocblas_gemm_ex,
-                     rocblas_datatype_f16_r, rocblas_datatype_f16_r, rocblas_datatype_f16_r,
-                     rocblas_datatype_f16_r)
-
-GEMM_EX_LAUNCHER_USM(bfloat16, bfloat16, float, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                     rocblas_datatype_bf16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER_USM(bfloat16, bfloat16, bfloat16, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                     rocblas_datatype_bf16_r, rocblas_datatype_bf16_r, rocblas_datatype_f32_r)
-
-#undef GEMM_EX_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event symm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb,
-                        T beta, T *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<const rocDataType *>(b);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha,
-                                    a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define SYMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
-                    beta, c, ldc, dependencies);                                                  \
-    }
-
-SYMM_LAUNCHER_USM(float, rocblas_ssymm)
-SYMM_LAUNCHER_USM(double, rocblas_dsymm)
-SYMM_LAUNCHER_USM(std::complex<float>, rocblas_csymm)
-SYMM_LAUNCHER_USM(std::complex<double>, rocblas_zsymm)
-
-#undef SYMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb,
-                        T beta, T *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<const rocDataType *>(b);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha,
-                                    a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define HEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
-                    beta, c, ldc, dependencies);                                                  \
-    }
-
-HEMM_LAUNCHER_USM(std::complex<float>, rocblas_chemm)
-HEMM_LAUNCHER_USM(std::complex<double>, rocblas_zhemm)
-
-#undef HEMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                        int64_t k, T alpha, const T *a, int64_t lda, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define SYRK_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,  \
-                     TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc,      \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
-                    dependencies);                                                                 \
-    }
-
-SYRK_LAUNCHER_USM(float, rocblas_ssyrk)
-SYRK_LAUNCHER_USM(double, rocblas_dsyrk)
-SYRK_LAUNCHER_USM(std::complex<float>, rocblas_csyrk)
-SYRK_LAUNCHER_USM(std::complex<double>, rocblas_zsyrk)
-
-#undef SYRK_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                        int64_t k, const ScalarType alpha, const DataType *a, int64_t lda,
-                        const ScalarType beta, DataType *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocScalarType *)&alpha, a_,
-                                    lda, (rocScalarType *)&beta, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,  \
-                     const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda,                     \
-                     const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,                            \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
-                    dependencies);                                                                 \
-    }
-
-HERK_LAUNCHER_USM(std::complex<float>, float, rocblas_cherk)
-HERK_LAUNCHER_USM(std::complex<double>, double, rocblas_zherk)
-
-#undef HERK_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                         int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *b,
-                         int64_t ldb, T beta, T *c, int64_t ldc,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<const rocDataType *>(b);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, b_, ldb, (rocDataType *)&beta, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define SYR2K_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                      TYPE beta, TYPE *c, int64_t ldc,                                             \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,      \
-                     beta, c, ldc, dependencies);                                                  \
-    }
-
-SYR2K_LAUNCHER_USM(float, rocblas_ssyr2k)
-SYR2K_LAUNCHER_USM(double, rocblas_dsyr2k)
-SYR2K_LAUNCHER_USM(std::complex<float>, rocblas_csyr2k)
-SYR2K_LAUNCHER_USM(std::complex<double>, rocblas_zsyr2k)
-
-#undef SYR2K_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                         int64_t n, int64_t k, const DataType alpha, const DataType *a, int64_t lda,
-                         const DataType *b, int64_t ldb, const ScalarType beta, DataType *c,
-                         int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<DataType>::Type;
-    using rocScalarType = typename RocEquivalentType<ScalarType>::Type;
-    overflow_check(n, k, lda, ldb, ldc);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<const rocDataType *>(b);
-            auto c_ = reinterpret_cast<rocDataType *>(c);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_,
-                                    lda, b_, ldb, (rocScalarType *)&beta, c_, ldc);
-        });
-    });
-
-    return done;
-}
-
-#define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                \
-    sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b,  \
-                      int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,              \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,      \
-                     beta, c, ldc, dependencies);                                                  \
-    }
-
-HER2K_LAUNCHER_USM(std::complex<float>, float, rocblas_cher2k)
-HER2K_LAUNCHER_USM(std::complex<double>, double, rocblas_zher2k)
-
-#undef HER2K_LAUNCHER_USM
-
-// NOTE: In rocblas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline sycl::event trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a,
-                        int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<rocDataType *>(b);
-            rocblas_status err;
-#if ROCBLAS_VERSION_MAJOR >= 4
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, b_, ldb, b_, ldb);
-#else
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, b_, ldb);
-#endif
-        });
-    });
-
-    return done;
-}
-
-#define TRMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n,       \
-                    alpha, a, lda, b, ldb, dependencies);                                          \
-    }
-
-TRMM_LAUNCHER_USM(float, rocblas_strmm)
-TRMM_LAUNCHER_USM(double, rocblas_dtrmm)
-TRMM_LAUNCHER_USM(std::complex<float>, rocblas_ctrmm)
-TRMM_LAUNCHER_USM(std::complex<double>, rocblas_ztrmm)
-
-#undef TRMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a,
-                        int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    using rocDataType = typename RocEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldb);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-
-            auto a_ = reinterpret_cast<const rocDataType *>(a);
-            auto b_ = reinterpret_cast<rocDataType *>(b);
-            rocblas_status err;
-            ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
-                                    get_rocblas_fill_mode(upper_lower),
-                                    get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
-                                    m, n, (rocDataType *)&alpha, a_, lda, b_, ldb);
-        });
-    });
-
-    return done;
-}
-
-#define TRSM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n,       \
-                    alpha, a, lda, b, ldb, dependencies);                                          \
-    }
-
-TRSM_LAUNCHER_USM(float, rocblas_strsm)
-TRSM_LAUNCHER_USM(double, rocblas_dtrsm)
-TRSM_LAUNCHER_USM(std::complex<float>, rocblas_ctrsm)
-TRSM_LAUNCHER_USM(std::complex<double>, rocblas_ztrsm)
-
-#undef TRSM_LAUNCHER_USM
-
-} // namespace column_major
-
-namespace row_major {
-
-// Buffer APIs
-
-template <typename Func, typename T>
-inline void gemm(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m,
-                 int64_t n, int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &b, int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    auto new_transa = transb;
-    auto new_transb = transa;
-
-    column_major::gemm(func, queue, new_transa, new_transb, n, m, k, alpha, b, ldb, a, lda, beta, c,
-                       ldc);
-}
-
-#define GEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                  \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,   \
-              int64_t k, TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda,                   \
-              sycl::buffer<TYPE, 1> &b, int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c,     \
-              int64_t ldc) {                                                                  \
-        gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \
-             ldc);                                                                            \
-    }
-
-GEMM_LAUNCHER(float, rocblas_sgemm)
-GEMM_LAUNCHER(double, rocblas_dgemm)
-GEMM_LAUNCHER(std::complex<float>, rocblas_cgemm)
-GEMM_LAUNCHER(std::complex<double>, rocblas_zgemm)
-
-#undef GEMM_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B, typename T_C, typename T_S,
-          typename DATATYPE_A, typename DATATYPE_B, typename DATATYPE_C, typename COMPUTETYPE>
-inline void gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, COMPUTETYPE CT,
-                    sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,
-                    int64_t k, T_S alpha, sycl::buffer<T_A, 1> &a, int64_t lda,
-                    sycl::buffer<T_B, 1> &b, int64_t ldb, T_S beta, sycl::buffer<T_C, 1> &c,
-                    int64_t ldc) {
-    auto new_transa = transb;
-    auto new_transb = transa;
-
-    column_major::gemm_ex(func, DT_A, DT_B, DT_C, CT, queue, new_transa, new_transb, n, m, k, alpha,
-                          b, ldb, a, lda, beta, c, ldc);
-}
-
-#define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A,         \
-                         ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE)                         \
-    void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n,       \
-              int64_t k, TYPE_S alpha, sycl::buffer<TYPE_A, 1> &a, int64_t lda,                   \
-              sycl::buffer<TYPE_B, 1> &b, int64_t ldb, TYPE_S beta, sycl::buffer<TYPE_C, 1> &c,   \
-              int64_t ldc) {                                                                      \
-        gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE, \
-                queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);             \
-    }
-
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, float, float, rocblas_gemm_ex, rocblas_datatype_f16_r,
-                 rocblas_datatype_f16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER(sycl::half, sycl::half, sycl::half, sycl::half, rocblas_gemm_ex,
-                 rocblas_datatype_f16_r, rocblas_datatype_f16_r, rocblas_datatype_f16_r,
-                 rocblas_datatype_f16_r)
-
-GEMM_EX_LAUNCHER(bfloat16, bfloat16, float, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                 rocblas_datatype_bf16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER(bfloat16, bfloat16, bfloat16, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                 rocblas_datatype_bf16_r, rocblas_datatype_bf16_r, rocblas_datatype_f32_r)
-
-#undef GEMM_EX_LAUNCHER
-
-template <typename Func, typename T>
-inline void symm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                 int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                 int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::symm(func, queue, new_side, new_uplo, n, m, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-#define SYMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                     \
-    void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,       \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,       \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                   \
-        symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \
-             c, ldc);                                                                            \
-    }
-
-SYMM_LAUNCHER(float, rocblas_ssymm)
-SYMM_LAUNCHER(double, rocblas_dsymm)
-SYMM_LAUNCHER(std::complex<float>, rocblas_csymm)
-SYMM_LAUNCHER(std::complex<double>, rocblas_zsymm)
-
-#undef SYMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                 int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                 int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::hemm(func, queue, new_side, new_uplo, n, m, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-#define HEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                     \
-    void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n,       \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,       \
-              int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                   \
-        hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \
-             c, ldc);                                                                            \
-    }
-
-HEMM_LAUNCHER(std::complex<float>, rocblas_chemm)
-HEMM_LAUNCHER(std::complex<double>, rocblas_zhemm)
-
-#undef HEMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                 int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, T beta,
-                 sycl::buffer<T, 1> &c, int64_t ldc) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::syrk(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-#define SYRK_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                 \
-    void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,   \
-              TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, TYPE beta,                  \
-              sycl::buffer<TYPE, 1> &c, int64_t ldc) {                                       \
-        syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); \
-    }
-
-SYRK_LAUNCHER(float, rocblas_ssyrk)
-SYRK_LAUNCHER(double, rocblas_dsyrk)
-SYRK_LAUNCHER(std::complex<float>, rocblas_csyrk)
-SYRK_LAUNCHER(std::complex<double>, rocblas_zsyrk)
-
-#undef SYRK_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                 int64_t k, ScalarType alpha, sycl::buffer<DataType, 1> &a, int64_t lda,
-                 ScalarType beta, sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::conjtrans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::herk(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, beta, c, ldc);
-}
-
-#define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                 \
-    void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-              SCALAR_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda, SCALAR_TYPE beta, \
-              sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                    \
-        herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc);   \
-    }
-
-HERK_LAUNCHER(std::complex<float>, float, rocblas_cherk)
-HERK_LAUNCHER(std::complex<double>, double, rocblas_zherk)
-
-#undef HERK_LAUNCHER
-
-template <typename Func, typename T>
-inline void syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                  int64_t k, T alpha, sycl::buffer<T, 1> &a, int64_t lda, sycl::buffer<T, 1> &b,
-                  int64_t ldb, T beta, sycl::buffer<T, 1> &c, int64_t ldc) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    column_major::syr2k(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, b, ldb, beta, c,
-                        ldc);
-}
-
-#define SYR2K_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                   \
-    void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-               TYPE alpha, sycl::buffer<TYPE, 1> &a, int64_t lda, sycl::buffer<TYPE, 1> &b,     \
-               int64_t ldb, TYPE beta, sycl::buffer<TYPE, 1> &c, int64_t ldc) {                 \
-        syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \
-              ldc);                                                                             \
-    }
-
-SYR2K_LAUNCHER(float, rocblas_ssyr2k)
-SYR2K_LAUNCHER(double, rocblas_dsyr2k)
-SYR2K_LAUNCHER(std::complex<float>, rocblas_csyr2k)
-SYR2K_LAUNCHER(std::complex<double>, rocblas_zsyr2k)
-
-#undef SYR2K_LAUNCHER
-
-template <typename Func, typename DataType, typename ScalarType>
-inline void her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                  int64_t k, DataType alpha, sycl::buffer<DataType, 1> &a, int64_t lda,
-                  sycl::buffer<DataType, 1> &b, int64_t ldb, ScalarType beta,
-                  sycl::buffer<DataType, 1> &c, int64_t ldc) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::conjtrans
-                                                               : oneapi::mkl::transpose::nontrans;
-    auto new_alpha = std::conj(alpha);
-
-    column_major::her2k(func, queue, new_uplo, new_trans, n, k, new_alpha, a, lda, b, ldb, beta, c,
-                        ldc);
-}
-
-#define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                 \
-    void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,     \
-               DATA_TYPE alpha, sycl::buffer<DATA_TYPE, 1> &a, int64_t lda,                     \
-               sycl::buffer<DATA_TYPE, 1> &b, int64_t ldb, SCALAR_TYPE beta,                    \
-               sycl::buffer<DATA_TYPE, 1> &c, int64_t ldc) {                                    \
-        her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \
-              ldc);                                                                             \
-    }
-
-HER2K_LAUNCHER(std::complex<float>, float, rocblas_cher2k)
-HER2K_LAUNCHER(std::complex<double>, double, rocblas_zher2k)
-
-#undef HER2K_LAUNCHER
-
-// NOTE: In rocblas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline void trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &b, int64_t ldb) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::trmm(func, queue, new_side, new_uplo, trans, unit_diag, n, m, alpha, a, lda, b,
-                       ldb);
-}
-
-#define TRMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,           \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,       \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                             \
-        trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \
-             lda, b, ldb);                                                                      \
-    }
-
-TRMM_LAUNCHER(float, rocblas_strmm)
-TRMM_LAUNCHER(double, rocblas_dtrmm)
-TRMM_LAUNCHER(std::complex<float>, rocblas_ctrmm)
-TRMM_LAUNCHER(std::complex<double>, rocblas_ztrmm)
-
-#undef TRMM_LAUNCHER
-
-template <typename Func, typename T>
-inline void trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,
-                 diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer<T, 1> &a, int64_t lda,
-                 sycl::buffer<T, 1> &b, int64_t ldb) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    column_major::trsm(func, queue, new_side, new_uplo, trans, unit_diag, n, m, alpha, a, lda, b,
-                       ldb);
-}
-
-#define TRSM_LAUNCHER(TYPE, ROCBLAS_ROUTINE)                                                    \
-    void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,           \
-              diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer<TYPE, 1> &a,       \
-              int64_t lda, sycl::buffer<TYPE, 1> &b, int64_t ldb) {                             \
-        trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \
-             lda, b, ldb);                                                                      \
-    }
-
-TRSM_LAUNCHER(float, rocblas_strsm)
-TRSM_LAUNCHER(double, rocblas_dtrsm)
-TRSM_LAUNCHER(std::complex<float>, rocblas_ctrsm)
-TRSM_LAUNCHER(std::complex<double>, rocblas_ztrsm)
-
-#undef TRSM_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T>
-inline sycl::event gemm(Func func, sycl::queue &queue, transpose transa, transpose transb,
-                        int64_t m, int64_t n, int64_t k, T alpha, const T *a, int64_t lda,
-                        const T *b, int64_t ldb, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_transa = transb;
-    auto new_transb = transa;
-
-    return column_major::gemm(func, queue, new_transa, new_transb, n, m, k, alpha, b, ldb, a, lda,
-                              beta, c, ldc, dependencies);
-}
-
-#define GEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b,             \
-                     int64_t ldb, TYPE beta, TYPE *c, int64_t ldc,                                 \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta,  \
-                    c, ldc, dependencies);                                                         \
-    }
-
-GEMM_LAUNCHER_USM(float, rocblas_sgemm)
-GEMM_LAUNCHER_USM(double, rocblas_dgemm)
-GEMM_LAUNCHER_USM(std::complex<float>, rocblas_cgemm)
-GEMM_LAUNCHER_USM(std::complex<double>, rocblas_zgemm)
-
-#undef GEMM_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B, typename T_C, typename T_S,
-          typename DATATYPE_A, typename DATATYPE_B, typename DATATYPE_C, typename COMPUTETYPE>
-inline sycl::event gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C,
-                           COMPUTETYPE CT, sycl::queue &queue, transpose transa, transpose transb,
-                           int64_t m, int64_t n, int64_t k, T_S alpha, const T_A *a, int64_t lda,
-                           const T_B *b, int64_t ldb, T_S beta, T_C *c, int64_t ldc,
-                           const std::vector<sycl::event> &dependencies) {
-    auto new_transa = transb;
-    auto new_transb = transa;
-
-    return column_major::gemm_ex(func, DT_A, DT_B, DT_C, CT, queue, new_transa, new_transb, n, m, k,
-                                 alpha, b, ldb, a, lda, beta, c, ldc, dependencies);
-}
-
-#define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A,      \
-                             ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE)                      \
-    sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \
-                     int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b,       \
-                     int64_t ldb, TYPE_S beta, TYPE_C *c, int64_t ldc,                             \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C,            \
-                       ROCMCOMPUTETYPE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb,     \
-                       beta, c, ldc, dependencies);                                                \
-    }
-
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, float, float, rocblas_gemm_ex, rocblas_datatype_f16_r,
-                     rocblas_datatype_f16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, sycl::half, rocblas_gemm_ex,
-                     rocblas_datatype_f16_r, rocblas_datatype_f16_r, rocblas_datatype_f16_r,
-                     rocblas_datatype_f16_r)
-
-GEMM_EX_LAUNCHER_USM(bfloat16, bfloat16, float, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                     rocblas_datatype_bf16_r, rocblas_datatype_f32_r, rocblas_datatype_f32_r)
-GEMM_EX_LAUNCHER_USM(bfloat16, bfloat16, bfloat16, float, rocblas_gemm_ex, rocblas_datatype_bf16_r,
-                     rocblas_datatype_bf16_r, rocblas_datatype_bf16_r, rocblas_datatype_f32_r)
-
-#undef GEMM_EX_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event symm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb,
-                        T beta, T *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::symm(func, queue, new_side, new_uplo, n, m, alpha, a, lda, b, ldb, beta, c,
-                              ldc, dependencies);
-}
-
-#define SYMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
-                    beta, c, ldc, dependencies);                                                  \
-    }
-
-SYMM_LAUNCHER_USM(float, rocblas_ssymm)
-SYMM_LAUNCHER_USM(double, rocblas_dsymm)
-SYMM_LAUNCHER_USM(std::complex<float>, rocblas_csymm)
-SYMM_LAUNCHER_USM(std::complex<double>, rocblas_zsymm)
-
-#undef SYMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m,
-                        int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb,
-                        T beta, T *c, int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::hemm(func, queue, new_side, new_uplo, n, m, alpha, a, lda, b, ldb, beta, c,
-                              ldc, dependencies);
-}
-
-#define HEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \
-                     TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                     TYPE beta, TYPE *c, int64_t ldc,                                             \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        return hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \
-                    beta, c, ldc, dependencies);                                                  \
-    }
-
-HEMM_LAUNCHER_USM(std::complex<float>, rocblas_chemm)
-HEMM_LAUNCHER_USM(std::complex<double>, rocblas_zhemm)
-
-#undef HEMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                        int64_t k, T alpha, const T *a, int64_t lda, T beta, T *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::syrk(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, beta, c, ldc,
-                              dependencies);
-}
-
-#define SYRK_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,  \
-                     TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc,      \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
-                    dependencies);                                                                 \
-    }
-
-SYRK_LAUNCHER_USM(float, rocblas_ssyrk)
-SYRK_LAUNCHER_USM(double, rocblas_dsyrk)
-SYRK_LAUNCHER_USM(std::complex<float>, rocblas_csyrk)
-SYRK_LAUNCHER_USM(std::complex<double>, rocblas_zsyrk)
-
-#undef SYRK_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n,
-                        int64_t k, const ScalarType alpha, const DataType *a, int64_t lda,
-                        const ScalarType beta, DataType *c, int64_t ldc,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::conjtrans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::herk(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, beta, c, ldc,
-                              dependencies);
-}
-
-#define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                 \
-    sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k,  \
-                     const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda,                     \
-                     const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,                            \
-                     const std::vector<sycl::event> &dependencies) {                               \
-        return herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \
-                    dependencies);                                                                 \
-    }
-
-HERK_LAUNCHER_USM(std::complex<float>, float, rocblas_cherk)
-HERK_LAUNCHER_USM(std::complex<double>, double, rocblas_zherk)
-
-#undef HERK_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                         int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *b,
-                         int64_t ldb, T beta, T *c, int64_t ldc,
-                         const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans
-                                                               : oneapi::mkl::transpose::nontrans;
-
-    return column_major::syr2k(func, queue, new_uplo, new_trans, n, k, alpha, a, lda, b, ldb, beta,
-                               c, ldc, dependencies);
-}
-
-#define SYR2K_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                  \
-    sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb,          \
-                      TYPE beta, TYPE *c, int64_t ldc,                                             \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,      \
-                     beta, c, ldc, dependencies);                                                  \
-    }
-
-SYR2K_LAUNCHER_USM(float, rocblas_ssyr2k)
-SYR2K_LAUNCHER_USM(double, rocblas_dsyr2k)
-SYR2K_LAUNCHER_USM(std::complex<float>, rocblas_csyr2k)
-SYR2K_LAUNCHER_USM(std::complex<double>, rocblas_zsyr2k)
-
-#undef SYR2K_LAUNCHER_USM
-
-template <typename Func, typename DataType, typename ScalarType>
-inline sycl::event her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans,
-                         int64_t n, int64_t k, const DataType alpha, const DataType *a, int64_t lda,
-                         const DataType *b, int64_t ldb, const ScalarType beta, DataType *c,
-                         int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-    auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::conjtrans
-                                                               : oneapi::mkl::transpose::nontrans;
-    auto new_alpha = std::conj(alpha);
-
-    return column_major::her2k(func, queue, new_uplo, new_trans, n, k, new_alpha, a, lda, b, ldb,
-                               beta, c, ldc, dependencies);
-}
-
-#define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE)                                \
-    sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \
-                      const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b,  \
-                      int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc,              \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb,      \
-                     beta, c, ldc, dependencies);                                                  \
-    }
-
-HER2K_LAUNCHER_USM(std::complex<float>, float, rocblas_cher2k)
-HER2K_LAUNCHER_USM(std::complex<double>, double, rocblas_zher2k)
-
-#undef HER2K_LAUNCHER_USM
-
-// NOTE: In rocblas TRMM diverted from the netlib blas and for performance
-// reason it requires the C matrix to be
-// separated from the B matrix. It is possible to use B instead of C, but this
-// will slow-down the code.
-template <typename Func, typename T>
-inline sycl::event trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a,
-                        int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::trmm(func, queue, new_side, new_uplo, trans, unit_diag, n, m, alpha, a,
-                              lda, b, ldb, dependencies);
-}
-
-#define TRMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n,       \
-                    alpha, a, lda, b, ldb, dependencies);                                          \
-    }
-
-TRMM_LAUNCHER_USM(float, rocblas_strmm)
-TRMM_LAUNCHER_USM(double, rocblas_dtrmm)
-TRMM_LAUNCHER_USM(std::complex<float>, rocblas_ctrmm)
-TRMM_LAUNCHER_USM(std::complex<double>, rocblas_ztrmm)
-
-#undef TRMM_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower,
-                        transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a,
-                        int64_t lda, T *b, int64_t ldb,
-                        const std::vector<sycl::event> &dependencies) {
-    auto new_side =
-        left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left;
-    auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper
-                                                            : oneapi::mkl::uplo::lower;
-
-    return column_major::trsm(func, queue, new_side, new_uplo, trans, unit_diag, n, m, alpha, a,
-                              lda, b, ldb, dependencies);
-}
-
-#define TRSM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE)                                                   \
-    sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans,       \
-                     diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \
-                     TYPE *b, int64_t ldb, const std::vector<sycl::event> &dependencies) {         \
-        return trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n,       \
-                    alpha, a, lda, b, ldb, dependencies);                                          \
-    }
-
-TRSM_LAUNCHER_USM(float, rocblas_strsm)
-TRSM_LAUNCHER_USM(double, rocblas_dtrsm)
-TRSM_LAUNCHER_USM(std::complex<float>, rocblas_ctrsm)
-TRSM_LAUNCHER_USM(std::complex<double>, rocblas_ztrsm)
-
-#undef TRSM_LAUNCHER_USM
-
-} // namespace row_major
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/rocblas/rocblas_scope_handle.cpp b/src/blas/backends/rocblas/rocblas_scope_handle.cpp
deleted file mode 100644
index 404d1fc06..000000000
--- a/src/blas/backends/rocblas/rocblas_scope_handle.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/***************************************************************************
-*  Copyright 2020-2022 Intel Corporation
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "rocblas_scope_handle.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-
-template <typename T>
-rocblas_handle_container<T>::~rocblas_handle_container() noexcept(false) {
-    for (auto &handle_pair : rocblas_handle_container_mapper_) {
-        rocblas_status err;
-        if (handle_pair.second != nullptr) {
-            auto handle = handle_pair.second->exchange(nullptr);
-            if (handle != nullptr) {
-                ROCBLAS_ERROR_FUNC(rocblas_destroy_handle, err, handle);
-                handle = nullptr;
-            }
-            else {
-                delete handle_pair.second;
-            }
-            handle_pair.second = nullptr;
-        }
-    }
-    rocblas_handle_container_mapper_.clear();
-}
-
-/**
- * Inserts a new element in the map if its key is unique. This new element
- * is constructed in place using args as the arguments for the construction
- * of a value_type (which is an object of a pair type). The insertion only
- * takes place if no other element in the container has a key equivalent to
- * the one being emplaced (keys in a map container are unique).
- */
-thread_local rocblas_handle_container<pi_context> RocblasScopedContextHandler::handle_helper =
-    rocblas_handle_container<pi_context>{};
-
-RocblasScopedContextHandler::RocblasScopedContextHandler(sycl::queue queue,
-                                                         sycl::interop_handle &ih)
-        : interop_h(ih),
-          needToRecover_(false) {
-    placedContext_ = new sycl::context(queue.get_context());
-    auto hipDevice = ih.get_native_device<sycl::backend::ext_oneapi_hip>();
-    hipError_t err;
-    hipCtx_t desired;
-    HIP_ERROR_FUNC(hipCtxGetCurrent, err, &original_);
-    HIP_ERROR_FUNC(hipDevicePrimaryCtxRetain, err, &desired, hipDevice);
-    if (original_ != desired) {
-        // Sets the desired context as the active one for the thread
-        HIP_ERROR_FUNC(hipCtxSetCurrent, err, desired);
-        // No context is installed and the suggested context is primary
-        // This is the most common case. We can activate the context in the
-        // thread and leave it there until all the PI context referring to the
-        // same underlying rocblas primary context are destroyed. This emulates
-        // the behaviour of the rocblas runtime api, and avoids costly context
-        // switches. No action is required on this side of the if.
-        needToRecover_ = !(original_ == nullptr);
-    }
-}
-
-RocblasScopedContextHandler::~RocblasScopedContextHandler() noexcept(false) {
-    if (needToRecover_) {
-        hipError_t err;
-        HIP_ERROR_FUNC(hipCtxSetCurrent, err, original_);
-    }
-    delete placedContext_;
-}
-
-void ContextCallback(void *userData) {
-    auto *ptr = static_cast<std::atomic<rocblas_handle> *>(userData);
-    if (!ptr) {
-        return;
-    }
-    auto handle = ptr->exchange(nullptr);
-    if (handle != nullptr) {
-        rocblas_status err1;
-        ROCBLAS_ERROR_FUNC(rocblas_destroy_handle, err1, handle);
-        handle = nullptr;
-    }
-    else {
-        // if the handle is nullptr it means the handle was already destroyed by
-        // the rocblas_handle destructor and we're free to delete the atomic
-        // object.
-        delete ptr;
-    }
-}
-
-rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue &queue) {
-    auto hipDevice = interop_h.get_native_device<sycl::backend::ext_oneapi_hip>();
-    hipError_t hipErr;
-    hipCtx_t desired;
-    HIP_ERROR_FUNC(hipDevicePrimaryCtxRetain, hipErr, &desired, hipDevice);
-    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
-    hipStream_t streamId = get_stream(queue);
-    rocblas_status err;
-    auto it = handle_helper.rocblas_handle_container_mapper_.find(piPlacedContext_);
-    if (it != handle_helper.rocblas_handle_container_mapper_.end()) {
-        if (it->second == nullptr) {
-            handle_helper.rocblas_handle_container_mapper_.erase(it);
-        }
-        else {
-            auto handle = it->second->load();
-            if (handle != nullptr) {
-                hipStream_t currentStreamId;
-                ROCBLAS_ERROR_FUNC(rocblas_get_stream, err, handle, &currentStreamId);
-                if (currentStreamId != streamId) {
-                    ROCBLAS_ERROR_FUNC(rocblas_set_stream, err, handle, streamId);
-                }
-                return handle;
-            }
-            else {
-                handle_helper.rocblas_handle_container_mapper_.erase(it);
-            }
-        }
-    }
-
-    rocblas_handle handle;
-
-    ROCBLAS_ERROR_FUNC(rocblas_create_handle, err, &handle);
-    ROCBLAS_ERROR_FUNC(rocblas_set_stream, err, handle, streamId);
-
-    auto insert_iter = handle_helper.rocblas_handle_container_mapper_.insert(
-        std::make_pair(piPlacedContext_, new std::atomic<rocblas_handle>(handle)));
-
-    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
-                                                insert_iter.first->second);
-    return handle;
-}
-
-hipStream_t RocblasScopedContextHandler::get_stream(const sycl::queue &queue) {
-    return sycl::get_native<sycl::backend::ext_oneapi_hip>(queue);
-}
-sycl::context RocblasScopedContextHandler::get_context(const sycl::queue &queue) {
-    return queue.get_context();
-}
-
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/blas/backends/rocblas/rocblas_scope_handle.hpp b/src/blas/backends/rocblas/rocblas_scope_handle.hpp
deleted file mode 100644
index 908d4c05c..000000000
--- a/src/blas/backends/rocblas/rocblas_scope_handle.hpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/***************************************************************************
-*  Copyright 2020-2022 Intel Corporation
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ROCBLAS_SCOPED_HANDLE_HPP_
-#define _ROCBLAS_SCOPED_HANDLE_HPP_
-#include <CL/sycl.hpp>
-#include <memory>
-#include <thread>
-#include <atomic>
-#include <unordered_map>
-#include "rocblas_helper.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-
-template <typename T>
-struct rocblas_handle_container {
-    using handle_container_t = std::unordered_map<T, std::atomic<rocblas_handle> *>;
-    handle_container_t rocblas_handle_container_mapper_{};
-    ~rocblas_handle_container() noexcept(false);
-};
-
-class RocblasScopedContextHandler {
-    HIPcontext original_;
-    sycl::context *placedContext_;
-    bool needToRecover_;
-    sycl::interop_handle &interop_h;
-    static thread_local rocblas_handle_container<pi_context> handle_helper;
-    sycl::context get_context(const sycl::queue &queue);
-    hipStream_t get_stream(const sycl::queue &queue);
-
-public:
-    RocblasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
-    ~RocblasScopedContextHandler() noexcept(false);
-
-    rocblas_handle get_handle(const sycl::queue &queue);
-
-    // This is a work-around function for reinterpret_casting the memory. This
-    // will be fixed when SYCL-2020 has been implemented for Pi backend.
-    template <typename T, typename U>
-    inline T get_mem(U acc) {
-        return reinterpret_cast<T>(interop_h.get_native_mem<sycl::backend::ext_oneapi_hip>(acc));
-    }
-};
-
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif //_ROCBLAS_SCOPED_HANDLE_HPP_
diff --git a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp b/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp
deleted file mode 100644
index da9791411..000000000
--- a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include "rocblas_scope_handle_hipsycl.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-
-rocblas_handle_container::~rocblas_handle_container() noexcept(false) {
-    for (auto &handle_pair : rocblas_handle_mapper_) {
-        rocblas_status err;
-        if (handle_pair.second != nullptr) {
-            auto handle = handle_pair.second->exchange(nullptr);
-            if (handle != nullptr) {
-                ROCBLAS_ERROR_FUNC(rocblas_destroy_handle, err, handle);
-                handle = nullptr;
-            }
-            delete handle_pair.second;
-            handle_pair.second = nullptr;
-        }
-    }
-    rocblas_handle_mapper_.clear();
-}
-
-thread_local rocblas_handle_container RocblasScopedContextHandler::handle_helper =
-    rocblas_handle_container{};
-
-RocblasScopedContextHandler::RocblasScopedContextHandler(sycl::queue queue,
-                                                         sycl::interop_handle &ih)
-        : interop_h(ih) {}
-
-rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue &queue) {
-    sycl::device device = queue.get_device();
-    int current_device = interop_h.get_native_device<sycl::backend::hip>();
-    hipStream_t streamId = get_stream(queue);
-    rocblas_status err;
-    auto it = handle_helper.rocblas_handle_mapper_.find(current_device);
-    if (it != handle_helper.rocblas_handle_mapper_.end()) {
-        if (it->second == nullptr) {
-            handle_helper.rocblas_handle_mapper_.erase(it);
-        }
-        else {
-            auto handle = it->second->load();
-            if (handle != nullptr) {
-                hipStream_t currentStreamId;
-                ROCBLAS_ERROR_FUNC(rocblas_get_stream, err, handle, &currentStreamId);
-                if (currentStreamId != streamId) {
-                    ROCBLAS_ERROR_FUNC(rocblas_set_stream, err, handle, streamId);
-                }
-                return handle;
-            }
-            else {
-                handle_helper.rocblas_handle_mapper_.erase(it);
-            }
-        }
-    }
-    rocblas_handle handle;
-
-    ROCBLAS_ERROR_FUNC(rocblas_create_handle, err, &handle);
-    ROCBLAS_ERROR_FUNC(rocblas_set_stream, err, handle, streamId);
-
-    auto insert_iter = handle_helper.rocblas_handle_mapper_.insert(
-        std::make_pair(current_device, new std::atomic<rocblas_handle>(handle)));
-    return handle;
-}
-
-hipStream_t RocblasScopedContextHandler::get_stream(const sycl::queue &queue) {
-    return interop_h.get_native_queue<sycl::backend::hip>();
-}
-
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
\ No newline at end of file
diff --git a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp b/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp
deleted file mode 100644
index 3c156ab6c..000000000
--- a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ROCBLAS_SCOPED_HANDLE_HPP_
-#define _ROCBLAS_SCOPED_HANDLE_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <memory>
-#include <thread>
-#include <unordered_map>
-#include "rocblas_helper.hpp"
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-
-struct rocblas_handle_container {
-    using handle_container_t = std::unordered_map<int, std::atomic<rocblas_handle> *>;
-    handle_container_t rocblas_handle_mapper_{};
-    ~rocblas_handle_container() noexcept(false);
-};
-
-class RocblasScopedContextHandler {
-    sycl::interop_handle interop_h;
-    static thread_local rocblas_handle_container handle_helper;
-    sycl::context get_context(const sycl::queue &queue);
-    hipStream_t get_stream(const sycl::queue &queue);
-
-public:
-    RocblasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
-
-    rocblas_handle get_handle(const sycl::queue &queue);
-
-    // This is a work-around function for reinterpret_casting the memory. This
-    // will be fixed when SYCL-2020 has been implemented for Pi backend.
-    template <typename T, typename U>
-    inline T get_mem(U acc) {
-        return reinterpret_cast<T>(interop_h.get_native_mem<sycl::backend::hip>(acc));
-    }
-};
-
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif //_ROCBLAS_SCOPED_HANDLE_HPP_
diff --git a/src/blas/backends/rocblas/rocblas_task.hpp b/src/blas/backends/rocblas/rocblas_task.hpp
deleted file mode 100644
index 94e2b2b4a..000000000
--- a/src/blas/backends/rocblas/rocblas_task.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ROCBLAS_TASK_HPP_
-#define _ROCBLAS_TASK_HPP_
-#include <rocblas/rocblas.h>
-#include <complex>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl/types.hpp"
-#ifndef __HIPSYCL__
-#include "rocblas_scope_handle.hpp"
-#if __has_include(<sycl/detail/pi.hpp>)
-#include <sycl/detail/pi.hpp>
-#else
-#include <CL/sycl/detail/pi.hpp>
-#endif
-#else
-#include "rocblas_scope_handle_hipsycl.hpp"
-
-#endif
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace rocblas {
-
-#ifdef __HIPSYCL__
-template <typename H, typename F>
-static inline void host_task_internal(H &cgh, sycl::queue queue, F f) {
-    cgh.hipSYCL_enqueue_custom_operation([f, queue](sycl::interop_handle ih) {
-        auto sc = RocblasScopedContextHandler(queue, ih);
-        f(sc);
-    });
-}
-#else
-template <typename H, typename F>
-static inline void host_task_internal(H &cgh, sycl::queue queue, F f) {
-    cgh.host_task([f, queue](sycl::interop_handle ih) {
-        auto sc = RocblasScopedContextHandler(queue, ih);
-        f(sc);
-    });
-}
-#endif
-template <typename H, typename F>
-static inline void onemkl_rocblas_host_task(H &cgh, sycl::queue queue, F f) {
-    (void)host_task_internal(cgh, queue, f);
-}
-
-} // namespace rocblas
-} // namespace blas
-} // namespace mkl
-} // namespace oneapi
-#endif // _ROCBLAS_TASK_HPP_
diff --git a/src/blas/backends/rocblas/rocblas_wrappers.cpp b/src/blas/backends/rocblas/rocblas_wrappers.cpp
deleted file mode 100644
index ce4c92da5..000000000
--- a/src/blas/backends/rocblas/rocblas_wrappers.cpp
+++ /dev/null
@@ -1,1008 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) and Computing Centre (URZ)
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "blas/function_table.hpp"
-#include "oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" blas_function_table_t mkl_blas_table = {
-    WRAPPER_VERSION,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::dot,
-    oneapi::mkl::blas::rocblas::column_major::dot,
-    oneapi::mkl::blas::rocblas::column_major::dot,
-    oneapi::mkl::blas::rocblas::column_major::dotc,
-    oneapi::mkl::blas::rocblas::column_major::dotc,
-    oneapi::mkl::blas::rocblas::column_major::dotu,
-    oneapi::mkl::blas::rocblas::column_major::dotu,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotm,
-    oneapi::mkl::blas::rocblas::column_major::rotm,
-    oneapi::mkl::blas::rocblas::column_major::rotmg,
-    oneapi::mkl::blas::rocblas::column_major::rotmg,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::sdsdot,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::ger,
-    oneapi::mkl::blas::rocblas::column_major::ger,
-    oneapi::mkl::blas::rocblas::column_major::gerc,
-    oneapi::mkl::blas::rocblas::column_major::gerc,
-    oneapi::mkl::blas::rocblas::column_major::geru,
-    oneapi::mkl::blas::rocblas::column_major::geru,
-    oneapi::mkl::blas::rocblas::column_major::hbmv,
-    oneapi::mkl::blas::rocblas::column_major::hbmv,
-    oneapi::mkl::blas::rocblas::column_major::hemv,
-    oneapi::mkl::blas::rocblas::column_major::hemv,
-    oneapi::mkl::blas::rocblas::column_major::her,
-    oneapi::mkl::blas::rocblas::column_major::her,
-    oneapi::mkl::blas::rocblas::column_major::her2,
-    oneapi::mkl::blas::rocblas::column_major::her2,
-    oneapi::mkl::blas::rocblas::column_major::hpmv,
-    oneapi::mkl::blas::rocblas::column_major::hpmv,
-    oneapi::mkl::blas::rocblas::column_major::hpr,
-    oneapi::mkl::blas::rocblas::column_major::hpr,
-    oneapi::mkl::blas::rocblas::column_major::hpr2,
-    oneapi::mkl::blas::rocblas::column_major::hpr2,
-    oneapi::mkl::blas::rocblas::column_major::sbmv,
-    oneapi::mkl::blas::rocblas::column_major::sbmv,
-    oneapi::mkl::blas::rocblas::column_major::spmv,
-    oneapi::mkl::blas::rocblas::column_major::spmv,
-    oneapi::mkl::blas::rocblas::column_major::spr,
-    oneapi::mkl::blas::rocblas::column_major::spr,
-    oneapi::mkl::blas::rocblas::column_major::spr2,
-    oneapi::mkl::blas::rocblas::column_major::spr2,
-    oneapi::mkl::blas::rocblas::column_major::symv,
-    oneapi::mkl::blas::rocblas::column_major::symv,
-    oneapi::mkl::blas::rocblas::column_major::syr,
-    oneapi::mkl::blas::rocblas::column_major::syr,
-    oneapi::mkl::blas::rocblas::column_major::syr2,
-    oneapi::mkl::blas::rocblas::column_major::syr2,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::hemm,
-    oneapi::mkl::blas::rocblas::column_major::hemm,
-    oneapi::mkl::blas::rocblas::column_major::herk,
-    oneapi::mkl::blas::rocblas::column_major::herk,
-    oneapi::mkl::blas::rocblas::column_major::her2k,
-    oneapi::mkl::blas::rocblas::column_major::her2k,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::asum,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::axpby,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::copy_batch,
-    oneapi::mkl::blas::rocblas::column_major::dot,
-    oneapi::mkl::blas::rocblas::column_major::dot,
-    oneapi::mkl::blas::rocblas::column_major::dot,
-    oneapi::mkl::blas::rocblas::column_major::dotc,
-    oneapi::mkl::blas::rocblas::column_major::dotc,
-    oneapi::mkl::blas::rocblas::column_major::dotu,
-    oneapi::mkl::blas::rocblas::column_major::dotu,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamin,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::iamax,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::nrm2,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rot,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotg,
-    oneapi::mkl::blas::rocblas::column_major::rotm,
-    oneapi::mkl::blas::rocblas::column_major::rotm,
-    oneapi::mkl::blas::rocblas::column_major::rotmg,
-    oneapi::mkl::blas::rocblas::column_major::rotmg,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::scal,
-    oneapi::mkl::blas::rocblas::column_major::sdsdot,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::swap,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gbmv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::column_major::ger,
-    oneapi::mkl::blas::rocblas::column_major::ger,
-    oneapi::mkl::blas::rocblas::column_major::gerc,
-    oneapi::mkl::blas::rocblas::column_major::gerc,
-    oneapi::mkl::blas::rocblas::column_major::geru,
-    oneapi::mkl::blas::rocblas::column_major::geru,
-    oneapi::mkl::blas::rocblas::column_major::hbmv,
-    oneapi::mkl::blas::rocblas::column_major::hbmv,
-    oneapi::mkl::blas::rocblas::column_major::hemv,
-    oneapi::mkl::blas::rocblas::column_major::hemv,
-    oneapi::mkl::blas::rocblas::column_major::her,
-    oneapi::mkl::blas::rocblas::column_major::her,
-    oneapi::mkl::blas::rocblas::column_major::her2,
-    oneapi::mkl::blas::rocblas::column_major::her2,
-    oneapi::mkl::blas::rocblas::column_major::hpmv,
-    oneapi::mkl::blas::rocblas::column_major::hpmv,
-    oneapi::mkl::blas::rocblas::column_major::hpr,
-    oneapi::mkl::blas::rocblas::column_major::hpr,
-    oneapi::mkl::blas::rocblas::column_major::hpr2,
-    oneapi::mkl::blas::rocblas::column_major::hpr2,
-    oneapi::mkl::blas::rocblas::column_major::sbmv,
-    oneapi::mkl::blas::rocblas::column_major::sbmv,
-    oneapi::mkl::blas::rocblas::column_major::spmv,
-    oneapi::mkl::blas::rocblas::column_major::spmv,
-    oneapi::mkl::blas::rocblas::column_major::spr,
-    oneapi::mkl::blas::rocblas::column_major::spr,
-    oneapi::mkl::blas::rocblas::column_major::spr2,
-    oneapi::mkl::blas::rocblas::column_major::spr2,
-    oneapi::mkl::blas::rocblas::column_major::symv,
-    oneapi::mkl::blas::rocblas::column_major::symv,
-    oneapi::mkl::blas::rocblas::column_major::syr,
-    oneapi::mkl::blas::rocblas::column_major::syr,
-    oneapi::mkl::blas::rocblas::column_major::syr2,
-    oneapi::mkl::blas::rocblas::column_major::syr2,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbmv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tbsv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpmv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::tpsv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trmv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::trsv,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::gemm,
-    oneapi::mkl::blas::rocblas::column_major::hemm,
-    oneapi::mkl::blas::rocblas::column_major::hemm,
-    oneapi::mkl::blas::rocblas::column_major::herk,
-    oneapi::mkl::blas::rocblas::column_major::herk,
-    oneapi::mkl::blas::rocblas::column_major::her2k,
-    oneapi::mkl::blas::rocblas::column_major::her2k,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::symm,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::syr2k,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trmm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::trsm,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemmt,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::omatadd,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::column_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::dot,
-    oneapi::mkl::blas::rocblas::row_major::dot,
-    oneapi::mkl::blas::rocblas::row_major::dot,
-    oneapi::mkl::blas::rocblas::row_major::dotc,
-    oneapi::mkl::blas::rocblas::row_major::dotc,
-    oneapi::mkl::blas::rocblas::row_major::dotu,
-    oneapi::mkl::blas::rocblas::row_major::dotu,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotm,
-    oneapi::mkl::blas::rocblas::row_major::rotm,
-    oneapi::mkl::blas::rocblas::row_major::rotmg,
-    oneapi::mkl::blas::rocblas::row_major::rotmg,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::sdsdot,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::ger,
-    oneapi::mkl::blas::rocblas::row_major::ger,
-    oneapi::mkl::blas::rocblas::row_major::gerc,
-    oneapi::mkl::blas::rocblas::row_major::gerc,
-    oneapi::mkl::blas::rocblas::row_major::geru,
-    oneapi::mkl::blas::rocblas::row_major::geru,
-    oneapi::mkl::blas::rocblas::row_major::hbmv,
-    oneapi::mkl::blas::rocblas::row_major::hbmv,
-    oneapi::mkl::blas::rocblas::row_major::hemv,
-    oneapi::mkl::blas::rocblas::row_major::hemv,
-    oneapi::mkl::blas::rocblas::row_major::her,
-    oneapi::mkl::blas::rocblas::row_major::her,
-    oneapi::mkl::blas::rocblas::row_major::her2,
-    oneapi::mkl::blas::rocblas::row_major::her2,
-    oneapi::mkl::blas::rocblas::row_major::hpmv,
-    oneapi::mkl::blas::rocblas::row_major::hpmv,
-    oneapi::mkl::blas::rocblas::row_major::hpr,
-    oneapi::mkl::blas::rocblas::row_major::hpr,
-    oneapi::mkl::blas::rocblas::row_major::hpr2,
-    oneapi::mkl::blas::rocblas::row_major::hpr2,
-    oneapi::mkl::blas::rocblas::row_major::sbmv,
-    oneapi::mkl::blas::rocblas::row_major::sbmv,
-    oneapi::mkl::blas::rocblas::row_major::spmv,
-    oneapi::mkl::blas::rocblas::row_major::spmv,
-    oneapi::mkl::blas::rocblas::row_major::spr,
-    oneapi::mkl::blas::rocblas::row_major::spr,
-    oneapi::mkl::blas::rocblas::row_major::spr2,
-    oneapi::mkl::blas::rocblas::row_major::spr2,
-    oneapi::mkl::blas::rocblas::row_major::symv,
-    oneapi::mkl::blas::rocblas::row_major::symv,
-    oneapi::mkl::blas::rocblas::row_major::syr,
-    oneapi::mkl::blas::rocblas::row_major::syr,
-    oneapi::mkl::blas::rocblas::row_major::syr2,
-    oneapi::mkl::blas::rocblas::row_major::syr2,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::hemm,
-    oneapi::mkl::blas::rocblas::row_major::hemm,
-    oneapi::mkl::blas::rocblas::row_major::herk,
-    oneapi::mkl::blas::rocblas::row_major::herk,
-    oneapi::mkl::blas::rocblas::row_major::her2k,
-    oneapi::mkl::blas::rocblas::row_major::her2k,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::asum,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpy_batch,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::axpby,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::copy_batch,
-    oneapi::mkl::blas::rocblas::row_major::dot,
-    oneapi::mkl::blas::rocblas::row_major::dot,
-    oneapi::mkl::blas::rocblas::row_major::dot,
-    oneapi::mkl::blas::rocblas::row_major::dotc,
-    oneapi::mkl::blas::rocblas::row_major::dotc,
-    oneapi::mkl::blas::rocblas::row_major::dotu,
-    oneapi::mkl::blas::rocblas::row_major::dotu,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamin,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::iamax,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::nrm2,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rot,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotg,
-    oneapi::mkl::blas::rocblas::row_major::rotm,
-    oneapi::mkl::blas::rocblas::row_major::rotm,
-    oneapi::mkl::blas::rocblas::row_major::rotmg,
-    oneapi::mkl::blas::rocblas::row_major::rotmg,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::scal,
-    oneapi::mkl::blas::rocblas::row_major::sdsdot,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::swap,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gbmv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemv_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::dgmm_batch,
-    oneapi::mkl::blas::rocblas::row_major::ger,
-    oneapi::mkl::blas::rocblas::row_major::ger,
-    oneapi::mkl::blas::rocblas::row_major::gerc,
-    oneapi::mkl::blas::rocblas::row_major::gerc,
-    oneapi::mkl::blas::rocblas::row_major::geru,
-    oneapi::mkl::blas::rocblas::row_major::geru,
-    oneapi::mkl::blas::rocblas::row_major::hbmv,
-    oneapi::mkl::blas::rocblas::row_major::hbmv,
-    oneapi::mkl::blas::rocblas::row_major::hemv,
-    oneapi::mkl::blas::rocblas::row_major::hemv,
-    oneapi::mkl::blas::rocblas::row_major::her,
-    oneapi::mkl::blas::rocblas::row_major::her,
-    oneapi::mkl::blas::rocblas::row_major::her2,
-    oneapi::mkl::blas::rocblas::row_major::her2,
-    oneapi::mkl::blas::rocblas::row_major::hpmv,
-    oneapi::mkl::blas::rocblas::row_major::hpmv,
-    oneapi::mkl::blas::rocblas::row_major::hpr,
-    oneapi::mkl::blas::rocblas::row_major::hpr,
-    oneapi::mkl::blas::rocblas::row_major::hpr2,
-    oneapi::mkl::blas::rocblas::row_major::hpr2,
-    oneapi::mkl::blas::rocblas::row_major::sbmv,
-    oneapi::mkl::blas::rocblas::row_major::sbmv,
-    oneapi::mkl::blas::rocblas::row_major::spmv,
-    oneapi::mkl::blas::rocblas::row_major::spmv,
-    oneapi::mkl::blas::rocblas::row_major::spr,
-    oneapi::mkl::blas::rocblas::row_major::spr,
-    oneapi::mkl::blas::rocblas::row_major::spr2,
-    oneapi::mkl::blas::rocblas::row_major::spr2,
-    oneapi::mkl::blas::rocblas::row_major::symv,
-    oneapi::mkl::blas::rocblas::row_major::symv,
-    oneapi::mkl::blas::rocblas::row_major::syr,
-    oneapi::mkl::blas::rocblas::row_major::syr,
-    oneapi::mkl::blas::rocblas::row_major::syr2,
-    oneapi::mkl::blas::rocblas::row_major::syr2,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbmv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tbsv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpmv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::tpsv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trmv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::trsv,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::gemm,
-    oneapi::mkl::blas::rocblas::row_major::hemm,
-    oneapi::mkl::blas::rocblas::row_major::hemm,
-    oneapi::mkl::blas::rocblas::row_major::herk,
-    oneapi::mkl::blas::rocblas::row_major::herk,
-    oneapi::mkl::blas::rocblas::row_major::her2k,
-    oneapi::mkl::blas::rocblas::row_major::her2k,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::symm,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syrk_batch,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::syr2k,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trmm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::trsm,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::trsm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemm_batch,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemmt,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::gemm_bias,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatadd_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy2,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::omatadd,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::omatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-    oneapi::mkl::blas::rocblas::row_major::imatcopy_batch,
-};
diff --git a/src/blas/blas_loader.cpp b/src/blas/blas_loader.cpp
deleted file mode 100644
index c1f1339c6..000000000
--- a/src/blas/blas_loader.cpp
+++ /dev/null
@@ -1,7898 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/blas/detail/blas_loader.hpp"
-
-#include "function_table_initializer.hpp"
-#include "blas/function_table.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace blas {
-namespace column_major {
-namespace detail {
-
-static oneapi::mkl::detail::table_initializer<domain::blas, blas_function_table_t> function_tables;
-
-// Buffer APIs
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    function_tables[libkey].column_major_scasum_sycl(queue, n, x, incx, result);
-}
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    function_tables[libkey].column_major_dzasum_sycl(queue, n, x, incx, result);
-}
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    function_tables[libkey].column_major_sasum_sycl(queue, n, x, incx, result);
-}
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    function_tables[libkey].column_major_dasum_sycl(queue, n, x, incx, result);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_saxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_daxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_caxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zaxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_saxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_daxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].column_major_caxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zaxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-           std::int64_t incy) {
-    function_tables[libkey].column_major_saxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-           std::int64_t incy) {
-    function_tables[libkey].column_major_daxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_caxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zaxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_scopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_dcopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_ccopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zcopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_scopy_batch_strided_sycl(queue, n, x, incx, stridex, y,
-                                                                  incy, stridey, batch_size);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_dcopy_batch_strided_sycl(queue, n, x, incx, stridex, y,
-                                                                  incy, stridey, batch_size);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_ccopy_batch_strided_sycl(queue, n, x, incx, stridex, y,
-                                                                  incy, stridey, batch_size);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_zcopy_batch_strided_sycl(queue, n, x, incx, stridex, y,
-                                                                  incy, stridey, batch_size);
-}
-
-void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &result) {
-    function_tables[libkey].column_major_sdot_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    function_tables[libkey].column_major_ddot_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    function_tables[libkey].column_major_dsdot_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    function_tables[libkey].column_major_cdotc_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    function_tables[libkey].column_major_zdotc_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    function_tables[libkey].column_major_cdotu_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    function_tables[libkey].column_major_zdotu_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_isamin_sycl(queue, n, x, incx, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_idamin_sycl(queue, n, x, incx, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_icamin_sycl(queue, n, x, incx, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_izamin_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_isamax_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_idamax_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_icamax_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].column_major_izamax_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    function_tables[libkey].column_major_scnrm2_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    function_tables[libkey].column_major_dznrm2_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    function_tables[libkey].column_major_snrm2_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    function_tables[libkey].column_major_dnrm2_sycl(queue, n, x, incx, result);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c, float s) {
-    function_tables[libkey].column_major_srot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c, double s) {
-    function_tables[libkey].column_major_drot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    function_tables[libkey].column_major_csrot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    function_tables[libkey].column_major_zdrot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s) {
-    function_tables[libkey].column_major_srotg_sycl(queue, a, b, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s) {
-    function_tables[libkey].column_major_drotg_sycl(queue, a, b, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    function_tables[libkey].column_major_crotg_sycl(queue, a, b, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    function_tables[libkey].column_major_zrotg_sycl(queue, a, b, c, s);
-}
-
-void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &param) {
-    function_tables[libkey].column_major_srotm_sycl(queue, n, x, incx, y, incy, param);
-}
-
-void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &param) {
-    function_tables[libkey].column_major_drotm_sycl(queue, n, x, incx, y, incy, param);
-}
-
-void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    function_tables[libkey].column_major_srotmg_sycl(queue, d1, d2, x1, y1, param);
-}
-
-void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    function_tables[libkey].column_major_drotmg_sycl(queue, d1, d2, x1, y1, param);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_sscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_dscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_cscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_csscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_zscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_zdscal_sycl(queue, n, alpha, x, incx);
-}
-
-void sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb,
-            sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-            std::int64_t incy, sycl::buffer<float, 1> &result) {
-    function_tables[libkey].column_major_sdsdot_sycl(queue, n, sb, x, incx, y, incy, result);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_sswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_dswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_cswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_sgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_dgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_cgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_sgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_dgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_cgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, float beta, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].column_major_sgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                  stridea, x, incx, stridex, beta,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].column_major_dgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                  stridea, x, incx, stridex, beta,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_cgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                  stridea, x, incx, stridex, beta,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                  stridea, x, incx, stridex, beta,
-                                                                  y, incy, stridey, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_sdgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_ddgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    function_tables[libkey].column_major_cdgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zdgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-         std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_sger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_dger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_cgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_zgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_cgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_zgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_chbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zhbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_chemv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zhemv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_cher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_zher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_cher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a, lda);
-}
-
-void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_zher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a, lda);
-}
-
-void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_chpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta,
-                                                    y, incy);
-}
-
-void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_zhpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta,
-                                                    y, incy);
-}
-
-void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    function_tables[libkey].column_major_chpr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    function_tables[libkey].column_major_zhpr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    function_tables[libkey].column_major_chpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a);
-}
-
-void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    function_tables[libkey].column_major_zhpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a);
-}
-
-void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_ssbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].column_major_dsbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x,
-                                                    incx, beta, y, incy);
-}
-
-void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_sspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta,
-                                                    y, incy);
-}
-
-void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_dspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta,
-                                                    y, incy);
-}
-
-void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a) {
-    function_tables[libkey].column_major_sspr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a) {
-    function_tables[libkey].column_major_dspr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a) {
-    function_tables[libkey].column_major_sspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a);
-}
-
-void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a) {
-    function_tables[libkey].column_major_dspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a);
-}
-
-void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_ssymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    function_tables[libkey].column_major_dsymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                    beta, y, incy);
-}
-
-void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    function_tables[libkey].column_major_ssyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda) {
-    function_tables[libkey].column_major_dsyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_ssyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a, lda);
-}
-
-void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    function_tables[libkey].column_major_dsyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy,
-                                                    a, lda);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_stbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_dtbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ctbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ztbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_stbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_dtbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ctbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ztbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a,
-                                                    lda, x, incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].column_major_stpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].column_major_dtpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ctpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ztpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].column_major_stpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].column_major_dtpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ctpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ztpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                    incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_strmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_dtrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ctrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ztrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_strsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_dtrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ctrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].column_major_ztrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda,
-                                                    x, incx);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_sgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_dgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_cgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_hgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda,
-                                                    b, ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_gemm_f16f16f32_sycl(queue, transa, transb, m, n, k, alpha,
-                                                             a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a,
-          std::int64_t lda, sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_gemm_bf16bf16f32_sycl(queue, transa, transb, m, n, k,
-                                                               alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_chemm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                    lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zhemm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                    lda, b, ldb, beta, c, ldc);
-}
-
-void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_cherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                    beta, c, ldc);
-}
-
-void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<std::complex<double>, 1> &c,
-          std::int64_t ldc) {
-    function_tables[libkey].column_major_zherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                    beta, c, ldc);
-}
-
-void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_cher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                     b, ldb, beta, c, ldc);
-}
-
-void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                     b, ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc) {
-    function_tables[libkey].column_major_ssymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                    lda, b, ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_dsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                    lda, b, ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_csymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                    lda, b, ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                    lda, b, ldb, beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_ssyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                    beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_dsyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                    beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_csyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                    beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zsyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                    beta, c, ldc);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_ssyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                                  alpha, a, lda, stride_a, beta, c,
-                                                                  ldc, stride_c, batch_size);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_dsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                                  alpha, a, lda, stride_a, beta, c,
-                                                                  ldc, stride_c, batch_size);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_csyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                                  alpha, a, lda, stride_a, beta, c,
-                                                                  ldc, stride_c, batch_size);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                                  alpha, a, lda, stride_a, beta, c,
-                                                                  ldc, stride_c, batch_size);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-           sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           std::int64_t ldc) {
-    function_tables[libkey].column_major_ssyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                     b, ldb, beta, c, ldc);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_dsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                     b, ldb, beta, c, ldc);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_csyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                     b, ldb, beta, c, ldc);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                     b, ldb, beta, c, ldc);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].column_major_strmm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].column_major_dtrmm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_ctrmm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_ztrmm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].column_major_strsm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].column_major_dtrsm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_ctrsm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_ztrsm_sycl(queue, left_right, upper_lower, trans,
-                                                    unit_diag, m, n, alpha, a, lda, b, ldb);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_sgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_dgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_cgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_hgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_gemm_f16f16f32_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_gemm_s8s8f32_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_gemm_s8s8s32_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_strsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    function_tables[libkey].column_major_dtrsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].column_major_ctrsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].column_major_ztrsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_sgemmt_sycl(queue, upper_lower, transa, transb, n, k,
-                                                     alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, double alpha,
-           sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_dgemmt_sycl(queue, upper_lower, transa, transb, n, k,
-                                                     alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_cgemmt_sycl(queue, upper_lower, transa, transb, n, k,
-                                                     alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zgemmt_sycl(queue, upper_lower, transa, transb, n, k,
-                                                     alpha, a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-               std::int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-               std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].column_major_gemm_s8u8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-               std::int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-               std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].column_major_gemm_s8s8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].column_major_gemm_u8s8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].column_major_gemm_u8u8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].column_major_somatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].column_major_domatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].column_major_comatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zomatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].column_major_simatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                      lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].column_major_dimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                      lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].column_major_cimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                      lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                      lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                   sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a, float beta,
-                   sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                   sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                   std::int64_t batch_size) {
-    function_tables[libkey].column_major_somatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                   sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a, double beta,
-                   sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                   sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                   std::int64_t batch_size) {
-    function_tables[libkey].column_major_domatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_comatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].column_major_zomatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_somatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_domatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_comatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].column_major_zomatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    function_tables[libkey].column_major_somatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea,
-                                                         b, ldb, strideb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    function_tables[libkey].column_major_domatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea,
-                                                         b, ldb, strideb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    function_tables[libkey].column_major_comatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea,
-                                                         b, ldb, strideb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    function_tables[libkey].column_major_zomatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea,
-                                                         b, ldb, strideb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    function_tables[libkey].column_major_simatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    function_tables[libkey].column_major_dimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    function_tables[libkey].column_major_cimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    function_tables[libkey].column_major_zimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_somatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                       beta, b, ldb, c, ldc);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_domatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                       beta, b, ldb, c, ldc);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_comatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                       beta, b, ldb, c, ldc);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].column_major_zomatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                       beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_scasum_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dzasum_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sasum_usm_sycl(queue, n, x, incx, result,
-                                                               dependencies);
-}
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dasum_usm_sycl(queue, n, x, incx, result,
-                                                               dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                 const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_saxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                 const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_daxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_caxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zaxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       float *alpha, const float **x, std::int64_t *incx, float **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_saxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       double *alpha, const double **x, std::int64_t *incx, double **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_daxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **x,
-                       std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_caxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **x,
-                       std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zaxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                       const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_saxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_daxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<float> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_caxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<double> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zaxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                  const float *x, std::int64_t incx, const float beta, float *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_saxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                                incy, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                  const double *x, std::int64_t incx, const double beta, double *y,
-                  std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_daxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                                incy, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                  const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_caxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                                incy, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                  const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zaxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                                incy, dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                 std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_scopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dcopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ccopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zcopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_scopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dcopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ccopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zcopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_scopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dcopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ccopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zcopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sdot_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                              dependencies);
-}
-
-sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                std::int64_t incx, const double *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ddot_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                              dependencies);
-}
-
-sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                std::int64_t incx, const float *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsdot_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                               dependencies);
-}
-
-sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cdotc_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                               dependencies);
-}
-
-sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zdotc_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                               dependencies);
-}
-
-sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cdotu_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                               dependencies);
-}
-
-sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zdotu_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                               dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_isamin_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_idamin_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_icamin_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_izamin_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_isamax_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_idamax_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_icamax_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_izamax_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_scnrm2_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dznrm2_usm_sycl(queue, n, x, incx, result,
-                                                                dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_snrm2_usm_sycl(queue, n, x, incx, result,
-                                                               dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dnrm2_usm_sycl(queue, n, x, incx, result,
-                                                               dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                std::int64_t incy, float c, float s, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_csrot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                               dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zdrot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                               dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x,
-                std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_srot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                              dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x,
-                std::int64_t incx, double *y, std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_drot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                              dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, float *a, float *b, float *c,
-                 float *s, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_srotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, double *a, double *b, double *c,
-                 double *s, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_drotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex<float> *a,
-                 std::complex<float> *b, float *c, std::complex<float> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_crotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex<double> *a,
-                 std::complex<double> *b, double *c, std::complex<double> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zrotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x,
-                 std::int64_t incx, float *y, std::int64_t incy, float *param,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_srotm_usm_sycl(queue, n, x, incx, y, incy, param,
-                                                               dependencies);
-}
-
-sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x,
-                 std::int64_t incx, double *y, std::int64_t incy, double *param,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_drotm_usm_sycl(queue, n, x, incx, y, incy, param,
-                                                               dependencies);
-}
-
-sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, float *d1, float *d2, float *x1,
-                  float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_srotmg_usm_sycl(queue, d1, d2, x1, y1, param,
-                                                                dependencies);
-}
-
-sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, double *d1, double *d2,
-                  double *x1, double y1, double *param,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_drotmg_usm_sycl(queue, d1, d2, x1, y1, param,
-                                                                dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sscal_usm_sycl(queue, n, alpha, x, incx,
-                                                               dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dscal_usm_sycl(queue, n, alpha, x, incx,
-                                                               dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cscal_usm_sycl(queue, n, alpha, x, incx,
-                                                               dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zscal_usm_sycl(queue, n, alpha, x, incx,
-                                                               dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_csscal_usm_sycl(queue, n, alpha, x, incx,
-                                                                dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zdscal_usm_sycl(queue, n, alpha, x, incx,
-                                                                dependencies);
-}
-
-sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb,
-                   const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                   float *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sdsdot_usm_sycl(queue, n, sb, x, incx, y, incy,
-                                                                result, dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x,
-                 std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                               dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                 std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double *a,
-                 std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                 std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                 std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, float alpha, const float *a,
-                       std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float beta, float *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, double alpha, const double *a,
-                       std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double beta, double *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                       std::int64_t *lda, const float **x, std::int64_t *incx, float *beta,
-                       float **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                       std::int64_t *lda, const double **x, std::int64_t *incx, double *beta,
-                       double **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **a, std::int64_t *lda,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> *beta,
-                       std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **a, std::int64_t *lda,
-                       const std::complex<double> **x, std::int64_t *incx,
-                       std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const float *a, std::int64_t lda,
-                       std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sdgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const double *a, std::int64_t lda,
-                       std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ddgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cdgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zdgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda,
-                       const float **x, std::int64_t *incx, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sdgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda,
-                       const double **x, std::int64_t *incx, double **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ddgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const std::complex<float> **a,
-                       std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                       std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cdgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                       std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zdgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                float *a, std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sger_usm_sycl(queue, m, n, alpha, x, incx, y, incy,
-                                                              a, lda, dependencies);
-}
-
-sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, const double *y,
-                std::int64_t incy, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dger_usm_sycl(queue, m, n, alpha, x, incx, y, incy,
-                                                              a, lda, dependencies);
-}
-
-sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy,
-                                                               a, lda, dependencies);
-}
-
-sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy,
-                                                               a, lda, dependencies);
-}
-
-sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy,
-                                                               a, lda, dependencies);
-}
-
-sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy,
-                                                               a, lda, dependencies);
-}
-
-sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_chbmv_usm_sycl(
-        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zhbmv_usm_sycl(
-        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_chemv_usm_sycl(
-        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zhemv_usm_sycl(
-        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cher_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, lda, dependencies);
-}
-
-sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zher_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, lda, dependencies);
-}
-
-sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cher2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zher2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_chpmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zhpmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_chpr_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, dependencies);
-}
-
-sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zhpr_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, dependencies);
-}
-
-sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_chpr2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, dependencies);
-}
-
-sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zhpr2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, dependencies);
-}
-
-sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssbmv_usm_sycl(
-        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsbmv_usm_sycl(
-        queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *a, const float *x, std::int64_t incx, float beta,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sspmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *a, const double *x, std::int64_t incx, double beta,
-                 double *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dspmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                               incx, beta, y, incy, dependencies);
-}
-
-sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, float *a,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sspr_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, dependencies);
-}
-
-sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, double *a,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dspr_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, dependencies);
-}
-
-sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                 float *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sspr2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, dependencies);
-}
-
-sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *x, std::int64_t incx, const double *y,
-                 std::int64_t incy, double *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dspr2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, dependencies);
-}
-
-sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx,
-                 float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssymv_usm_sycl(
-        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsymv_usm_sycl(
-        queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssyr_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, lda, dependencies);
-}
-
-sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsyr_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                              a, lda, dependencies);
-}
-
-sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                 float *a, std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssyr2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *x, std::int64_t incx, const double *y,
-                 std::int64_t incy, double *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsyr2_usm_sycl(queue, upper_lower, n, alpha, x,
-                                                               incx, y, incy, a, lda, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_stbmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtbmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctbmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztbmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_stbsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtbsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctbsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztbsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_stpmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtpmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctpmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztpmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_stpsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtpsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctpsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztpsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_strmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtrmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctrmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztrmv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_strsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtrsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctrsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztrsv_usm_sycl(queue, upper_lower, trans, unit_diag,
-                                                               n, a, lda, x, incx, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                 std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                 std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                 const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                 sycl::half beta, sycl::half *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_hgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                 std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_f16f16f32_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                 std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_bf16bf16f32_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_chemm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zhemm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
-                 std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cherk_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
-                 std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zherk_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cher2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, double beta, std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zher2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                 const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                 const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_csymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zsymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                 float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssyrk_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                 double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsyrk_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                 std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_csyrk_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                 std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zsyrk_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha,
-                       const float **a, std::int64_t *lda, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha,
-                       const double **a, std::int64_t *lda, double *beta, double **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_csyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zsyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                       std::int64_t lda, std::int64_t stride_a, float beta, float *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                       const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                       std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_csyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                       std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zsyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                  const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ssyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                  const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dsyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_csyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zsyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_strmm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtrmm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctrmm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztrmm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_strsm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtrsm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctrsm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztrsm_usm_sycl(queue, left_right, upper_lower,
-                                                               trans, unit_diag, m, n, alpha, a,
-                                                               lda, b, ldb, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                       std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_strsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                       std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtrsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctrsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<double> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztrsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b,
-                       std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_strsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                       double **b, std::int64_t *ldb, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dtrsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-                       std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ctrsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_ztrsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const float **a, std::int64_t *lda, const float **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       double *alpha, const double **a, std::int64_t *lda, const double **b,
-                       std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
-                       std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                       std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                       const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_hgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_f16f16f32_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_s8s8f32_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_s8s8s32_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                       const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                       const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::complex<float> beta, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::complex<double> beta, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                       std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                       std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_hgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_f16f16f32_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_s8s8f32_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_s8s8s32_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k, float alpha,
-                  const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
-                  float *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_sgemmt_usm_sycl(queue, upper_lower, transa, transb,
-                                                                n, k, alpha, a, lda, b, ldb, beta,
-                                                                c, ldc, dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha,
-                  const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
-                  double *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dgemmt_usm_sycl(queue, upper_lower, transa, transb,
-                                                                n, k, alpha, a, lda, b, ldb, beta,
-                                                                c, ldc, dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                  std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                  const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cgemmt_usm_sycl(queue, upper_lower, transa, transb,
-                                                                n, k, alpha, a, lda, b, ldb, beta,
-                                                                c, ldc, dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                  std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                  const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                  std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zgemmt_usm_sycl(queue, upper_lower, transa, transb,
-                                                                n, k, alpha, a, lda, b, ldb, beta,
-                                                                c, ldc, dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                      std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_s8u8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                      std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_s8s8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                      std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_u8s8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                      std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_gemm_u8u8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_somatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_domatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_comatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zomatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_simatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dimatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cimatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zimatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_somatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_domatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_comatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zomatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                     float *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_somatcopy_usm_sycl(queue, trans, m, n, alpha, a,
-                                                                   lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_domatcopy_usm_sycl(queue, trans, m, n, alpha, a,
-                                                                   lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_comatcopy_usm_sycl(queue, trans, m, n, alpha, a,
-                                                                   lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zomatcopy_usm_sycl(queue, trans, m, n, alpha, a,
-                                                                   lda, b, ldb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_somatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, double alpha, const double *a,
-                      std::int64_t lda, std::int64_t stridea, double *b, std::int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_domatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                      std::complex<float> *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_comatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                      std::complex<double> *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zomatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_simatcopy_usm_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, double alpha, double *ab, std::int64_t lda,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dimatcopy_usm_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cimatcopy_usm_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zimatcopy_usm_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, float alpha, const float *a,
-                    std::int64_t lda, float beta, const float *b, std::int64_t ldb, float *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_somatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, double alpha, const double *a,
-                    std::int64_t lda, double beta, const double *b, std::int64_t ldb, double *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_domatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_comatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zomatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                           std::int64_t *lda, float **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_somatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                           std::int64_t *lda, double **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_domatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_comatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zomatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, float **ab,
-                           std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_simatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, double **ab,
-                           std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_dimatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           std::complex<float> **ab, std::int64_t *lda, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_cimatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           std::complex<double> **ab, std::int64_t *lda, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].column_major_zimatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-} //namespace detail
-} //namespace column_major
-namespace row_major {
-namespace detail {
-
-static oneapi::mkl::detail::table_initializer<domain::blas, blas_function_table_t> function_tables;
-
-// Buffer APIs
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    function_tables[libkey].row_major_scasum_sycl(queue, n, x, incx, result);
-}
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    function_tables[libkey].row_major_dzasum_sycl(queue, n, x, incx, result);
-}
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    function_tables[libkey].row_major_sasum_sycl(queue, n, x, incx, result);
-}
-
-void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    function_tables[libkey].row_major_dasum_sycl(queue, n, x, incx, result);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_saxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_daxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_caxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zaxpy_sycl(queue, n, alpha, x, incx, y, incy);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_saxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y,
-                                                               incy, stridey, batch_size);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_daxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y,
-                                                               incy, stridey, batch_size);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].row_major_caxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y,
-                                                               incy, stridey, batch_size);
-}
-
-void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zaxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y,
-                                                               incy, stridey, batch_size);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-           sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-           std::int64_t incy) {
-    function_tables[libkey].row_major_saxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-           sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-           std::int64_t incy) {
-    function_tables[libkey].row_major_daxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_caxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zaxpby_sycl(queue, n, alpha, x, incx, beta, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_scopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_dcopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_ccopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zcopy_sycl(queue, n, x, incx, y, incy);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_scopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy,
-                                                               stridey, batch_size);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_dcopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy,
-                                                               stridey, batch_size);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_ccopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy,
-                                                               stridey, batch_size);
-}
-
-void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_zcopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy,
-                                                               stridey, batch_size);
-}
-
-void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<float, 1> &result) {
-    function_tables[libkey].row_major_sdot_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    function_tables[libkey].row_major_ddot_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-         sycl::buffer<double, 1> &result) {
-    function_tables[libkey].row_major_dsdot_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    function_tables[libkey].row_major_cdotc_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    function_tables[libkey].row_major_zdotc_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &result) {
-    function_tables[libkey].row_major_cdotu_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &result) {
-    function_tables[libkey].row_major_zdotu_sycl(queue, n, x, incx, y, incy, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_isamin_sycl(queue, n, x, incx, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_idamin_sycl(queue, n, x, incx, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_icamin_sycl(queue, n, x, incx, result);
-}
-
-void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_izamin_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_isamax_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_idamax_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_icamax_sycl(queue, n, x, incx, result);
-}
-
-void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-           sycl::buffer<std::int64_t, 1> &result) {
-    function_tables[libkey].row_major_izamax_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<float, 1> &result) {
-    function_tables[libkey].row_major_scnrm2_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<double, 1> &result) {
-    function_tables[libkey].row_major_dznrm2_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &result) {
-    function_tables[libkey].row_major_snrm2_sycl(queue, n, x, incx, result);
-}
-
-void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &result) {
-    function_tables[libkey].row_major_dnrm2_sycl(queue, n, x, incx, result);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-         sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c, float s) {
-    function_tables[libkey].row_major_srot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-         sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy, double c, double s) {
-    function_tables[libkey].row_major_drot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-         std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy, float c, float s) {
-    function_tables[libkey].row_major_csrot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-         std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy, double c, double s) {
-    function_tables[libkey].row_major_zdrot_sycl(queue, n, x, incx, y, incy, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<float, 1> &a,
-          sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c, sycl::buffer<float, 1> &s) {
-    function_tables[libkey].row_major_srotg_sycl(queue, a, b, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<double, 1> &a,
-          sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c, sycl::buffer<double, 1> &s) {
-    function_tables[libkey].row_major_drotg_sycl(queue, a, b, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-          sycl::buffer<std::complex<float>, 1> &s) {
-    function_tables[libkey].row_major_crotg_sycl(queue, a, b, c, s);
-}
-
-void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &b, sycl::buffer<double, 1> &c,
-          sycl::buffer<std::complex<double>, 1> &s) {
-    function_tables[libkey].row_major_zrotg_sycl(queue, a, b, c, s);
-}
-
-void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-          sycl::buffer<float, 1> &param) {
-    function_tables[libkey].row_major_srotm_sycl(queue, n, x, incx, y, incy, param);
-}
-
-void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &param) {
-    function_tables[libkey].row_major_drotm_sycl(queue, n, x, incx, y, incy, param);
-}
-
-void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<float, 1> &d1,
-           sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-           sycl::buffer<float, 1> &param) {
-    function_tables[libkey].row_major_srotmg_sycl(queue, d1, d2, x1, y1, param);
-}
-
-void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer<double, 1> &d1,
-           sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1, double y1,
-           sycl::buffer<double, 1> &param) {
-    function_tables[libkey].row_major_drotmg_sycl(queue, d1, d2, x1, y1, param);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_sscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_dscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_cscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_csscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_zscal_sycl(queue, n, alpha, x, incx);
-}
-
-void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_zdscal_sycl(queue, n, alpha, x, incx);
-}
-
-void sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb,
-            sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-            std::int64_t incy, sycl::buffer<float, 1> &result) {
-    function_tables[libkey].row_major_sdsdot_sycl(queue, n, sb, x, incx, y, incy, result);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-          std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_sswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_dswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_cswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zswap_sycl(queue, n, x, incx, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-          sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_sgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_dgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_cgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_sgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                                                 y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_dgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                                                 y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_cgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                                                 y, incy);
-}
-
-void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-          std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta,
-                                                 y, incy);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<float, 1> &x, std::int64_t incx,
-                std::int64_t stridex, float beta, sycl::buffer<float, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].row_major_sgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                               stridea, x, incx, stridex, beta, y,
-                                                               incy, stridey, batch_size);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<double, 1> &x, std::int64_t incx,
-                std::int64_t stridex, double beta, sycl::buffer<double, 1> &y, std::int64_t incy,
-                std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].row_major_dgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                               stridea, x, incx, stridex, beta, y,
-                                                               incy, stridey, batch_size);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x,
-                std::int64_t incx, std::int64_t stridex, std::complex<float> beta,
-                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, std::int64_t stridey,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_cgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                               stridea, x, incx, stridex, beta, y,
-                                                               incy, stridey, batch_size);
-}
-
-void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                std::int64_t n, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-                std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda,
-                                                               stridea, x, incx, stridex, beta, y,
-                                                               incy, stridey, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<float, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_sdgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stridea,
-                sycl::buffer<double, 1> &x, std::int64_t incx, std::int64_t stridex,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stridec,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_ddgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    function_tables[libkey].row_major_cdgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m,
-                std::int64_t n, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                std::int64_t stridex, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-                std::int64_t stridec, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zdgmm_batch_strided_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size);
-}
-
-void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-         std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_sger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-         std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_dger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_cgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_zgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_cgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_zgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda);
-}
-
-void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_chbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zhbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_chemv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zhemv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_cher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_zher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_cher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a,
-                                                 lda);
-}
-
-void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_zher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a,
-                                                 lda);
-}
-
-void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_chpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y,
-                                                 incy);
-}
-
-void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_zhpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y,
-                                                 incy);
-}
-
-void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<float>, 1> &a) {
-    function_tables[libkey].row_major_chpr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-         sycl::buffer<std::complex<double>, 1> &a) {
-    function_tables[libkey].row_major_zhpr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<float>, 1> &a) {
-    function_tables[libkey].row_major_chpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-          sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-          sycl::buffer<std::complex<double>, 1> &a) {
-    function_tables[libkey].row_major_zhpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_ssbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          std::int64_t k, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx, double beta, sycl::buffer<double, 1> &y,
-          std::int64_t incy) {
-    function_tables[libkey].row_major_dsbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x, std::int64_t incx,
-          float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_sspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y,
-                                                 incy);
-}
-
-void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x, std::int64_t incx,
-          double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_dspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y,
-                                                 incy);
-}
-
-void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a) {
-    function_tables[libkey].row_major_sspr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a) {
-    function_tables[libkey].row_major_dspr_sycl(queue, upper_lower, n, alpha, x, incx, a);
-}
-
-void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a) {
-    function_tables[libkey].row_major_sspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a) {
-    function_tables[libkey].row_major_dspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a);
-}
-
-void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &x,
-          std::int64_t incx, float beta, sycl::buffer<float, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_ssymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &x,
-          std::int64_t incx, double beta, sycl::buffer<double, 1> &y, std::int64_t incy) {
-    function_tables[libkey].row_major_dsymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx,
-                                                 beta, y, incy);
-}
-
-void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &a,
-         std::int64_t lda) {
-    function_tables[libkey].row_major_ssyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-         double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &a,
-         std::int64_t lda) {
-    function_tables[libkey].row_major_dsyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda);
-}
-
-void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          float alpha, sycl::buffer<float, 1> &x, std::int64_t incx, sycl::buffer<float, 1> &y,
-          std::int64_t incy, sycl::buffer<float, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_ssyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a,
-                                                 lda);
-}
-
-void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-          double alpha, sycl::buffer<double, 1> &x, std::int64_t incx, sycl::buffer<double, 1> &y,
-          std::int64_t incy, sycl::buffer<double, 1> &a, std::int64_t lda) {
-    function_tables[libkey].row_major_dsyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a,
-                                                 lda);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_stbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_dtbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ctbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ztbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_stbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_dtbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ctbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ztbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda,
-                                                 x, incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].row_major_stpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].row_major_dtpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ctpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ztpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].row_major_stpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, sycl::buffer<double, 1> &x,
-          std::int64_t incx) {
-    function_tables[libkey].row_major_dtpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ctpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ztpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x,
-                                                 incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_strmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_dtrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ctrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ztrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_strsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-          sycl::buffer<double, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_dtrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ctrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          diag unit_diag, std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx) {
-    function_tables[libkey].row_major_ztrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x,
-                                                 incx);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-          std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_sgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b,
-                                                 ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_dgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b,
-                                                 ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_cgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b,
-                                                 ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b,
-                                                 ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_hgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b,
-                                                 ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-          sycl::buffer<sycl::half, 1> &a, std::int64_t lda, sycl::buffer<sycl::half, 1> &b,
-          std::int64_t ldb, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_gemm_f16f16f32_sycl(queue, transa, transb, m, n, k, alpha, a,
-                                                          lda, b, ldb, beta, c, ldc);
-}
-
-void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-          std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<bfloat16, 1> &a,
-          std::int64_t lda, sycl::buffer<bfloat16, 1> &b, std::int64_t ldb, float beta,
-          sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_gemm_bf16bf16f32_sycl(queue, transa, transb, m, n, k, alpha,
-                                                            a, lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_chemm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                 lda, b, ldb, beta, c, ldc);
-}
-
-void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zhemm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                 lda, b, ldb, beta, c, ldc);
-}
-
-void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<std::complex<float>, 1> &a,
-          std::int64_t lda, float beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_cherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                 beta, c, ldc);
-}
-
-void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<std::complex<double>, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<std::complex<double>, 1> &c,
-          std::int64_t ldc) {
-    function_tables[libkey].row_major_zherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                 beta, c, ldc);
-}
-
-void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_cher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b,
-                                                  ldb, beta, c, ldc);
-}
-
-void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b,
-                                                  ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-          std::int64_t ldc) {
-    function_tables[libkey].row_major_ssymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                 lda, b, ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-          sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_dsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                 lda, b, ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_csymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                 lda, b, ldb, beta, c, ldc);
-}
-
-void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          std::int64_t m, std::int64_t n, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a,
-                                                 lda, b, ldb, beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-          float beta, sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_ssyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                 beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-          std::int64_t lda, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_dsyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                 beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<float> alpha,
-          sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-          sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_csyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                 beta, c, ldc);
-}
-
-void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-          std::int64_t n, std::int64_t k, std::complex<double> alpha,
-          sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-          sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zsyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda,
-                                                 beta, c, ldc);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer<float, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_ssyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                               alpha, a, lda, stride_a, beta, c,
-                                                               ldc, stride_c, batch_size);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer<double, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_dsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                               alpha, a, lda, stride_a, beta, c,
-                                                               ldc, stride_c, batch_size);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_csyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                               alpha, a, lda, stride_a, beta, c,
-                                                               ldc, stride_c, batch_size);
-}
-
-void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k,
-                                                               alpha, a, lda, stride_a, beta, c,
-                                                               ldc, stride_c, batch_size);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-           sycl::buffer<float, 1> &b, std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-           std::int64_t ldc) {
-    function_tables[libkey].row_major_ssyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b,
-                                                  ldb, beta, c, ldc);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-           std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-           sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_dsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b,
-                                                  ldb, beta, c, ldc);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_csyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b,
-                                                  ldb, beta, c, ldc);
-}
-
-void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-           std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b,
-                                                  ldb, beta, c, ldc);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].row_major_strmm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].row_major_dtrmm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_ctrmm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_ztrmm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-          sycl::buffer<float, 1> &a, std::int64_t lda, sycl::buffer<float, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].row_major_strsm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-          sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-          std::int64_t ldb) {
-    function_tables[libkey].row_major_dtrsm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_ctrsm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-          transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-          std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-          sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_ztrsm_sycl(queue, left_right, upper_lower, trans, unit_diag,
-                                                 m, n, alpha, a, lda, b, ldb);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_sgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_dgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_cgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c,
-                std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_hgemm_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<sycl::half, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<sycl::half, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_gemm_f16f16f32_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_gemm_s8s8f32_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-                sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                float beta, sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc,
-                std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_gemm_s8s8s32_batch_strided_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_strsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                std::int64_t batch_size) {
-    function_tables[libkey].row_major_dtrsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].row_major_ctrsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-                std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].row_major_ztrsm_batch_strided_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-           std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-           sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_sgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha,
-                                                  a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, double alpha,
-           sycl::buffer<double, 1> &a, std::int64_t lda, sycl::buffer<double, 1> &b,
-           std::int64_t ldb, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_dgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha,
-                                                  a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-           sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::complex<float> beta,
-           sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_cgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha,
-                                                  a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa,
-           transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-           sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::complex<double> beta,
-           sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha,
-                                                  a, lda, b, ldb, beta, c, ldc);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-               std::int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-               std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].row_major_gemm_s8u8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-               std::int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c,
-               std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].row_major_gemm_s8s8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<int8_t, 1> &b, std::int64_t ldb, int8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].row_major_gemm_u8s8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-               offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-               sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao,
-               sycl::buffer<uint8_t, 1> &b, std::int64_t ldb, uint8_t bo, float beta,
-               sycl::buffer<int32_t, 1> &c, std::int64_t ldc, sycl::buffer<int32_t, 1> &co) {
-    function_tables[libkey].row_major_gemm_u8u8s32_bias_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].row_major_somatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                    std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].row_major_domatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].row_major_comatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                    std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b,
-                    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zomatcopy_batch_strided_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].row_major_simatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-                    std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].row_major_dimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<float> alpha,
-                    sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].row_major_cimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, stride, batch_size);
-}
-
-void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                    std::int64_t n, std::complex<double> alpha,
-                    sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda, std::int64_t ldb,
-                    std::int64_t stride, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab,
-                                                                   lda, ldb, stride, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                   sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a, float beta,
-                   sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                   sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                   std::int64_t batch_size) {
-    function_tables[libkey].row_major_somatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                   sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a, double beta,
-                   sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-                   sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-                   std::int64_t batch_size) {
-    function_tables[libkey].row_major_domatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-                   std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b,
-                   std::int64_t ldb, std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_comatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                   transpose transb, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                   std::int64_t stride_a, std::complex<double> beta,
-                   sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                   std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c,
-                   std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) {
-    function_tables[libkey].row_major_zomatadd_batch_strided_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-              sycl::buffer<float, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_somatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-              sycl::buffer<double, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_domatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_comatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-              std::int64_t lda, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb) {
-    function_tables[libkey].row_major_zomatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<float, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    function_tables[libkey].row_major_somatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b,
-                                                      ldb, strideb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-               std::int64_t stridea, sycl::buffer<double, 1> &b, std::int64_t ldb,
-               std::int64_t strideb) {
-    function_tables[libkey].row_major_domatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b,
-                                                      ldb, strideb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    function_tables[libkey].row_major_comatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b,
-                                                      ldb, strideb);
-}
-
-void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-               std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a,
-               std::int64_t lda, std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &b,
-               std::int64_t ldb, std::int64_t strideb) {
-    function_tables[libkey].row_major_zomatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b,
-                                                      ldb, strideb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, float alpha, sycl::buffer<float, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    function_tables[libkey].row_major_simatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, double alpha, sycl::buffer<double, 1> &ab, std::int64_t lda,
-              std::int64_t ldb) {
-    function_tables[libkey].row_major_dimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    function_tables[libkey].row_major_cimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-              std::int64_t n, std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &ab,
-              std::int64_t lda, std::int64_t ldb) {
-    function_tables[libkey].row_major_zimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-             std::int64_t lda, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-             sycl::buffer<float, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_somatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                    beta, b, ldb, c, ldc);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-             std::int64_t lda, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-             sycl::buffer<double, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_domatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                    beta, b, ldb, c, ldc);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<float> alpha,
-             sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::complex<float> beta,
-             sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_comatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                    beta, b, ldb, c, ldc);
-}
-
-void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-             std::int64_t m, std::int64_t n, std::complex<double> alpha,
-             sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::complex<double> beta,
-             sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-             sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc) {
-    function_tables[libkey].row_major_zomatadd_sycl(queue, transa, transb, m, n, alpha, a, lda,
-                                                    beta, b, ldb, c, ldc);
-}
-
-// USM APIs
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_scasum_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dzasum_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sasum_usm_sycl(queue, n, x, incx, result,
-                                                            dependencies);
-}
-
-sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dasum_usm_sycl(queue, n, x, incx, result,
-                                                            dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                 const float *x, std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_saxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                 const double *x, std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_daxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_caxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zaxpy_usm_sycl(queue, n, alpha, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       float *alpha, const float **x, std::int64_t *incx, float **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_saxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       double *alpha, const double **x, std::int64_t *incx, double **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_daxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       std::complex<float> *alpha, const std::complex<float> **x,
-                       std::int64_t *incx, std::complex<float> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_caxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       std::complex<double> *alpha, const std::complex<double> **x,
-                       std::int64_t *incx, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zaxpy_batch_group_usm_sycl(
-        queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                       const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_saxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_daxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<float> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_caxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                       std::int64_t stridex, std::complex<double> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zaxpy_batch_strided_usm_sycl(
-        queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                  const float *x, std::int64_t incx, const float beta, float *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_saxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                             incy, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                  const double *x, std::int64_t incx, const double beta, double *y,
-                  std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_daxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                             incy, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                  const std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_caxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                             incy, dependencies);
-}
-
-sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                  const std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zaxpby_usm_sycl(queue, n, alpha, x, incx, beta, y,
-                                                             incy, dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                 std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_scopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dcopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ccopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zcopy_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const float **x, std::int64_t *incx, float **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_scopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const double **x, std::int64_t *incx, double **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dcopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ccopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                       const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
-                       std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zcopy_batch_group_usm_sycl(
-        queue, n, x, incx, y, incy, group_count, group_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const float *x, std::int64_t incx, std::int64_t stridex, float *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_scopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const double *x, std::int64_t incx, std::int64_t stridex, double *y,
-                       std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dcopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ccopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zcopy_batch_strided_usm_sycl(
-        queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies);
-}
-
-sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                std::int64_t incx, const float *y, std::int64_t incy, float *result,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sdot_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                           dependencies);
-}
-
-sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                std::int64_t incx, const double *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ddot_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                           dependencies);
-}
-
-sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                std::int64_t incx, const float *y, std::int64_t incy, double *result,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsdot_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                            dependencies);
-}
-
-sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cdotc_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                            dependencies);
-}
-
-sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zdotc_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                            dependencies);
-}
-
-sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, const std::complex<float> *y,
-                 std::int64_t incy, std::complex<float> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cdotu_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                            dependencies);
-}
-
-sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, const std::complex<double> *y,
-                 std::int64_t incy, std::complex<double> *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zdotu_usm_sycl(queue, n, x, incx, y, incy, result,
-                                                            dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_isamin_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_idamin_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_icamin_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_izamin_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_isamax_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                  std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_idamax_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<float> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_icamax_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  const std::complex<double> *x, std::int64_t incx, std::int64_t *result,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_izamax_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<float> *x, std::int64_t incx, float *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_scnrm2_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 const std::complex<double> *x, std::int64_t incx, double *result,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dznrm2_usm_sycl(queue, n, x, incx, result,
-                                                             dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x,
-                 std::int64_t incx, float *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_snrm2_usm_sycl(queue, n, x, incx, result,
-                                                            dependencies);
-}
-
-sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x,
-                 std::int64_t incx, double *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dnrm2_usm_sycl(queue, n, x, incx, result,
-                                                            dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                std::int64_t incy, float c, float s, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_srot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                           dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_drot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                           dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x,
-                std::int64_t incx, float *y, std::int64_t incy, float c, float s,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_csrot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                            dependencies);
-}
-
-sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x,
-                std::int64_t incx, double *y, std::int64_t incy, double c, double s,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zdrot_usm_sycl(queue, n, x, incx, y, incy, c, s,
-                                                            dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, float *a, float *b, float *c,
-                 float *s, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_srotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, double *a, double *b, double *c,
-                 double *s, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_drotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex<float> *a,
-                 std::complex<float> *b, float *c, std::complex<float> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_crotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex<double> *a,
-                 std::complex<double> *b, double *c, std::complex<double> *s,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zrotg_usm_sycl(queue, a, b, c, s, dependencies);
-}
-
-sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x,
-                 std::int64_t incx, float *y, std::int64_t incy, float *param,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_srotm_usm_sycl(queue, n, x, incx, y, incy, param,
-                                                            dependencies);
-}
-
-sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x,
-                 std::int64_t incx, double *y, std::int64_t incy, double *param,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_drotm_usm_sycl(queue, n, x, incx, y, incy, param,
-                                                            dependencies);
-}
-
-sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, float *d1, float *d2, float *x1,
-                  float y1, float *param, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_srotmg_usm_sycl(queue, d1, d2, x1, y1, param,
-                                                             dependencies);
-}
-
-sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, double *d1, double *d2,
-                  double *x1, double y1, double *param,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_drotmg_usm_sycl(queue, d1, d2, x1, y1, param,
-                                                             dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<float> alpha, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<double> alpha, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_csscal_usm_sycl(queue, n, alpha, x, incx,
-                                                             dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zscal_usm_sycl(queue, n, alpha, x, incx, dependencies);
-}
-
-sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zdscal_usm_sycl(queue, n, alpha, x, incx,
-                                                             dependencies);
-}
-
-sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb,
-                   const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                   float *result, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sdsdot_usm_sycl(queue, n, sb, x, incx, y, incy, result,
-                                                             dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x,
-                 std::int64_t incx, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x,
-                 std::int64_t incx, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<float> *x, std::int64_t incx, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 std::complex<double> *x, std::int64_t incx, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zswap_usm_sycl(queue, n, x, incx, y, incy,
-                                                            dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a,
-                 std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double *a,
-                 std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *x,
-                 std::int64_t incx, std::complex<float> beta, std::complex<float> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *x,
-                 std::int64_t incx, std::complex<double> beta, std::complex<double> *y,
-                 std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgbmv_usm_sycl(
-        queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m,
-                 std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, float alpha, const float *a,
-                       std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float beta, float *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, double alpha, const double *a,
-                       std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double beta, double *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                       std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                       const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-                       std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                       std::int64_t stridey, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgemv_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey,
-        batch_size, dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                       std::int64_t *lda, const float **x, std::int64_t *incx, float *beta,
-                       float **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                       std::int64_t *lda, const double **x, std::int64_t *incx, double *beta,
-                       double **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                       const std::complex<float> **a, std::int64_t *lda,
-                       const std::complex<float> **x, std::int64_t *incx, std::complex<float> *beta,
-                       std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                       std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                       const std::complex<double> **a, std::int64_t *lda,
-                       const std::complex<double> **x, std::int64_t *incx,
-                       std::complex<double> *beta, std::complex<double> **y, std::int64_t *incy,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgemv_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const float *a, std::int64_t lda,
-                       std::int64_t stridea, const float *x, std::int64_t incx,
-                       std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sdgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const double *a, std::int64_t lda,
-                       std::int64_t stridea, const double *x, std::int64_t incx,
-                       std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ddgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<float> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cdgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       std::int64_t m, std::int64_t n, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stridea, const std::complex<double> *x,
-                       std::int64_t incx, std::int64_t stridex, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zdgmm_batch_strided_usm_sycl(
-        queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size,
-        dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda,
-                       const float **x, std::int64_t *incx, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sdgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda,
-                       const double **x, std::int64_t *incx, double **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ddgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const std::complex<float> **a,
-                       std::int64_t *lda, const std::complex<float> **x, std::int64_t *incx,
-                       std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cdgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       std::int64_t *m, std::int64_t *n, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **x, std::int64_t *incx,
-                       std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zdgmm_batch_group_usm_sycl(
-        queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies);
-}
-
-sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                float *a, std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a,
-                                                           lda, dependencies);
-}
-
-sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, const double *y,
-                std::int64_t incy, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a,
-                                                           lda, dependencies);
-}
-
-sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a,
-                                                            lda, dependencies);
-}
-
-sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a,
-                                                            lda, dependencies);
-}
-
-sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a,
-                                                            lda, dependencies);
-}
-
-sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a,
-                                                            lda, dependencies);
-}
-
-sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-                 std::int64_t lda, const std::complex<float> *x, std::int64_t incx,
-                 std::complex<float> beta, std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_chbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda,
-                                                            x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-                 std::int64_t lda, const std::complex<double> *x, std::int64_t incx,
-                 std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zhbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda,
-                                                            x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_chemv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zhemv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           lda, dependencies);
-}
-
-sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           lda, dependencies);
-}
-
-sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cher2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, lda, dependencies);
-}
-
-sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zher2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, lda, dependencies);
-}
-
-sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a,
-                 const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-                 std::complex<float> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_chpmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a,
-                 const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-                 std::complex<double> *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zhpmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const std::complex<float> *x, std::int64_t incx,
-                std::complex<float> *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_chpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           dependencies);
-}
-
-sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const std::complex<double> *x, std::int64_t incx,
-                std::complex<double> *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zhpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           dependencies);
-}
-
-sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *x, std::int64_t incx,
-                 const std::complex<float> *y, std::int64_t incy, std::complex<float> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_chpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, dependencies);
-}
-
-sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *x, std::int64_t incx,
-                 const std::complex<double> *y, std::int64_t incy, std::complex<double> *a,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zhpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, dependencies);
-}
-
-sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x,
-                 std::int64_t incx, float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda,
-                                                            x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda,
-                                                            x, incx, beta, y, incy, dependencies);
-}
-
-sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *a, const float *x, std::int64_t incx, float beta,
-                 float *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sspmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *a, const double *x, std::int64_t incx, double beta,
-                 double *y, std::int64_t incy, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dspmv_usm_sycl(queue, upper_lower, n, alpha, a, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, float *a,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           dependencies);
-}
-
-sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, double *a,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           dependencies);
-}
-
-sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                 float *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, dependencies);
-}
-
-sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *x, std::int64_t incx, const double *y,
-                 std::int64_t incy, double *a, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, dependencies);
-}
-
-sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx,
-                 float beta, float *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssymv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *a, std::int64_t lda, const double *x,
-                 std::int64_t incx, double beta, double *y, std::int64_t incy,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsymv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x,
-                                                            incx, beta, y, incy, dependencies);
-}
-
-sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           lda, dependencies);
-}
-
-sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda,
-                const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a,
-                                                           lda, dependencies);
-}
-
-sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy,
-                 float *a, std::int64_t lda, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, lda, dependencies);
-}
-
-sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n,
-                 double alpha, const double *x, std::int64_t incx, const double *y,
-                 std::int64_t incy, double *a, std::int64_t lda,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx,
-                                                            y, incy, a, lda, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_stbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda,
-                 float *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_stbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda,
-                 double *x, std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<float> *a,
-                 std::int64_t lda, std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, std::int64_t k, const std::complex<double> *a,
-                 std::int64_t lda, std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            k, a, lda, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_stpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_stpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_strmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_strsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x,
-                 std::int64_t incx, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 diag unit_diag, std::int64_t n, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *x, std::int64_t incx,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n,
-                                                            a, lda, x, incx, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                 std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-                 std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha,
-                 const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb,
-                 sycl::half beta, sycl::half *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_hgemm_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-                 std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_f16f16f32_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb,
-                 std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a,
-                 std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_bf16bf16f32_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_chemm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zhemm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, float alpha, const std::complex<float> *a,
-                 std::int64_t lda, float beta, std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cherk_usm_sycl(queue, upper_lower, trans, n, k, alpha,
-                                                            a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, double alpha, const std::complex<double> *a,
-                 std::int64_t lda, double beta, std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zherk_usm_sycl(queue, upper_lower, trans, n, k, alpha,
-                                                            a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, float beta, std::complex<float> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cher2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, double beta, std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zher2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                 const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                 const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                 std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_csymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                 std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                 std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zsymm_usm_sycl(
-        queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                 float beta, float *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha,
-                                                            a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                 double beta, double *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha,
-                                                            a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                 const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                 std::complex<float> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_csyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha,
-                                                            a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                 std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                 const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                 std::complex<double> *c, std::int64_t ldc,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha,
-                                                            a, lda, beta, c, ldc, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha,
-                       const float **a, std::int64_t *lda, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha,
-                       const double **a, std::int64_t *lda, double *beta, double **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_csyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower,
-                       transpose *trans, std::int64_t *n, std::int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> *beta, std::complex<double> **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zsyrk_batch_group_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size,
-        dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, float alpha, const float *a,
-                       std::int64_t lda, std::int64_t stride_a, float beta, float *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, double alpha,
-                       const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                       const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                       std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_csyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                       transpose trans, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                       const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                       std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zsyrk_batch_strided_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c,
-        batch_size, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-                  const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ssyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-                  const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dsyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                  const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-                  std::int64_t ldb, std::complex<float> beta, std::complex<float> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_csyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans,
-                  std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                  const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> beta, std::complex<double> *c,
-                  std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zsyr2k_usm_sycl(
-        queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_strmm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtrmm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctrmm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztrmm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha,
-                 const float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_strsm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha,
-                 const double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtrsm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                 std::complex<float> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctrsm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower,
-                 transpose trans, diag unit_diag, std::int64_t m, std::int64_t n,
-                 std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                 std::complex<double> *b, std::int64_t ldb,
-                 const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztrsm_usm_sycl(queue, left_right, upper_lower, trans,
-                                                            unit_diag, m, n, alpha, a, lda, b, ldb,
-                                                            dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                       std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_strsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, double alpha, const double *a, std::int64_t lda,
-                       std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtrsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<float> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctrsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right,
-                       uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m,
-                       std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-                       std::int64_t lda, std::int64_t stride_a, std::complex<double> *b,
-                       std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztrsm_batch_strided_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb,
-        stride_b, batch_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b,
-                       std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_strsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, double *alpha, const double **a, std::int64_t *lda,
-                       double **b, std::int64_t *ldb, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dtrsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-                       std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ctrsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right,
-                       uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m,
-                       std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_ztrsm_batch_group_usm_sycl(
-        queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const float **a, std::int64_t *lda, const float **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       double *alpha, const double **a, std::int64_t *lda, const double **b,
-                       std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-                       const std::complex<float> **b, std::int64_t *ldb, std::complex<float> *beta,
-                       std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-                       std::int64_t *group_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       std::complex<double> *alpha, const std::complex<double> **a,
-                       std::int64_t *lda, const std::complex<double> **b, std::int64_t *ldb,
-                       std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       sycl::half *alpha, const sycl::half **a, std::int64_t *lda,
-                       const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_hgemm_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b,
-                       std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc,
-                       std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_f16f16f32_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_s8s8f32_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa,
-                       transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                       float *alpha, const std::int8_t **a, std::int64_t *lda,
-                       const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-                       std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_s8s8s32_batch_group_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count,
-        group_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const float *a, std::int64_t lda, std::int64_t stride_a,
-                       const float *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       double alpha, const double *a, std::int64_t lda, std::int64_t stride_a,
-                       const double *b, std::int64_t ldb, std::int64_t stride_b, double beta,
-                       double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::complex<float> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::complex<float> beta, std::complex<float> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                       std::int64_t stride_a, const std::complex<double> *b, std::int64_t ldb,
-                       std::int64_t stride_b, std::complex<double> beta, std::complex<double> *c,
-                       std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       sycl::half alpha, const sycl::half *a, std::int64_t lda,
-                       std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-                       std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-                       std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_hgemm_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a,
-                       const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_f16f16f32_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                       const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_s8s8f32_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                       transpose transb, std::int64_t m, std::int64_t n, std::int64_t k,
-                       float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a,
-                       const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta,
-                       std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-                       std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_s8s8s32_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k, float alpha,
-                  const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta,
-                  float *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_sgemmt_usm_sycl(queue, upper_lower, transa, transb, n,
-                                                             k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                             dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha,
-                  const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta,
-                  double *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dgemmt_usm_sycl(queue, upper_lower, transa, transb, n,
-                                                             k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                             dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                  std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                  const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-                  std::complex<float> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cgemmt_usm_sycl(queue, upper_lower, transa, transb, n,
-                                                             k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                             dependencies);
-}
-
-sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower,
-                  transpose transa, transpose transb, std::int64_t n, std::int64_t k,
-                  std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-                  const std::complex<double> *b, std::int64_t ldb, std::complex<double> beta,
-                  std::complex<double> *c, std::int64_t ldc,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zgemmt_usm_sycl(queue, upper_lower, transa, transb, n,
-                                                             k, alpha, a, lda, b, ldb, beta, c, ldc,
-                                                             dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                      std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_s8u8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda,
-                      std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_s8s8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                      std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_u8s8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                      transpose transb, offset offsetc, std::int64_t m, std::int64_t n,
-                      std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda,
-                      std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo,
-                      float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_gemm_u8u8s32_bias_usm_sycl(
-        queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co,
-        dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, const float *a,
-                           std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_somatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, const double *a,
-                           std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb,
-                           std::int64_t stride_b, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_domatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_comatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                           std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zomatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                           std::int64_t ldb, std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_simatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, double alpha, double *ab,
-                           std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-                           std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dimatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                           std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cimatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                           std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                           std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                           std::int64_t stride, std::int64_t batch_size,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zimatcopy_batch_strided_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, float alpha,
-                          const float *a, std::int64_t lda, std::int64_t stride_a, float beta,
-                          const float *b, std::int64_t ldb, std::int64_t stride_b, float *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_somatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n, double alpha,
-                          const double *a, std::int64_t lda, std::int64_t stride_a, double beta,
-                          const double *b, std::int64_t ldb, std::int64_t stride_b, double *c,
-                          std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-                          const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_domatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-                          std::int64_t stride_a, std::complex<float> beta,
-                          const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_comatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                          transpose transb, std::int64_t m, std::int64_t n,
-                          std::complex<double> alpha, const std::complex<double> *a,
-                          std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-                          const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                          std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-                          std::int64_t batch_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zomatadd_batch_strided_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc,
-        stride_c, batch_size, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                     float *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_somatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                b, ldb, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, double alpha, const double *a,
-                     std::int64_t lda, double *b, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_domatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                b, ldb, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     const std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_comatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                b, ldb, dependencies);
-}
-
-sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     const std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zomatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda,
-                                                                b, ldb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-                      std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_somatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, double alpha, const double *a,
-                      std::int64_t lda, std::int64_t stridea, double *b, std::int64_t ldb,
-                      std::int64_t strideb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_domatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                      const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-                      std::complex<float> *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_comatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                      std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                      const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-                      std::complex<double> *b, std::int64_t ldb, std::int64_t strideb,
-                      const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zomatcopy2_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_simatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda,
-                                                                ldb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, double alpha, double *ab, std::int64_t lda,
-                     std::int64_t ldb, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda,
-                                                                ldb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                     std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda,
-                                                                ldb, dependencies);
-}
-
-sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans,
-                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                     std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-                     const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda,
-                                                                ldb, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, float alpha, const float *a,
-                    std::int64_t lda, float beta, const float *b, std::int64_t ldb, float *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_somatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, double alpha, const double *a,
-                    std::int64_t lda, double beta, const double *b, std::int64_t ldb, double *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_domatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                    const std::complex<float> *a, std::int64_t lda, std::complex<float> beta,
-                    const std::complex<float> *b, std::int64_t ldb, std::complex<float> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_comatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa,
-                    transpose transb, std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                    const std::complex<double> *a, std::int64_t lda, std::complex<double> beta,
-                    const std::complex<double> *b, std::int64_t ldb, std::complex<double> *c,
-                    std::int64_t ldc, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zomatadd_usm_sycl(
-        queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, const float **a,
-                           std::int64_t *lda, float **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_somatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, const double **a,
-                           std::int64_t *lda, double **b, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_domatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           const std::complex<float> **a, std::int64_t *lda,
-                           std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_comatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           const std::complex<double> **a, std::int64_t *lda,
-                           std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zomatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, float *alpha, float **ab,
-                           std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_simatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, double *alpha, double **ab,
-                           std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-                           std::int64_t *groupsize, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_dimatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<float> *alpha,
-                           std::complex<float> **ab, std::int64_t *lda, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_cimatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans,
-                           std::int64_t *m, std::int64_t *n, std::complex<double> *alpha,
-                           std::complex<double> **ab, std::int64_t *lda, std::int64_t *ldb,
-                           std::int64_t group_count, std::int64_t *groupsize,
-                           const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].row_major_zimatcopy_batch_group_usm_sycl(
-        queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies);
-}
-
-} //namespace detail
-} //namespace row_major
-} //namespace blas
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/blas/function_table.hpp b/src/blas/function_table.hpp
deleted file mode 100644
index a242fd0c0..000000000
--- a/src/blas/function_table.hpp
+++ /dev/null
@@ -1,4974 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _BLAS_FUNCTION_TABLE_HPP_
-#define _BLAS_FUNCTION_TABLE_HPP_
-
-#include <complex>
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl/types.hpp"
-
-typedef struct {
-    int version;
-
-    // Buffer APIs
-
-    void (*column_major_scasum_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<float, 1> &result);
-    void (*column_major_dzasum_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<double, 1> &result);
-    void (*column_major_sasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &result);
-    void (*column_major_dasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &result);
-    void (*column_major_saxpy_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx,
-                                    sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*column_major_daxpy_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx,
-                                    sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*column_major_caxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zaxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_saxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                                  sycl::buffer<float, 1> &x, std::int64_t incx,
-                                                  std::int64_t stridex, sycl::buffer<float, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_daxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                                  sycl::buffer<double, 1> &x, std::int64_t incx,
-                                                  std::int64_t stridex, sycl::buffer<double, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_caxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                                  std::complex<float> alpha,
-                                                  sycl::buffer<std::complex<float>, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  sycl::buffer<std::complex<float>, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_zaxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                                  std::complex<double> alpha,
-                                                  sycl::buffer<std::complex<double>, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  sycl::buffer<std::complex<double>, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_saxpby_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                     sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                                     sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*column_major_daxpby_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                     sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                     sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*column_major_caxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                                     sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                     std::complex<float> beta,
-                                     sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zaxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                                     sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                     std::complex<double> beta,
-                                     sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_scopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &y,
-                                    std::int64_t incy);
-    void (*column_major_dcopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &y,
-                                    std::int64_t incy);
-    void (*column_major_ccopy_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zcopy_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_scopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                                  sycl::buffer<float, 1> &x, std::int64_t incx,
-                                                  std::int64_t stridex, sycl::buffer<float, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_dcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                                  sycl::buffer<double, 1> &x, std::int64_t incx,
-                                                  std::int64_t stridex, sycl::buffer<double, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_ccopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                                  sycl::buffer<std::complex<float>, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  sycl::buffer<std::complex<float>, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_zcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                                  sycl::buffer<std::complex<double>, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  sycl::buffer<std::complex<double>, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_sdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                   std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                   sycl::buffer<float, 1> &result);
-    void (*column_major_ddot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                   std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                                   sycl::buffer<double, 1> &result);
-    void (*column_major_dsdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                    sycl::buffer<double, 1> &result);
-    void (*column_major_cdotc_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<float>, 1> &result);
-    void (*column_major_zdotc_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<double>, 1> &result);
-    void (*column_major_cdotu_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<float>, 1> &result);
-    void (*column_major_zdotu_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<double>, 1> &result);
-    void (*column_major_isamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                     std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_idamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                     std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_icamin_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_izamin_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_isamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                     std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_idamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                     std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_icamax_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_izamax_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<std::int64_t, 1> &result);
-    void (*column_major_scnrm2_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<float, 1> &result);
-    void (*column_major_dznrm2_sycl)(sycl::queue &queue, std::int64_t n,
-                                     sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                     sycl::buffer<double, 1> &result);
-    void (*column_major_snrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &result);
-    void (*column_major_dnrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &result);
-    void (*column_major_srot_sycl)(sycl::queue &queue, std::int64_t n,
-                                   sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                   sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                   float c, float s);
-    void (*column_major_drot_sycl)(sycl::queue &queue, std::int64_t n,
-                                   sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                   sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                   double c, double s);
-    void (*column_major_csrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                    float c, float s);
-    void (*column_major_zdrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &y,
-                                    std::int64_t incy, double c, double s);
-    void (*column_major_srotg_sycl)(sycl::queue &queue, sycl::buffer<float, 1> &a,
-                                    sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-                                    sycl::buffer<float, 1> &s);
-    void (*column_major_drotg_sycl)(sycl::queue &queue, sycl::buffer<double, 1> &a,
-                                    sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-                                    sycl::buffer<double, 1> &s);
-    void (*column_major_crotg_sycl)(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-                                    sycl::buffer<std::complex<float>, 1> &b,
-                                    sycl::buffer<float, 1> &c,
-                                    sycl::buffer<std::complex<float>, 1> &s);
-    void (*column_major_zrotg_sycl)(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-                                    sycl::buffer<std::complex<double>, 1> &b,
-                                    sycl::buffer<double, 1> &c,
-                                    sycl::buffer<std::complex<double>, 1> &s);
-    void (*column_major_srotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                    sycl::buffer<float, 1> &param);
-    void (*column_major_drotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &y,
-                                    std::int64_t incy, sycl::buffer<double, 1> &param);
-    void (*column_major_srotmg_sycl)(sycl::queue &queue, sycl::buffer<float, 1> &d1,
-                                     sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1,
-                                     float y1, sycl::buffer<float, 1> &param);
-    void (*column_major_drotmg_sycl)(sycl::queue &queue, sycl::buffer<double, 1> &d1,
-                                     sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1,
-                                     double y1, sycl::buffer<double, 1> &param);
-    void (*column_major_sscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*column_major_dscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*column_major_cscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*column_major_csscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                                     sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*column_major_zscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*column_major_zdscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                     sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*column_major_sdsdot_sycl)(sycl::queue &queue, std::int64_t n, float sb,
-                                     sycl::buffer<float, 1> &x, std::int64_t incx,
-                                     sycl::buffer<float, 1> &y, std::int64_t incy,
-                                     sycl::buffer<float, 1> &result);
-    void (*column_major_sswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &y,
-                                    std::int64_t incy);
-    void (*column_major_dswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &y,
-                                    std::int64_t incy);
-    void (*column_major_cswap_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zswap_sycl)(sycl::queue &queue, std::int64_t n,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_sgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::int64_t kl,
-                                    std::int64_t ku, float alpha, sycl::buffer<float, 1> &a,
-                                    std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                    float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*column_major_dgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::int64_t kl,
-                                    std::int64_t ku, double alpha, sycl::buffer<double, 1> &a,
-                                    std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                    double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*column_major_cgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::int64_t kl,
-                                    std::int64_t ku, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::int64_t kl,
-                                    std::int64_t ku, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_sgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, float alpha,
-                                    sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                                    sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*column_major_dgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, double alpha,
-                                    sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                    sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*column_major_cgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                    std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_sgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                  std::int64_t m, std::int64_t n, float alpha,
-                                                  sycl::buffer<float, 1> &a, std::int64_t lda,
-                                                  std::int64_t stridea, sycl::buffer<float, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  float beta, sycl::buffer<float, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_dgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                  std::int64_t m, std::int64_t n, double alpha,
-                                                  sycl::buffer<double, 1> &a, std::int64_t lda,
-                                                  std::int64_t stridea, sycl::buffer<double, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  double beta, sycl::buffer<double, 1> &y,
-                                                  std::int64_t incy, std::int64_t stridey,
-                                                  std::int64_t batch_size);
-    void (*column_major_cgemv_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-        std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-        std::int64_t stridex, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-        std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-    void (*column_major_zgemv_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-        std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-        std::int64_t stridex, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-        std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-    void (*column_major_sdgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                                  std::int64_t m, std::int64_t n,
-                                                  sycl::buffer<float, 1> &a, std::int64_t lda,
-                                                  std::int64_t stridea, sycl::buffer<float, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                                  std::int64_t stridec, std::int64_t batch_size);
-    void (*column_major_ddgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                                  std::int64_t m, std::int64_t n,
-                                                  sycl::buffer<double, 1> &a, std::int64_t lda,
-                                                  std::int64_t stridea, sycl::buffer<double, 1> &x,
-                                                  std::int64_t incx, std::int64_t stridex,
-                                                  sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                                  std::int64_t stridec, std::int64_t batch_size);
-    void (*column_major_cdgmm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size);
-    void (*column_major_zdgmm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stridea,
-        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size);
-    void (*column_major_sger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                                   sycl::buffer<float, 1> &x, std::int64_t incx,
-                                   sycl::buffer<float, 1> &y, std::int64_t incy,
-                                   sycl::buffer<float, 1> &a, std::int64_t lda);
-    void (*column_major_dger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                                   sycl::buffer<double, 1> &x, std::int64_t incx,
-                                   sycl::buffer<double, 1> &y, std::int64_t incy,
-                                   sycl::buffer<double, 1> &a, std::int64_t lda);
-    void (*column_major_cgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-    void (*column_major_zgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-    void (*column_major_cgeru_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-    void (*column_major_zgeru_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-    void (*column_major_chbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zhbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_chemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zhemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_cher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha,
-                                   sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                   sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-    void (*column_major_zher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha,
-                                   sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                   sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-    void (*column_major_cher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda);
-    void (*column_major_zher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-    void (*column_major_chpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*column_major_zhpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*column_major_chpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha,
-                                   sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                   sycl::buffer<std::complex<float>, 1> &a);
-    void (*column_major_zhpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha,
-                                   sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                   sycl::buffer<std::complex<double>, 1> &a);
-    void (*column_major_chpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<float>, 1> &a);
-    void (*column_major_zhpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                    sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                    sycl::buffer<std::complex<double>, 1> &a);
-    void (*column_major_ssbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::int64_t k, float alpha,
-                                    sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                                    sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*column_major_dsbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, std::int64_t k, double alpha,
-                                    sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                    sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*column_major_sspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                                    sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*column_major_dspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                    sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*column_major_sspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                                   std::int64_t incx, sycl::buffer<float, 1> &a);
-    void (*column_major_dspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                                   std::int64_t incx, sycl::buffer<double, 1> &a);
-    void (*column_major_sspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                    sycl::buffer<float, 1> &a);
-    void (*column_major_dspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &y,
-                                    std::int64_t incy, sycl::buffer<double, 1> &a);
-    void (*column_major_ssymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                                    std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                    float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*column_major_dsymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                                    std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                    double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*column_major_ssyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                                   std::int64_t incx, sycl::buffer<float, 1> &a, std::int64_t lda);
-    void (*column_major_dsyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                   std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                                   std::int64_t incx, sycl::buffer<double, 1> &a, std::int64_t lda);
-    void (*column_major_ssyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, float alpha, sycl::buffer<float, 1> &x,
-                                    std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                    sycl::buffer<float, 1> &a, std::int64_t lda);
-    void (*column_major_dsyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    std::int64_t n, double alpha, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx, sycl::buffer<double, 1> &y,
-                                    std::int64_t incy, sycl::buffer<double, 1> &a,
-                                    std::int64_t lda);
-    void (*column_major_stbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-                                    std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*column_major_dtbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-                                    std::int64_t lda, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx);
-    void (*column_major_ctbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*column_major_ztbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*column_major_stbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-                                    std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*column_major_dtbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-                                    std::int64_t lda, sycl::buffer<double, 1> &x,
-                                    std::int64_t incx);
-    void (*column_major_ctbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*column_major_ztbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, std::int64_t k,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*column_major_stpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<float, 1> &a,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*column_major_dtpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<double, 1> &a,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*column_major_ctpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*column_major_ztpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*column_major_stpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<float, 1> &a,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*column_major_dtpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<double, 1> &a,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*column_major_ctpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                    sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*column_major_ztpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                    sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*column_major_strmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*column_major_dtrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*column_major_ctrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                    std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                                    std::int64_t incx);
-    void (*column_major_ztrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                    std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                                    std::int64_t incx);
-    void (*column_major_strsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*column_major_dtrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*column_major_ctrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                    std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                                    std::int64_t incx);
-    void (*column_major_ztrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                    std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                    std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                                    std::int64_t incx);
-    void (*column_major_sgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                                    std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                    float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*column_major_dgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                                    std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                                    double beta, sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*column_major_cgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    std::int64_t k, std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    std::int64_t k, std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*column_major_hgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    std::int64_t k, sycl::half alpha,
-                                    sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                                    sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                                    sycl::half beta, sycl::buffer<sycl::half, 1> &c,
-                                    std::int64_t ldc);
-    void (*column_major_gemm_f16f16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                             oneapi::mkl::transpose transb, std::int64_t m,
-                                             std::int64_t n, std::int64_t k, float alpha,
-                                             sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                                             sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                                             float beta, sycl::buffer<float, 1> &c,
-                                             std::int64_t ldc);
-    void (*column_major_gemm_bf16bf16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::int64_t k, float alpha,
-                                               sycl::buffer<oneapi::mkl::bfloat16, 1> &a,
-                                               std::int64_t lda,
-                                               sycl::buffer<oneapi::mkl::bfloat16, 1> &b,
-                                               std::int64_t ldb, float beta,
-                                               sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*column_major_chemm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zhemm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*column_major_cherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    float alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                    std::int64_t lda, float beta,
-                                    sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    double alpha, sycl::buffer<std::complex<double>, 1> &a,
-                                    std::int64_t lda, double beta,
-                                    sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*column_major_cher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                     std::complex<float> alpha,
-                                     sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                     float beta, sycl::buffer<std::complex<float>, 1> &c,
-                                     std::int64_t ldc);
-    void (*column_major_zher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                     std::complex<double> alpha,
-                                     sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                     double beta, sycl::buffer<std::complex<double>, 1> &c,
-                                     std::int64_t ldc);
-    void (*column_major_ssymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                    float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                                    sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*column_major_dsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                    double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                                    sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*column_major_csymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*column_major_ssyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*column_major_dsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    double beta, sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*column_major_csyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                    oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*column_major_ssyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                                  oneapi::mkl::transpose trans, std::int64_t n,
-                                                  std::int64_t k, float alpha,
-                                                  sycl::buffer<float, 1> &a, std::int64_t lda,
-                                                  std::int64_t stride_a, float beta,
-                                                  sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                                  std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_dsyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                                  oneapi::mkl::transpose trans, std::int64_t n,
-                                                  std::int64_t k, double alpha,
-                                                  sycl::buffer<double, 1> &a, std::int64_t lda,
-                                                  std::int64_t stride_a, double beta,
-                                                  sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                                  std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_csyrk_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_zsyrk_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_ssyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                     float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                     sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                                     sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*column_major_dsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                     double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                     sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                                     sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*column_major_csyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                     std::complex<float> alpha,
-                                     sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                     std::complex<float> beta,
-                                     sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                     std::complex<double> alpha,
-                                     sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                     std::complex<double> beta,
-                                     sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*column_major_strmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    sycl::buffer<float, 1> &b, std::int64_t ldb);
-    void (*column_major_dtrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    sycl::buffer<double, 1> &b, std::int64_t ldb);
-    void (*column_major_ctrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-    void (*column_major_ztrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-    void (*column_major_strsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    sycl::buffer<float, 1> &b, std::int64_t ldb);
-    void (*column_major_dtrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    sycl::buffer<double, 1> &b, std::int64_t ldb);
-    void (*column_major_ctrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-    void (*column_major_ztrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                    oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                    oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-    void (*column_major_sgemm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-        std::int64_t lda, std::int64_t stride_a, sycl::buffer<float, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_dgemm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-        std::int64_t lda, std::int64_t stride_a, sycl::buffer<double, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, double beta, sycl::buffer<double, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_cgemm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_zgemm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_hgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                                  oneapi::mkl::transpose transb, std::int64_t m,
-                                                  std::int64_t n, std::int64_t k, sycl::half alpha,
-                                                  sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                                                  std::int64_t stride_a,
-                                                  sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                                                  std::int64_t stride_b, sycl::half beta,
-                                                  sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                                                  std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_gemm_f16f16f32_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-        std::int64_t lda, std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_gemm_s8s8f32_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-        sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*column_major_gemm_s8s8s32_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-        sycl::buffer<int32_t, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*column_major_strsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size);
-    void (*column_major_dtrsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size);
-    void (*column_major_ctrsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*column_major_ztrsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*column_major_sgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                     std::int64_t n, std::int64_t k, float alpha,
-                                     sycl::buffer<float, 1> &a, std::int64_t lda,
-                                     sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                                     sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*column_major_dgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                     std::int64_t n, std::int64_t k, double alpha,
-                                     sycl::buffer<double, 1> &a, std::int64_t lda,
-                                     sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                                     sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*column_major_cgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                     std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                     sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                     std::complex<float> beta,
-                                     sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                     oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                     std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                     sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                     std::complex<double> beta,
-                                     sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*column_major_gemm_s8u8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-        std::int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*column_major_gemm_s8s8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-        std::int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*column_major_gemm_u8s8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao, sycl::buffer<int8_t, 1> &b,
-        std::int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*column_major_gemm_u8u8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao, sycl::buffer<uint8_t, 1> &b,
-        std::int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*column_major_somatcopy_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size);
-    void (*column_major_domatcopy_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size);
-    void (*column_major_comatcopy_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*column_major_zomatcopy_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*column_major_simatcopy_batch_strided_sycl)(sycl::queue &queue,
-                                                      oneapi::mkl::transpose trans, std::int64_t m,
-                                                      std::int64_t n, float alpha,
-                                                      sycl::buffer<float, 1> &ab, std::int64_t lda,
-                                                      std::int64_t ldb, std::int64_t stride,
-                                                      std::int64_t batch_size);
-    void (*column_major_dimatcopy_batch_strided_sycl)(sycl::queue &queue,
-                                                      oneapi::mkl::transpose trans, std::int64_t m,
-                                                      std::int64_t n, double alpha,
-                                                      sycl::buffer<double, 1> &ab, std::int64_t lda,
-                                                      std::int64_t ldb, std::int64_t stride,
-                                                      std::int64_t batch_size);
-    void (*column_major_cimatcopy_batch_strided_sycl)(sycl::queue &queue,
-                                                      oneapi::mkl::transpose trans, std::int64_t m,
-                                                      std::int64_t n, std::complex<float> alpha,
-                                                      sycl::buffer<std::complex<float>, 1> &ab,
-                                                      std::int64_t lda, std::int64_t ldb,
-                                                      std::int64_t stride, std::int64_t batch_size);
-    void (*column_major_zimatcopy_batch_strided_sycl)(sycl::queue &queue,
-                                                      oneapi::mkl::transpose trans, std::int64_t m,
-                                                      std::int64_t n, std::complex<double> alpha,
-                                                      sycl::buffer<std::complex<double>, 1> &ab,
-                                                      std::int64_t lda, std::int64_t ldb,
-                                                      std::int64_t stride, std::int64_t batch_size);
-    void (*column_major_somatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*column_major_domatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*column_major_comatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*column_major_zomatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<double> alpha,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-
-    void (*column_major_somatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, float alpha,
-                                        sycl::buffer<float, 1> &a, std::int64_t lda,
-                                        sycl::buffer<float, 1> &b, std::int64_t ldb);
-    void (*column_major_domatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, double alpha,
-                                        sycl::buffer<double, 1> &a, std::int64_t lda,
-                                        sycl::buffer<double, 1> &b, std::int64_t ldb);
-    void (*column_major_comatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-    void (*column_major_zomatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-    void (*column_major_somatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, float alpha,
-                                         sycl::buffer<float, 1> &a, std::int64_t lda,
-                                         std::int64_t stridea, sycl::buffer<float, 1> &b,
-                                         std::int64_t ldb, std::int64_t strideb);
-    void (*column_major_domatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, double alpha,
-                                         sycl::buffer<double, 1> &a, std::int64_t lda,
-                                         std::int64_t stridea, sycl::buffer<double, 1> &b,
-                                         std::int64_t ldb, std::int64_t strideb);
-    void (*column_major_comatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                         sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                         std::int64_t stridea,
-                                         sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                         std::int64_t strideb);
-    void (*column_major_zomatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                         sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                         std::int64_t stridea,
-                                         sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                         std::int64_t strideb);
-    void (*column_major_simatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, float alpha,
-                                        sycl::buffer<float, 1> &ab, std::int64_t lda,
-                                        std::int64_t ldb);
-    void (*column_major_dimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, double alpha,
-                                        sycl::buffer<double, 1> &ab, std::int64_t lda,
-                                        std::int64_t ldb);
-    void (*column_major_cimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                        sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                                        std::int64_t ldb);
-    void (*column_major_zimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                        std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                        sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                                        std::int64_t ldb);
-    void (*column_major_somatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                       oneapi::mkl::transpose transb, std::int64_t m,
-                                       std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                                       std::int64_t lda, float beta, sycl::buffer<float, 1> &b,
-                                       std::int64_t ldb, sycl::buffer<float, 1> &c,
-                                       std::int64_t ldc);
-    void (*column_major_domatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                       oneapi::mkl::transpose transb, std::int64_t m,
-                                       std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                                       std::int64_t lda, double beta, sycl::buffer<double, 1> &b,
-                                       std::int64_t ldb, sycl::buffer<double, 1> &c,
-                                       std::int64_t ldc);
-    void (*column_major_comatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                       oneapi::mkl::transpose transb, std::int64_t m,
-                                       std::int64_t n, std::complex<float> alpha,
-                                       sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                       std::complex<float> beta,
-                                       sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                       sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*column_major_zomatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                       oneapi::mkl::transpose transb, std::int64_t m,
-                                       std::int64_t n, std::complex<double> alpha,
-                                       sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                       std::complex<double> beta,
-                                       sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                       sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-    // USM APIs
-
-    sycl::event (*column_major_scasum_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<float> *x, std::int64_t incx,
-                                                float *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dzasum_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<double> *x, std::int64_t incx,
-                                                double *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                               std::int64_t incx, float *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                               std::int64_t incx, double *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_saxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                               const float *x, std::int64_t incx, float *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_daxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                               const double *x, std::int64_t incx, double *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_caxpy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               std::complex<float> alpha,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               std::complex<float> *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zaxpy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               std::complex<double> alpha,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               std::complex<double> *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_saxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
-        float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_daxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
-        double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_caxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
-        const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zaxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
-        const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_saxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
-        std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_daxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
-        std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_caxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
-        std::int64_t incx, std::int64_t stridex, std::complex<float> *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zaxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-        const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<double> *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_saxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                                const float *x, std::int64_t incx, const float beta,
-                                                float *y, std::int64_t incy,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_daxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                                const double *x, std::int64_t incx,
-                                                const double beta, double *y, std::int64_t incy,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_caxpby_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                std::complex<float> alpha,
-                                                const std::complex<float> *x, std::int64_t incx,
-                                                const std::complex<float> beta,
-                                                std::complex<float> *y, std::int64_t incy,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zaxpby_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                std::complex<double> alpha,
-                                                const std::complex<double> *x, std::int64_t incx,
-                                                const std::complex<double> beta,
-                                                std::complex<double> *y, std::int64_t incy,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_scopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                               std::int64_t incx, float *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dcopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                               std::int64_t incx, double *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ccopy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               std::complex<float> *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zcopy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               std::complex<double> *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_scopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const float **x, std::int64_t *incx, float **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const double **x, std::int64_t *incx, double **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ccopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const std::complex<float> **x, std::int64_t *incx,
-        std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const std::complex<double> **x, std::int64_t *incx,
-        std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_scopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t stridex,
-        float *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-        std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ccopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-        std::int64_t stridex, std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
-        std::int64_t stridex, std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                              std::int64_t incx, const float *y, std::int64_t incy,
-                                              float *result,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ddot_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                              std::int64_t incx, const double *y, std::int64_t incy,
-                                              double *result,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                               std::int64_t incx, const float *y, std::int64_t incy,
-                                               double *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cdotc_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               const std::complex<float> *y, std::int64_t incy,
-                                               std::complex<float> *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zdotc_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               const std::complex<double> *y, std::int64_t incy,
-                                               std::complex<double> *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cdotu_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               const std::complex<float> *y, std::int64_t incy,
-                                               std::complex<float> *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zdotu_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               const std::complex<double> *y, std::int64_t incy,
-                                               std::complex<double> *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_isamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                                std::int64_t incx, std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_idamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                                std::int64_t incx, std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_icamin_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<float> *x, std::int64_t incx,
-                                                std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_izamin_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<double> *x, std::int64_t incx,
-                                                std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_isamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                                std::int64_t incx, std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_idamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                                std::int64_t incx, std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_icamax_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<float> *x, std::int64_t incx,
-                                                std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_izamax_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<double> *x, std::int64_t incx,
-                                                std::int64_t *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_scnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<float> *x, std::int64_t incx,
-                                                float *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dznrm2_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                const std::complex<double> *x, std::int64_t incx,
-                                                double *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_snrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                               std::int64_t incx, float *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                               std::int64_t incx, double *result,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_srot_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                              std::complex<float> *x, std::int64_t incx,
-                                              std::complex<float> *y, std::int64_t incy, float c,
-                                              float s,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_drot_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                              std::complex<double> *x, std::int64_t incx,
-                                              std::complex<double> *y, std::int64_t incy, double c,
-                                              double s,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_csrot_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x,
-                                               std::int64_t incx, float *y, std::int64_t incy,
-                                               float c, float s,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zdrot_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x,
-                                               std::int64_t incx, double *y, std::int64_t incy,
-                                               double c, double s,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_srotg_usm_sycl)(sycl::queue &queue, float *a, float *b, float *c,
-                                               float *s,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_drotg_usm_sycl)(sycl::queue &queue, double *a, double *b, double *c,
-                                               double *s,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_crotg_usm_sycl)(sycl::queue &queue, std::complex<float> *a,
-                                               std::complex<float> *b, float *c,
-                                               std::complex<float> *s,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zrotg_usm_sycl)(sycl::queue &queue, std::complex<double> *a,
-                                               std::complex<double> *b, double *c,
-                                               std::complex<double> *s,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_srotm_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x,
-                                               std::int64_t incx, float *y, std::int64_t incy,
-                                               float *param,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_drotm_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x,
-                                               std::int64_t incx, double *y, std::int64_t incy,
-                                               double *param,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_srotmg_usm_sycl)(sycl::queue &queue, float *d1, float *d2, float *x1,
-                                                float y1, float *param,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_drotmg_usm_sycl)(sycl::queue &queue, double *d1, double *d2,
-                                                double *x1, double y1, double *param,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                               float *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                               double *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cscal_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               std::complex<float> alpha, std::complex<float> *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_csscal_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                                std::complex<double> alpha, std::complex<double> *x,
-                                                std::int64_t incx,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                               std::complex<float> *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zdscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                                std::complex<double> *x, std::int64_t incx,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sdsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, float sb,
-                                                const float *x, std::int64_t incx, const float *y,
-                                                std::int64_t incy, float *result,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sswap_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x,
-                                               std::int64_t incx, float *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dswap_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x,
-                                               std::int64_t incx, double *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cswap_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               std::complex<float> *x, std::int64_t incx,
-                                               std::complex<float> *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zswap_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                               std::complex<double> *x, std::int64_t incx,
-                                               std::complex<double> *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                               std::int64_t m, std::int64_t n, std::int64_t kl,
-                                               std::int64_t ku, float alpha, const float *a,
-                                               std::int64_t lda, const float *x, std::int64_t incx,
-                                               float beta, float *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                               std::int64_t m, std::int64_t n, std::int64_t kl,
-                                               std::int64_t ku, double alpha, const double *a,
-                                               std::int64_t lda, const double *x, std::int64_t incx,
-                                               double beta, double *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgbmv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::int64_t kl, std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-        std::complex<float> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                               std::int64_t m, std::int64_t n, std::int64_t kl,
-                                               std::int64_t ku, std::complex<double> alpha,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               std::complex<double> beta, std::complex<double> *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                               std::int64_t m, std::int64_t n, float alpha,
-                                               const float *a, std::int64_t lda, const float *x,
-                                               std::int64_t incx, float beta, float *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                               std::int64_t m, std::int64_t n, double alpha,
-                                               const double *a, std::int64_t lda, const double *x,
-                                               std::int64_t incx, double beta, double *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgemv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-        std::complex<float> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgemv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-        std::complex<double> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, const float *a, std::int64_t lda, std::int64_t stridea, const float *x,
-        std::int64_t incx, std::int64_t stridex, float beta, float *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, const double *a, std::int64_t lda, std::int64_t stridea, const double *x,
-        std::int64_t incx, std::int64_t stridex, double beta, double *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::int64_t stridea, const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<float> beta, std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::int64_t stridea, const std::complex<double> *x, std::int64_t incx,
-        std::int64_t stridex, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        float *alpha, const float **a, std::int64_t *lda, const float **x, std::int64_t *incx,
-        float *beta, float **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        double *alpha, const double **a, std::int64_t *lda, const double **x, std::int64_t *incx,
-        double *beta, double **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-        const std::complex<float> **x, std::int64_t *incx, std::complex<float> *beta,
-        std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-        const std::complex<double> **x, std::int64_t *incx, std::complex<double> *beta,
-        std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sdgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const float *a, std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx,
-        std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ddgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const double *a, std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx,
-        std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cdgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-        const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<float> *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zdgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-        const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<double> *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sdgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const float **a, std::int64_t *lda, const float **x, std::int64_t *incx, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ddgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const double **a, std::int64_t *lda, const double **x, std::int64_t *incx, double **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cdgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const std::complex<float> **a, std::int64_t *lda, const std::complex<float> **x,
-        std::int64_t *incx, std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zdgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **x,
-        std::int64_t *incx, std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                              float alpha, const float *x, std::int64_t incx,
-                                              const float *y, std::int64_t incy, float *a,
-                                              std::int64_t lda,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                              double alpha, const double *x, std::int64_t incx,
-                                              const double *y, std::int64_t incy, double *a,
-                                              std::int64_t lda,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                               std::complex<float> alpha,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               const std::complex<float> *y, std::int64_t incy,
-                                               std::complex<float> *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                               std::complex<double> alpha,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               const std::complex<double> *y, std::int64_t incy,
-                                               std::complex<double> *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                               std::complex<float> alpha,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               const std::complex<float> *y, std::int64_t incy,
-                                               std::complex<float> *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                               std::complex<double> alpha,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               const std::complex<double> *y, std::int64_t incy,
-                                               std::complex<double> *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_chbmv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-        std::complex<float> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zhbmv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-        std::complex<double> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_chemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<float> alpha,
-                                               const std::complex<float> *a, std::int64_t lda,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               std::complex<float> beta, std::complex<float> *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zhemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<double> alpha,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               std::complex<double> beta, std::complex<double> *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, float alpha,
-                                              const std::complex<float> *x, std::int64_t incx,
-                                              std::complex<float> *a, std::int64_t lda,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, double alpha,
-                                              const std::complex<double> *x, std::int64_t incx,
-                                              std::complex<double> *a, std::int64_t lda,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<float> alpha,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               const std::complex<float> *y, std::int64_t incy,
-                                               std::complex<float> *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<double> alpha,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               const std::complex<double> *y, std::int64_t incy,
-                                               std::complex<double> *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_chpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<float> alpha,
-                                               const std::complex<float> *a,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               std::complex<float> beta, std::complex<float> *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zhpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<double> alpha,
-                                               const std::complex<double> *a,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               std::complex<double> beta, std::complex<double> *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_chpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, float alpha,
-                                              const std::complex<float> *x, std::int64_t incx,
-                                              std::complex<float> *a,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zhpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, double alpha,
-                                              const std::complex<double> *x, std::int64_t incx,
-                                              std::complex<double> *a,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_chpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<float> alpha,
-                                               const std::complex<float> *x, std::int64_t incx,
-                                               const std::complex<float> *y, std::int64_t incy,
-                                               std::complex<float> *a,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zhpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::complex<double> alpha,
-                                               const std::complex<double> *x, std::int64_t incx,
-                                               const std::complex<double> *y, std::int64_t incy,
-                                               std::complex<double> *a,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::int64_t k, float alpha,
-                                               const float *a, std::int64_t lda, const float *x,
-                                               std::int64_t incx, float beta, float *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, std::int64_t k, double alpha,
-                                               const double *a, std::int64_t lda, const double *x,
-                                               std::int64_t incx, double beta, double *y,
-                                               std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, float alpha, const float *a,
-                                               const float *x, std::int64_t incx, float beta,
-                                               float *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, double alpha, const double *a,
-                                               const double *x, std::int64_t incx, double beta,
-                                               double *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, float alpha, const float *x,
-                                              std::int64_t incx, float *a,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, double alpha, const double *x,
-                                              std::int64_t incx, double *a,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, float alpha, const float *x,
-                                               std::int64_t incx, const float *y, std::int64_t incy,
-                                               float *a,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, double alpha, const double *x,
-                                               std::int64_t incx, const double *y,
-                                               std::int64_t incy, double *a,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, float alpha, const float *a,
-                                               std::int64_t lda, const float *x, std::int64_t incx,
-                                               float beta, float *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, double alpha, const double *a,
-                                               std::int64_t lda, const double *x, std::int64_t incx,
-                                               double beta, double *y, std::int64_t incy,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, float alpha, const float *x,
-                                              std::int64_t incx, float *a, std::int64_t lda,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                              std::int64_t n, double alpha, const double *x,
-                                              std::int64_t incx, double *a, std::int64_t lda,
-                                              const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, float alpha, const float *x,
-                                               std::int64_t incx, const float *y, std::int64_t incy,
-                                               float *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               std::int64_t n, double alpha, const double *x,
-                                               std::int64_t incx, const double *y,
-                                               std::int64_t incy, double *a, std::int64_t lda,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_stbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const float *a, std::int64_t lda,
-                                               float *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const double *a, std::int64_t lda,
-                                               double *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const std::complex<float> *a,
-                                               std::int64_t lda, std::complex<float> *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const std::complex<double> *a,
-                                               std::int64_t lda, std::complex<double> *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_stbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const float *a, std::int64_t lda,
-                                               float *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const double *a, std::int64_t lda,
-                                               double *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const std::complex<float> *a,
-                                               std::int64_t lda, std::complex<float> *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               std::int64_t k, const std::complex<double> *a,
-                                               std::int64_t lda, std::complex<double> *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_stpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const float *a, float *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const double *a, double *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<float> *a, std::complex<float> *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<double> *a,
-                                               std::complex<double> *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_stpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const float *a, float *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const double *a, double *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<float> *a, std::complex<float> *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<double> *a,
-                                               std::complex<double> *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_strmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const float *a, std::int64_t lda, float *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const double *a, std::int64_t lda, double *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<float> *a, std::int64_t lda,
-                                               std::complex<float> *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               std::complex<double> *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_strsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const float *a, std::int64_t lda, float *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const double *a, std::int64_t lda, double *x,
-                                               std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<float> *a, std::int64_t lda,
-                                               std::complex<float> *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t n,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               std::complex<double> *x, std::int64_t incx,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::int64_t k, float alpha,
-                                               const float *a, std::int64_t lda, const float *b,
-                                               std::int64_t ldb, float beta, float *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::int64_t k, double alpha,
-                                               const double *a, std::int64_t lda, const double *b,
-                                               std::int64_t ldb, double beta, double *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgemm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-        std::int64_t ldb, std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgemm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-        std::int64_t ldb, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_hgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::int64_t k, sycl::half alpha,
-                                               const sycl::half *a, std::int64_t lda,
-                                               const sycl::half *b, std::int64_t ldb,
-                                               sycl::half beta, sycl::half *c, std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_f16f16f32_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-        std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-        std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_bf16bf16f32_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const oneapi::mkl::bfloat16 *a,
-        std::int64_t lda, const oneapi::mkl::bfloat16 *b, std::int64_t ldb, float beta, float *c,
-        std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_chemm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zhemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                               std::int64_t n, std::complex<double> alpha,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               const std::complex<double> *b, std::int64_t ldb,
-                                               std::complex<double> beta, std::complex<double> *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, float alpha,
-                                               const std::complex<float> *a, std::int64_t lda,
-                                               float beta, std::complex<float> *c, std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, double alpha,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               double beta, std::complex<double> *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cher2k_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, float beta,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zher2k_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-        std::int64_t lda, const std::complex<double> *b, std::int64_t ldb, double beta,
-        std::complex<double> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                               std::int64_t n, float alpha, const float *a,
-                                               std::int64_t lda, const float *b, std::int64_t ldb,
-                                               float beta, float *c, std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                               std::int64_t n, double alpha, const double *a,
-                                               std::int64_t lda, const double *b, std::int64_t ldb,
-                                               double beta, double *c, std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_csymm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                               std::int64_t n, std::complex<double> alpha,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               const std::complex<double> *b, std::int64_t ldb,
-                                               std::complex<double> beta, std::complex<double> *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, float alpha, const float *a,
-                                               std::int64_t lda, float beta, float *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, double alpha, const double *a,
-                                               std::int64_t lda, double beta, double *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_csyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, std::complex<float> alpha,
-                                               const std::complex<float> *a, std::int64_t lda,
-                                               std::complex<float> beta, std::complex<float> *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, std::complex<double> alpha,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               std::complex<double> beta, std::complex<double> *c,
-                                               std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda,
-        float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda,
-        double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_csyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-        std::int64_t *lda, std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-        std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zsyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-        const std::complex<double> **a, std::int64_t *lda, std::complex<double> *beta,
-        std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-        std::int64_t stride_a, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-        std::int64_t stride_a, double beta, double *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_csyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<float> beta, std::complex<float> *c,
-        std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zsyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<double> beta, std::complex<double> *c,
-        std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ssyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                                oneapi::mkl::transpose trans, std::int64_t n,
-                                                std::int64_t k, float alpha, const float *a,
-                                                std::int64_t lda, const float *b, std::int64_t ldb,
-                                                float beta, float *c, std::int64_t ldc,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                                oneapi::mkl::transpose trans, std::int64_t n,
-                                                std::int64_t k, double alpha, const double *a,
-                                                std::int64_t lda, const double *b, std::int64_t ldb,
-                                                double beta, double *c, std::int64_t ldc,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_csyr2k_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                                oneapi::mkl::transpose trans, std::int64_t n,
-                                                std::int64_t k, std::complex<double> alpha,
-                                                const std::complex<double> *a, std::int64_t lda,
-                                                const std::complex<double> *b, std::int64_t ldb,
-                                                std::complex<double> beta, std::complex<double> *c,
-                                                std::int64_t ldc,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_strmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t m,
-                                               std::int64_t n, float alpha, const float *a,
-                                               std::int64_t lda, float *b, std::int64_t ldb,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtrmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t m,
-                                               std::int64_t n, double alpha, const double *a,
-                                               std::int64_t lda, double *b, std::int64_t ldb,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctrmm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::complex<float> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztrmm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::complex<double> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_strsm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t m,
-                                               std::int64_t n, float alpha, const float *a,
-                                               std::int64_t lda, float *b, std::int64_t ldb,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtrsm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans,
-                                               oneapi::mkl::diag unit_diag, std::int64_t m,
-                                               std::int64_t n, double alpha, const double *a,
-                                               std::int64_t lda, double *b, std::int64_t ldb,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctrsm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::complex<float> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztrsm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::complex<double> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_strsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtrsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctrsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztrsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_strsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b,
-        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dtrsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, double **b,
-        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ctrsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-        std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_ztrsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-        std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const float **a,
-        std::int64_t *lda, const float **b, std::int64_t *ldb, float *beta, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, const double **a,
-        std::int64_t *lda, const double **b, std::int64_t *ldb, double *beta, double **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<float> *alpha,
-        const std::complex<float> **a, std::int64_t *lda, const std::complex<float> **b,
-        std::int64_t *ldb, std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-        std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-        const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **b,
-        std::int64_t *ldb, std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-        std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_hgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, sycl::half *alpha, const sycl::half **a,
-        std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-        sycl::half **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_f16f16f32_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const sycl::half **a,
-        std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, float *beta, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_s8s8f32_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a,
-        std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_s8s8s32_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a,
-        std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-        std::int64_t lda, std::int64_t stride_a, const float *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-        std::int64_t lda, std::int64_t stride_a, const double *b, std::int64_t ldb,
-        std::int64_t stride_b, double beta, double *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-        const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-        const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_hgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a,
-        std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_f16f16f32_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-        std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_s8s8f32_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-        std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_s8s8s32_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-        std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_sgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                                oneapi::mkl::transpose transa,
-                                                oneapi::mkl::transpose transb, std::int64_t n,
-                                                std::int64_t k, float alpha, const float *a,
-                                                std::int64_t lda, const float *b, std::int64_t ldb,
-                                                float beta, float *c, std::int64_t ldc,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                                oneapi::mkl::transpose transa,
-                                                oneapi::mkl::transpose transb, std::int64_t n,
-                                                std::int64_t k, double alpha, const double *a,
-                                                std::int64_t lda, const double *b, std::int64_t ldb,
-                                                double beta, double *c, std::int64_t ldc,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cgemmt_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-        oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-        std::int64_t ldb, std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zgemmt_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-        oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-        std::int64_t ldb, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_s8u8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::uint8_t *b,
-        std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_s8s8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::int8_t *b,
-        std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_u8s8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::int8_t *b,
-        std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_gemm_u8u8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::uint8_t *b,
-        std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_somatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_domatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_comatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zomatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_simatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, float *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dimatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, double *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cimatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-        std::int64_t stride, std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zimatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-        std::int64_t stride, std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_somatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-        std::int64_t stride_a, float beta, const float *b, std::int64_t ldb, std::int64_t stride_b,
-        float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_domatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-        std::int64_t stride_a, double beta, const double *b, std::int64_t ldb,
-        std::int64_t stride_b, double *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_comatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-        const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zomatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-        const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-
-    sycl::event (*column_major_somatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, float alpha,
-                                                   const float *a, std::int64_t lda, float *b,
-                                                   std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_domatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, double alpha,
-                                                   const double *a, std::int64_t lda, double *b,
-                                                   std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_comatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n,
-                                                   std::complex<float> alpha,
-                                                   const std::complex<float> *a, std::int64_t lda,
-                                                   std::complex<float> *b, std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zomatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n,
-                                                   std::complex<double> alpha,
-                                                   const std::complex<double> *a, std::int64_t lda,
-                                                   std::complex<double> *b, std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_somatcopy2_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, const float *a, std::int64_t lda, std::int64_t stridea, float *b,
-        std::int64_t ldb, std::int64_t strideb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_domatcopy2_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, const double *a, std::int64_t lda, std::int64_t stridea, double *b,
-        std::int64_t ldb, std::int64_t strideb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_comatcopy2_usm_sycl)(sycl::queue &queue,
-                                                    oneapi::mkl::transpose trans, std::int64_t m,
-                                                    std::int64_t n, std::complex<float> alpha,
-                                                    const std::complex<float> *a, std::int64_t lda,
-                                                    std::int64_t stridea, std::complex<float> *b,
-                                                    std::int64_t ldb, std::int64_t strideb,
-                                                    const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zomatcopy2_usm_sycl)(sycl::queue &queue,
-                                                    oneapi::mkl::transpose trans, std::int64_t m,
-                                                    std::int64_t n, std::complex<double> alpha,
-                                                    const std::complex<double> *a, std::int64_t lda,
-                                                    std::int64_t stridea, std::complex<double> *b,
-                                                    std::int64_t ldb, std::int64_t strideb,
-                                                    const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_simatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, float alpha,
-                                                   float *ab, std::int64_t lda, std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, double alpha,
-                                                   double *ab, std::int64_t lda, std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n,
-                                                   std::complex<float> alpha,
-                                                   std::complex<float> *ab, std::int64_t lda,
-                                                   std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n,
-                                                   std::complex<double> alpha,
-                                                   std::complex<double> *ab, std::int64_t lda,
-                                                   std::int64_t ldb,
-                                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_somatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                                  oneapi::mkl::transpose transb, std::int64_t m,
-                                                  std::int64_t n, float alpha, const float *a,
-                                                  std::int64_t lda, float beta, const float *b,
-                                                  std::int64_t ldb, float *c, std::int64_t ldc,
-                                                  const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_domatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                                  oneapi::mkl::transpose transb, std::int64_t m,
-                                                  std::int64_t n, double alpha, const double *a,
-                                                  std::int64_t lda, double beta, const double *b,
-                                                  std::int64_t ldb, double *c, std::int64_t ldc,
-                                                  const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_comatadd_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, std::complex<float> beta, const std::complex<float> *b, std::int64_t ldb,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zomatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                                  oneapi::mkl::transpose transb, std::int64_t m,
-                                                  std::int64_t n, std::complex<double> alpha,
-                                                  const std::complex<double> *a, std::int64_t lda,
-                                                  std::complex<double> beta,
-                                                  const std::complex<double> *b, std::int64_t ldb,
-                                                  std::complex<double> *c, std::int64_t ldc,
-                                                  const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_somatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        float *alpha, const float **a, std::int64_t *lda, float **b, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_domatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        double *alpha, const double **a, std::int64_t *lda, double **b, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_comatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-        std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zomatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-        std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_simatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        float *alpha, float **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_dimatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        double *alpha, double **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_cimatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<float> *alpha, std::complex<float> **ab, std::int64_t *lda, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*column_major_zimatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<double> *alpha, std::complex<double> **ab, std::int64_t *lda,
-        std::int64_t *ldb, std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-
-    // Buffer APIs
-
-    void (*row_major_scasum_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<float, 1> &result);
-    void (*row_major_dzasum_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<double, 1> &result);
-    void (*row_major_sasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, sycl::buffer<float, 1> &result);
-    void (*row_major_dasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                 std::int64_t incx, sycl::buffer<double, 1> &result);
-    void (*row_major_saxpy_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx,
-                                 sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_daxpy_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx,
-                                 sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_caxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*row_major_zaxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_saxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                               sycl::buffer<float, 1> &x, std::int64_t incx,
-                                               std::int64_t stridex, sycl::buffer<float, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_daxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                               sycl::buffer<double, 1> &x, std::int64_t incx,
-                                               std::int64_t stridex, sycl::buffer<double, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_caxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                               std::complex<float> alpha,
-                                               sycl::buffer<std::complex<float>, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex,
-                                               sycl::buffer<std::complex<float>, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_zaxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                               std::complex<double> alpha,
-                                               sycl::buffer<std::complex<double>, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex,
-                                               sycl::buffer<std::complex<double>, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_saxpby_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                  sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                                  sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_daxpby_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                  sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                  sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_caxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                  std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                                  std::int64_t incy);
-    void (*row_major_zaxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                  std::complex<double> beta,
-                                  sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_scopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_dcopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                 std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_ccopy_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*row_major_zcopy_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_scopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                               sycl::buffer<float, 1> &x, std::int64_t incx,
-                                               std::int64_t stridex, sycl::buffer<float, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_dcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                               sycl::buffer<double, 1> &x, std::int64_t incx,
-                                               std::int64_t stridex, sycl::buffer<double, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_ccopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                               sycl::buffer<std::complex<float>, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex,
-                                               sycl::buffer<std::complex<float>, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_zcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n,
-                                               sycl::buffer<std::complex<double>, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex,
-                                               sycl::buffer<std::complex<double>, 1> &y,
-                                               std::int64_t incy, std::int64_t stridey,
-                                               std::int64_t batch_size);
-    void (*row_major_sdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                sycl::buffer<float, 1> &result);
-    void (*row_major_ddot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                                sycl::buffer<double, 1> &result);
-    void (*row_major_dsdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                 sycl::buffer<double, 1> &result);
-    void (*row_major_cdotc_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<float>, 1> &result);
-    void (*row_major_zdotc_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<double>, 1> &result);
-    void (*row_major_cdotu_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<float>, 1> &result);
-    void (*row_major_zdotu_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<double>, 1> &result);
-    void (*row_major_isamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                  std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_idamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                  std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_icamin_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_izamin_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_isamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                  std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_idamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                  std::int64_t incx, sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_icamax_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_izamax_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<std::int64_t, 1> &result);
-    void (*row_major_scnrm2_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<float, 1> &result);
-    void (*row_major_dznrm2_sycl)(sycl::queue &queue, std::int64_t n,
-                                  sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                  sycl::buffer<double, 1> &result);
-    void (*row_major_snrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, sycl::buffer<float, 1> &result);
-    void (*row_major_dnrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                 std::int64_t incx, sycl::buffer<double, 1> &result);
-    void (*row_major_srot_sycl)(sycl::queue &queue, std::int64_t n,
-                                sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy, float c,
-                                float s);
-    void (*row_major_drot_sycl)(sycl::queue &queue, std::int64_t n,
-                                sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                double c, double s);
-    void (*row_major_csrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                 float c, float s);
-    void (*row_major_zdrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                 std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                                 double c, double s);
-    void (*row_major_srotg_sycl)(sycl::queue &queue, sycl::buffer<float, 1> &a,
-                                 sycl::buffer<float, 1> &b, sycl::buffer<float, 1> &c,
-                                 sycl::buffer<float, 1> &s);
-    void (*row_major_drotg_sycl)(sycl::queue &queue, sycl::buffer<double, 1> &a,
-                                 sycl::buffer<double, 1> &b, sycl::buffer<double, 1> &c,
-                                 sycl::buffer<double, 1> &s);
-    void (*row_major_crotg_sycl)(sycl::queue &queue, sycl::buffer<std::complex<float>, 1> &a,
-                                 sycl::buffer<std::complex<float>, 1> &b, sycl::buffer<float, 1> &c,
-                                 sycl::buffer<std::complex<float>, 1> &s);
-    void (*row_major_zrotg_sycl)(sycl::queue &queue, sycl::buffer<std::complex<double>, 1> &a,
-                                 sycl::buffer<std::complex<double>, 1> &b,
-                                 sycl::buffer<double, 1> &c,
-                                 sycl::buffer<std::complex<double>, 1> &s);
-    void (*row_major_srotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy,
-                                 sycl::buffer<float, 1> &param);
-    void (*row_major_drotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                 std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy,
-                                 sycl::buffer<double, 1> &param);
-    void (*row_major_srotmg_sycl)(sycl::queue &queue, sycl::buffer<float, 1> &d1,
-                                  sycl::buffer<float, 1> &d2, sycl::buffer<float, 1> &x1, float y1,
-                                  sycl::buffer<float, 1> &param);
-    void (*row_major_drotmg_sycl)(sycl::queue &queue, sycl::buffer<double, 1> &d1,
-                                  sycl::buffer<double, 1> &d2, sycl::buffer<double, 1> &x1,
-                                  double y1, sycl::buffer<double, 1> &param);
-    void (*row_major_sscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*row_major_dscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*row_major_cscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> alpha,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*row_major_csscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*row_major_zscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*row_major_zdscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                  sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*row_major_sdsdot_sycl)(sycl::queue &queue, std::int64_t n, float sb,
-                                  sycl::buffer<float, 1> &x, std::int64_t incx,
-                                  sycl::buffer<float, 1> &y, std::int64_t incy,
-                                  sycl::buffer<float, 1> &result);
-    void (*row_major_sswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_dswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double, 1> &x,
-                                 std::int64_t incx, sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_cswap_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*row_major_zswap_sycl)(sycl::queue &queue, std::int64_t n,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_sgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha,
-                                 sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                                 sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_dgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha,
-                                 sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                 sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_cgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*row_major_zgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, std::int64_t kl, std::int64_t ku,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_sgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, float alpha, sycl::buffer<float, 1> &a,
-                                 std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                 float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_dgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, double alpha, sycl::buffer<double, 1> &a,
-                                 std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                 double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_cgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, std::complex<float> alpha,
-                                 sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                                 std::int64_t incy);
-    void (*row_major_zgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m,
-                                 std::int64_t n, std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_sgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                               std::int64_t m, std::int64_t n, float alpha,
-                                               sycl::buffer<float, 1> &a, std::int64_t lda,
-                                               std::int64_t stridea, sycl::buffer<float, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex, float beta,
-                                               sycl::buffer<float, 1> &y, std::int64_t incy,
-                                               std::int64_t stridey, std::int64_t batch_size);
-    void (*row_major_dgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                               std::int64_t m, std::int64_t n, double alpha,
-                                               sycl::buffer<double, 1> &a, std::int64_t lda,
-                                               std::int64_t stridea, sycl::buffer<double, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex, double beta,
-                                               sycl::buffer<double, 1> &y, std::int64_t incy,
-                                               std::int64_t stridey, std::int64_t batch_size);
-    void (*row_major_cgemv_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-        std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-        std::int64_t stridex, std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-        std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-    void (*row_major_zgemv_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-        std::int64_t stridea, sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-        std::int64_t stridex, std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &y,
-        std::int64_t incy, std::int64_t stridey, std::int64_t batch_size);
-    void (*row_major_sdgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               std::int64_t m, std::int64_t n,
-                                               sycl::buffer<float, 1> &a, std::int64_t lda,
-                                               std::int64_t stridea, sycl::buffer<float, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex,
-                                               sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                               std::int64_t stridec, std::int64_t batch_size);
-    void (*row_major_ddgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                               std::int64_t m, std::int64_t n,
-                                               sycl::buffer<double, 1> &a, std::int64_t lda,
-                                               std::int64_t stridea, sycl::buffer<double, 1> &x,
-                                               std::int64_t incx, std::int64_t stridex,
-                                               sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                               std::int64_t stridec, std::int64_t batch_size);
-    void (*row_major_cdgmm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stridea,
-        sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx, std::int64_t stridex,
-        sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size);
-    void (*row_major_zdgmm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stridea,
-        sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx, std::int64_t stridex,
-        sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size);
-    void (*row_major_sger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha,
-                                sycl::buffer<float, 1> &x, std::int64_t incx,
-                                sycl::buffer<float, 1> &y, std::int64_t incy,
-                                sycl::buffer<float, 1> &a, std::int64_t lda);
-    void (*row_major_dger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha,
-                                sycl::buffer<double, 1> &x, std::int64_t incx,
-                                sycl::buffer<double, 1> &y, std::int64_t incy,
-                                sycl::buffer<double, 1> &a, std::int64_t lda);
-    void (*row_major_cgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                                 std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda);
-    void (*row_major_zgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-    void (*row_major_cgeru_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                                 std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda);
-    void (*row_major_zgeru_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-    void (*row_major_chbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::int64_t k, std::complex<float> alpha,
-                                 sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                                 std::int64_t incy);
-    void (*row_major_zhbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::int64_t k, std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_chemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &y, std::int64_t incy);
-    void (*row_major_zhemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_cher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                float alpha, sycl::buffer<std::complex<float>, 1> &x,
-                                std::int64_t incx, sycl::buffer<std::complex<float>, 1> &a,
-                                std::int64_t lda);
-    void (*row_major_zher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                                std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a,
-                                std::int64_t lda);
-    void (*row_major_cher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                                 std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda);
-    void (*row_major_zher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda);
-    void (*row_major_chpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx,
-                                 std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &y,
-                                 std::int64_t incy);
-    void (*row_major_zhpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy);
-    void (*row_major_chpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                float alpha, sycl::buffer<std::complex<float>, 1> &x,
-                                std::int64_t incx, sycl::buffer<std::complex<float>, 1> &a);
-    void (*row_major_zhpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                double alpha, sycl::buffer<std::complex<double>, 1> &x,
-                                std::int64_t incx, sycl::buffer<std::complex<double>, 1> &a);
-    void (*row_major_chpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx, sycl::buffer<std::complex<float>, 1> &y,
-                                 std::int64_t incy, sycl::buffer<std::complex<float>, 1> &a);
-    void (*row_major_zhpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx,
-                                 sycl::buffer<std::complex<double>, 1> &y, std::int64_t incy,
-                                 sycl::buffer<std::complex<double>, 1> &a);
-    void (*row_major_ssbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                                 std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                 float beta, sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_dsbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                                 std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                 double beta, sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_sspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &a, sycl::buffer<float, 1> &x,
-                                 std::int64_t incx, float beta, sycl::buffer<float, 1> &y,
-                                 std::int64_t incy);
-    void (*row_major_dspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &a,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                 sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_sspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                sycl::buffer<float, 1> &a);
-    void (*row_major_dspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                sycl::buffer<double, 1> &a);
-    void (*row_major_sspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                 sycl::buffer<float, 1> &y, std::int64_t incy,
-                                 sycl::buffer<float, 1> &a);
-    void (*row_major_dspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                 sycl::buffer<double, 1> &y, std::int64_t incy,
-                                 sycl::buffer<double, 1> &a);
-    void (*row_major_ssymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx, float beta,
-                                 sycl::buffer<float, 1> &y, std::int64_t incy);
-    void (*row_major_dsymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx, double beta,
-                                 sycl::buffer<double, 1> &y, std::int64_t incy);
-    void (*row_major_ssyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                sycl::buffer<float, 1> &a, std::int64_t lda);
-    void (*row_major_dsyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                sycl::buffer<double, 1> &a, std::int64_t lda);
-    void (*row_major_ssyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &x, std::int64_t incx,
-                                 sycl::buffer<float, 1> &y, std::int64_t incy,
-                                 sycl::buffer<float, 1> &a, std::int64_t lda);
-    void (*row_major_dsyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &x, std::int64_t incx,
-                                 sycl::buffer<double, 1> &y, std::int64_t incy,
-                                 sycl::buffer<double, 1> &a, std::int64_t lda);
-    void (*row_major_stbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-                                 std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*row_major_dtbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-                                 std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*row_major_ctbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k,
-                                 sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*row_major_ztbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*row_major_stbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k, sycl::buffer<float, 1> &a,
-                                 std::int64_t lda, sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*row_major_dtbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k, sycl::buffer<double, 1> &a,
-                                 std::int64_t lda, sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*row_major_ctbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k,
-                                 sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*row_major_ztbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, std::int64_t k,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*row_major_stpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<float, 1> &a,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*row_major_dtpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<double, 1> &a,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*row_major_ctpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*row_major_ztpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*row_major_stpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<float, 1> &a,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*row_major_dtpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<double, 1> &a,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*row_major_ctpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                 sycl::buffer<std::complex<float>, 1> &x, std::int64_t incx);
-    void (*row_major_ztpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                 sycl::buffer<std::complex<double>, 1> &x, std::int64_t incx);
-    void (*row_major_strmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*row_major_dtrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*row_major_ctrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx);
-    void (*row_major_ztrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                                 std::int64_t incx);
-    void (*row_major_strsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 sycl::buffer<float, 1> &x, std::int64_t incx);
-    void (*row_major_dtrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 sycl::buffer<double, 1> &x, std::int64_t incx);
-    void (*row_major_ctrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &x,
-                                 std::int64_t incx);
-    void (*row_major_ztrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag,
-                                 std::int64_t n, sycl::buffer<std::complex<double>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<double>, 1> &x,
-                                 std::int64_t incx);
-    void (*row_major_sgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 std::int64_t k, float alpha, sycl::buffer<float, 1> &a,
-                                 std::int64_t lda, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                 float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*row_major_dgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 std::int64_t k, double alpha, sycl::buffer<double, 1> &a,
-                                 std::int64_t lda, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                                 double beta, sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*row_major_cgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 std::int64_t k, std::complex<float> alpha,
-                                 sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                 std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                                 std::int64_t ldc);
-    void (*row_major_zgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 std::int64_t k, std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*row_major_hgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                 oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                 std::int64_t k, sycl::half alpha, sycl::buffer<sycl::half, 1> &a,
-                                 std::int64_t lda, sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                                 sycl::half beta, sycl::buffer<sycl::half, 1> &c, std::int64_t ldc);
-    void (*row_major_gemm_f16f16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                          oneapi::mkl::transpose transb, std::int64_t m,
-                                          std::int64_t n, std::int64_t k, float alpha,
-                                          sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                                          sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                                          float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*row_major_gemm_bf16bf16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                            oneapi::mkl::transpose transb, std::int64_t m,
-                                            std::int64_t n, std::int64_t k, float alpha,
-                                            sycl::buffer<oneapi::mkl::bfloat16, 1> &a,
-                                            std::int64_t lda,
-                                            sycl::buffer<oneapi::mkl::bfloat16, 1> &b,
-                                            std::int64_t ldb, float beta, sycl::buffer<float, 1> &c,
-                                            std::int64_t ldc);
-    void (*row_major_chemm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                                 std::int64_t ldb, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*row_major_zhemm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*row_major_cherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                 float alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, float beta,
-                                 sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*row_major_zherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                 double alpha, sycl::buffer<std::complex<double>, 1> &a,
-                                 std::int64_t lda, double beta,
-                                 sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*row_major_cher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                  std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                  sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                  float beta, sycl::buffer<std::complex<float>, 1> &c,
-                                  std::int64_t ldc);
-    void (*row_major_zher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                  std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                  sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                  double beta, sycl::buffer<std::complex<double>, 1> &c,
-                                  std::int64_t ldc);
-    void (*row_major_ssymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                                 sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*row_major_dsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                                 sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*row_major_csymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                                 std::int64_t ldb, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*row_major_zsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*row_major_ssyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                 float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 float beta, sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*row_major_dsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                 double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 double beta, sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*row_major_csyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, std::complex<float> beta,
-                                 sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*row_major_zsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                 oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 std::complex<double> beta,
-                                 sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*row_major_ssyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, float alpha,
-                                               sycl::buffer<float, 1> &a, std::int64_t lda,
-                                               std::int64_t stride_a, float beta,
-                                               sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                               std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_dsyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                               oneapi::mkl::transpose trans, std::int64_t n,
-                                               std::int64_t k, double alpha,
-                                               sycl::buffer<double, 1> &a, std::int64_t lda,
-                                               std::int64_t stride_a, double beta,
-                                               sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                               std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_csyrk_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_zsyrk_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_ssyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                  float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                  sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                                  sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*row_major_dsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                  double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                  sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                                  sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*row_major_csyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                  std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                  sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                  std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                                  std::int64_t ldc);
-    void (*row_major_zsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k,
-                                  std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                  sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                  std::complex<double> beta,
-                                  sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*row_major_strmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 sycl::buffer<float, 1> &b, std::int64_t ldb);
-    void (*row_major_dtrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 sycl::buffer<double, 1> &b, std::int64_t ldb);
-    void (*row_major_ctrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                                 std::int64_t ldb);
-    void (*row_major_ztrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-    void (*row_major_strsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                 sycl::buffer<float, 1> &b, std::int64_t ldb);
-    void (*row_major_dtrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                 sycl::buffer<double, 1> &b, std::int64_t ldb);
-    void (*row_major_ctrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a,
-                                 std::int64_t lda, sycl::buffer<std::complex<float>, 1> &b,
-                                 std::int64_t ldb);
-    void (*row_major_ztrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                                 oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-                                 std::complex<double> alpha,
-                                 sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                 sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-    void (*row_major_sgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::int64_t k, float alpha,
-                                               sycl::buffer<float, 1> &a, std::int64_t lda,
-                                               std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                                               std::int64_t ldb, std::int64_t stride_b, float beta,
-                                               sycl::buffer<float, 1> &c, std::int64_t ldc,
-                                               std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_dgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::int64_t k, double alpha,
-                                               sycl::buffer<double, 1> &a, std::int64_t lda,
-                                               std::int64_t stride_a, sycl::buffer<double, 1> &b,
-                                               std::int64_t ldb, std::int64_t stride_b, double beta,
-                                               sycl::buffer<double, 1> &c, std::int64_t ldc,
-                                               std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_cgemm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_zgemm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_hgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::int64_t k, sycl::half alpha,
-                                               sycl::buffer<sycl::half, 1> &a, std::int64_t lda,
-                                               std::int64_t stride_a,
-                                               sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-                                               std::int64_t stride_b, sycl::half beta,
-                                               sycl::buffer<sycl::half, 1> &c, std::int64_t ldc,
-                                               std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_gemm_f16f16f32_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer<sycl::half, 1> &a,
-        std::int64_t lda, std::int64_t stride_a, sycl::buffer<sycl::half, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, sycl::buffer<float, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_gemm_s8s8f32_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-        sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*row_major_gemm_s8s8s32_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<std::int8_t, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<std::int8_t, 1> &b, std::int64_t ldb, std::int64_t stride_b, float beta,
-        sycl::buffer<std::int32_t, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*row_major_strsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        float alpha, sycl::buffer<float, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<float, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size);
-    void (*row_major_dtrsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        double alpha, sycl::buffer<double, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        sycl::buffer<double, 1> &b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size);
-    void (*row_major_ctrsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*row_major_ztrsm_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*row_major_sgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                  std::int64_t n, std::int64_t k, float alpha,
-                                  sycl::buffer<float, 1> &a, std::int64_t lda,
-                                  sycl::buffer<float, 1> &b, std::int64_t ldb, float beta,
-                                  sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*row_major_dgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                  std::int64_t n, std::int64_t k, double alpha,
-                                  sycl::buffer<double, 1> &a, std::int64_t lda,
-                                  sycl::buffer<double, 1> &b, std::int64_t ldb, double beta,
-                                  sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*row_major_cgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                  std::int64_t n, std::int64_t k, std::complex<float> alpha,
-                                  sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                  sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                  std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &c,
-                                  std::int64_t ldc);
-    void (*row_major_zgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                  oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-                                  std::int64_t n, std::int64_t k, std::complex<double> alpha,
-                                  sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                  sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                  std::complex<double> beta,
-                                  sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-    void (*row_major_gemm_s8u8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<uint8_t, 1> &b,
-        std::int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*row_major_gemm_s8s8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<int8_t, 1> &a, std::int64_t lda, int8_t ao, sycl::buffer<int8_t, 1> &b,
-        std::int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*row_major_gemm_u8s8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao, sycl::buffer<int8_t, 1> &b,
-        std::int64_t ldb, int8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*row_major_gemm_u8u8s32_bias_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        sycl::buffer<uint8_t, 1> &a, std::int64_t lda, uint8_t ao, sycl::buffer<uint8_t, 1> &b,
-        std::int64_t ldb, uint8_t bo, float beta, sycl::buffer<int32_t, 1> &c, std::int64_t ldc,
-        sycl::buffer<int32_t, 1> &co);
-    void (*row_major_somatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, float alpha,
-                                                   sycl::buffer<float, 1> &a, std::int64_t lda,
-                                                   std::int64_t stride_a, sycl::buffer<float, 1> &b,
-                                                   std::int64_t ldb, std::int64_t stride_b,
-                                                   std::int64_t batch_size);
-    void (*row_major_domatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, double alpha,
-                                                   sycl::buffer<double, 1> &a, std::int64_t lda,
-                                                   std::int64_t stride_a,
-                                                   sycl::buffer<double, 1> &b, std::int64_t ldb,
-                                                   std::int64_t stride_b, std::int64_t batch_size);
-    void (*row_major_comatcopy_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*row_major_zomatcopy_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    void (*row_major_simatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, float alpha,
-                                                   sycl::buffer<float, 1> &ab, std::int64_t lda,
-                                                   std::int64_t ldb, std::int64_t stride,
-                                                   std::int64_t batch_size);
-    void (*row_major_dimatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n, double alpha,
-                                                   sycl::buffer<double, 1> &ab, std::int64_t lda,
-                                                   std::int64_t ldb, std::int64_t stride,
-                                                   std::int64_t batch_size);
-    void (*row_major_cimatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n,
-                                                   std::complex<float> alpha,
-                                                   sycl::buffer<std::complex<float>, 1> &ab,
-                                                   std::int64_t lda, std::int64_t ldb,
-                                                   std::int64_t stride, std::int64_t batch_size);
-    void (*row_major_zimatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                   std::int64_t m, std::int64_t n,
-                                                   std::complex<double> alpha,
-                                                   sycl::buffer<std::complex<double>, 1> &ab,
-                                                   std::int64_t lda, std::int64_t ldb,
-                                                   std::int64_t stride, std::int64_t batch_size);
-    void (*row_major_somatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<float, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*row_major_domatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-        std::int64_t stride_a, double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<double, 1> &c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size);
-    void (*row_major_comatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha,
-        sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<float> beta, sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-    void (*row_major_zomatadd_batch_strided_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<double> alpha,
-        sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda, std::int64_t stride_a,
-        std::complex<double> beta, sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size);
-
-    void (*row_major_somatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, float alpha,
-                                     sycl::buffer<float, 1> &a, std::int64_t lda,
-                                     sycl::buffer<float, 1> &b, std::int64_t ldb);
-    void (*row_major_domatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, double alpha,
-                                     sycl::buffer<double, 1> &a, std::int64_t lda,
-                                     sycl::buffer<double, 1> &b, std::int64_t ldb);
-    void (*row_major_comatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                     sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb);
-    void (*row_major_zomatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                     sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                     sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb);
-    void (*row_major_somatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t m, std::int64_t n, float alpha,
-                                      sycl::buffer<float, 1> &a, std::int64_t lda,
-                                      std::int64_t stridea, sycl::buffer<float, 1> &b,
-                                      std::int64_t ldb, std::int64_t strideb);
-    void (*row_major_domatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t m, std::int64_t n, double alpha,
-                                      sycl::buffer<double, 1> &a, std::int64_t lda,
-                                      std::int64_t stridea, sycl::buffer<double, 1> &b,
-                                      std::int64_t ldb, std::int64_t strideb);
-    void (*row_major_comatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                      sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                      std::int64_t stridea, sycl::buffer<std::complex<float>, 1> &b,
-                                      std::int64_t ldb, std::int64_t strideb);
-    void (*row_major_zomatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                      std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                      sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                      std::int64_t stridea,
-                                      sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                      std::int64_t strideb);
-    void (*row_major_simatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, float alpha,
-                                     sycl::buffer<float, 1> &ab, std::int64_t lda,
-                                     std::int64_t ldb);
-    void (*row_major_dimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, double alpha,
-                                     sycl::buffer<double, 1> &ab, std::int64_t lda,
-                                     std::int64_t ldb);
-    void (*row_major_cimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, std::complex<float> alpha,
-                                     sycl::buffer<std::complex<float>, 1> &ab, std::int64_t lda,
-                                     std::int64_t ldb);
-    void (*row_major_zimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                     std::int64_t m, std::int64_t n, std::complex<double> alpha,
-                                     sycl::buffer<std::complex<double>, 1> &ab, std::int64_t lda,
-                                     std::int64_t ldb);
-    void (*row_major_somatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    float alpha, sycl::buffer<float, 1> &a, std::int64_t lda,
-                                    float beta, sycl::buffer<float, 1> &b, std::int64_t ldb,
-                                    sycl::buffer<float, 1> &c, std::int64_t ldc);
-    void (*row_major_domatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    double alpha, sycl::buffer<double, 1> &a, std::int64_t lda,
-                                    double beta, sycl::buffer<double, 1> &b, std::int64_t ldb,
-                                    sycl::buffer<double, 1> &c, std::int64_t ldc);
-    void (*row_major_comatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    std::complex<float> alpha,
-                                    sycl::buffer<std::complex<float>, 1> &a, std::int64_t lda,
-                                    std::complex<float> beta,
-                                    sycl::buffer<std::complex<float>, 1> &b, std::int64_t ldb,
-                                    sycl::buffer<std::complex<float>, 1> &c, std::int64_t ldc);
-    void (*row_major_zomatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                    oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n,
-                                    std::complex<double> alpha,
-                                    sycl::buffer<std::complex<double>, 1> &a, std::int64_t lda,
-                                    std::complex<double> beta,
-                                    sycl::buffer<std::complex<double>, 1> &b, std::int64_t ldb,
-                                    sycl::buffer<std::complex<double>, 1> &c, std::int64_t ldc);
-
-    // USM APIs
-
-    sycl::event (*row_major_scasum_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<float> *x, std::int64_t incx,
-                                             float *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dzasum_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<double> *x, std::int64_t incx,
-                                             double *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                            std::int64_t incx, float *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                            std::int64_t incx, double *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_saxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                            const float *x, std::int64_t incx, float *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_daxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                            const double *x, std::int64_t incx, double *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_caxpy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            std::complex<float> alpha, const std::complex<float> *x,
-                                            std::int64_t incx, std::complex<float> *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zaxpy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            std::complex<double> alpha,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            std::complex<double> *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_saxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx,
-        float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_daxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx,
-        double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_caxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, std::complex<float> *alpha,
-        const std::complex<float> **x, std::int64_t *incx, std::complex<float> **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zaxpy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, std::complex<double> *alpha,
-        const std::complex<double> **x, std::int64_t *incx, std::complex<double> **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_saxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx,
-        std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_daxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx,
-        std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_caxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, std::complex<float> alpha, const std::complex<float> *x,
-        std::int64_t incx, std::int64_t stridex, std::complex<float> *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zaxpy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, std::complex<double> alpha,
-        const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<double> *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_saxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                             const float *x, std::int64_t incx, const float beta,
-                                             float *y, std::int64_t incy,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_daxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                             const double *x, std::int64_t incx, const double beta,
-                                             double *y, std::int64_t incy,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_caxpby_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             std::complex<float> alpha,
-                                             const std::complex<float> *x, std::int64_t incx,
-                                             const std::complex<float> beta, std::complex<float> *y,
-                                             std::int64_t incy,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zaxpby_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             std::complex<double> alpha,
-                                             const std::complex<double> *x, std::int64_t incx,
-                                             const std::complex<double> beta,
-                                             std::complex<double> *y, std::int64_t incy,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_scopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                            std::int64_t incx, float *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dcopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                            std::int64_t incx, double *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ccopy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            const std::complex<float> *x, std::int64_t incx,
-                                            std::complex<float> *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zcopy_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            std::complex<double> *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_scopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const float **x, std::int64_t *incx, float **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const double **x, std::int64_t *incx, double **y,
-        std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ccopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const std::complex<float> **x, std::int64_t *incx,
-        std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, std::int64_t *n, const std::complex<double> **x, std::int64_t *incx,
-        std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_scopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t stridex,
-        float *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx,
-        std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ccopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const std::complex<float> *x, std::int64_t incx,
-        std::int64_t stridex, std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, std::int64_t n, const std::complex<double> *x, std::int64_t incx,
-        std::int64_t stridex, std::complex<double> *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                           std::int64_t incx, const float *y, std::int64_t incy,
-                                           float *result,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ddot_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                           std::int64_t incx, const double *y, std::int64_t incy,
-                                           double *result,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                            std::int64_t incx, const float *y, std::int64_t incy,
-                                            double *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cdotc_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            const std::complex<float> *x, std::int64_t incx,
-                                            const std::complex<float> *y, std::int64_t incy,
-                                            std::complex<float> *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zdotc_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            const std::complex<double> *y, std::int64_t incy,
-                                            std::complex<double> *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cdotu_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            const std::complex<float> *x, std::int64_t incx,
-                                            const std::complex<float> *y, std::int64_t incy,
-                                            std::complex<float> *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zdotu_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            const std::complex<double> *y, std::int64_t incy,
-                                            std::complex<double> *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_isamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                             std::int64_t incx, std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_idamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                             std::int64_t incx, std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_icamin_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<float> *x, std::int64_t incx,
-                                             std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_izamin_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<double> *x, std::int64_t incx,
-                                             std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_isamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                             std::int64_t incx, std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_idamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                             std::int64_t incx, std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_icamax_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<float> *x, std::int64_t incx,
-                                             std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_izamax_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<double> *x, std::int64_t incx,
-                                             std::int64_t *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_scnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<float> *x, std::int64_t incx,
-                                             float *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dznrm2_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             const std::complex<double> *x, std::int64_t incx,
-                                             double *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_snrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x,
-                                            std::int64_t incx, float *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x,
-                                            std::int64_t incx, double *result,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_srot_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                           std::complex<float> *x, std::int64_t incx,
-                                           std::complex<float> *y, std::int64_t incy, float c,
-                                           float s, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_drot_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                           std::complex<double> *x, std::int64_t incx,
-                                           std::complex<double> *y, std::int64_t incy, double c,
-                                           double s, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_csrot_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x,
-                                            std::int64_t incx, float *y, std::int64_t incy, float c,
-                                            float s, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zdrot_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x,
-                                            std::int64_t incx, double *y, std::int64_t incy,
-                                            double c, double s,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_srotg_usm_sycl)(sycl::queue &queue, float *a, float *b, float *c,
-                                            float *s, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_drotg_usm_sycl)(sycl::queue &queue, double *a, double *b, double *c,
-                                            double *s,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_crotg_usm_sycl)(sycl::queue &queue, std::complex<float> *a,
-                                            std::complex<float> *b, float *c,
-                                            std::complex<float> *s,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zrotg_usm_sycl)(sycl::queue &queue, std::complex<double> *a,
-                                            std::complex<double> *b, double *c,
-                                            std::complex<double> *s,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_srotm_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x,
-                                            std::int64_t incx, float *y, std::int64_t incy,
-                                            float *param,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_drotm_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x,
-                                            std::int64_t incx, double *y, std::int64_t incy,
-                                            double *param,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_srotmg_usm_sycl)(sycl::queue &queue, float *d1, float *d2, float *x1,
-                                             float y1, float *param,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_drotmg_usm_sycl)(sycl::queue &queue, double *d1, double *d2, double *x1,
-                                             double y1, double *param,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                            float *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                            double *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cscal_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            std::complex<float> alpha, std::complex<float> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_csscal_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                             std::complex<double> alpha, std::complex<double> *x,
-                                             std::int64_t incx,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha,
-                                            std::complex<float> *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zdscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha,
-                                             std::complex<double> *x, std::int64_t incx,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sdsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, float sb,
-                                             const float *x, std::int64_t incx, const float *y,
-                                             std::int64_t incy, float *result,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sswap_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x,
-                                            std::int64_t incx, float *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dswap_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x,
-                                            std::int64_t incx, double *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cswap_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            std::complex<float> *x, std::int64_t incx,
-                                            std::complex<float> *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zswap_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                            std::complex<double> *x, std::int64_t incx,
-                                            std::complex<double> *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                            std::int64_t m, std::int64_t n, std::int64_t kl,
-                                            std::int64_t ku, float alpha, const float *a,
-                                            std::int64_t lda, const float *x, std::int64_t incx,
-                                            float beta, float *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                            std::int64_t m, std::int64_t n, std::int64_t kl,
-                                            std::int64_t ku, double alpha, const double *a,
-                                            std::int64_t lda, const double *x, std::int64_t incx,
-                                            double beta, double *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgbmv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::int64_t kl, std::int64_t ku, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *x, std::int64_t incx, std::complex<float> beta,
-        std::complex<float> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                            std::int64_t m, std::int64_t n, std::int64_t kl,
-                                            std::int64_t ku, std::complex<double> alpha,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            std::complex<double> beta, std::complex<double> *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                            std::int64_t m, std::int64_t n, float alpha,
-                                            const float *a, std::int64_t lda, const float *x,
-                                            std::int64_t incx, float beta, float *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                            std::int64_t m, std::int64_t n, double alpha,
-                                            const double *a, std::int64_t lda, const double *x,
-                                            std::int64_t incx, double beta, double *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                            std::int64_t m, std::int64_t n,
-                                            std::complex<float> alpha, const std::complex<float> *a,
-                                            std::int64_t lda, const std::complex<float> *x,
-                                            std::int64_t incx, std::complex<float> beta,
-                                            std::complex<float> *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgemv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-        std::complex<double> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, const float *a, std::int64_t lda, std::int64_t stridea, const float *x,
-        std::int64_t incx, std::int64_t stridex, float beta, float *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, const double *a, std::int64_t lda, std::int64_t stridea, const double *x,
-        std::int64_t incx, std::int64_t stridex, double beta, double *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::int64_t stridea, const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<float> beta, std::complex<float> *y, std::int64_t incy, std::int64_t stridey,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgemv_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::int64_t stridea, const std::complex<double> *x, std::int64_t incx,
-        std::int64_t stridex, std::complex<double> beta, std::complex<double> *y, std::int64_t incy,
-        std::int64_t stridey, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        float *alpha, const float **a, std::int64_t *lda, const float **x, std::int64_t *incx,
-        float *beta, float **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        double *alpha, const double **a, std::int64_t *lda, const double **x, std::int64_t *incx,
-        double *beta, double **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-        const std::complex<float> **x, std::int64_t *incx, std::complex<float> *beta,
-        std::complex<float> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgemv_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-        const std::complex<double> **x, std::int64_t *incx, std::complex<double> *beta,
-        std::complex<double> **y, std::int64_t *incy, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sdgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const float *a, std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx,
-        std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ddgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const double *a, std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx,
-        std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cdgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const std::complex<float> *a, std::int64_t lda, std::int64_t stridea,
-        const std::complex<float> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<float> *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zdgmm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n,
-        const std::complex<double> *a, std::int64_t lda, std::int64_t stridea,
-        const std::complex<double> *x, std::int64_t incx, std::int64_t stridex,
-        std::complex<double> *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sdgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const float **a, std::int64_t *lda, const float **x, std::int64_t *incx, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ddgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const double **a, std::int64_t *lda, const double **x, std::int64_t *incx, double **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cdgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const std::complex<float> **a, std::int64_t *lda, const std::complex<float> **x,
-        std::int64_t *incx, std::complex<float> **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zdgmm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n,
-        const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **x,
-        std::int64_t *incx, std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           float alpha, const float *x, std::int64_t incx,
-                                           const float *y, std::int64_t incy, float *a,
-                                           std::int64_t lda,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           double alpha, const double *x, std::int64_t incx,
-                                           const double *y, std::int64_t incy, double *a,
-                                           std::int64_t lda,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                            std::complex<float> alpha, const std::complex<float> *x,
-                                            std::int64_t incx, const std::complex<float> *y,
-                                            std::int64_t incy, std::complex<float> *a,
-                                            std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                            std::complex<double> alpha,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            const std::complex<double> *y, std::int64_t incy,
-                                            std::complex<double> *a, std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                            std::complex<float> alpha, const std::complex<float> *x,
-                                            std::int64_t incx, const std::complex<float> *y,
-                                            std::int64_t incy, std::complex<float> *a,
-                                            std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                            std::complex<double> alpha,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            const std::complex<double> *y, std::int64_t incy,
-                                            std::complex<double> *a, std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_chbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::int64_t k,
-                                            std::complex<float> alpha, const std::complex<float> *a,
-                                            std::int64_t lda, const std::complex<float> *x,
-                                            std::int64_t incx, std::complex<float> beta,
-                                            std::complex<float> *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zhbmv_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        const std::complex<double> *x, std::int64_t incx, std::complex<double> beta,
-        std::complex<double> *y, std::int64_t incy, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_chemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<float> alpha,
-                                            const std::complex<float> *a, std::int64_t lda,
-                                            const std::complex<float> *x, std::int64_t incx,
-                                            std::complex<float> beta, std::complex<float> *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zhemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<double> alpha,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            std::complex<double> beta, std::complex<double> *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, float alpha,
-                                           const std::complex<float> *x, std::int64_t incx,
-                                           std::complex<float> *a, std::int64_t lda,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, double alpha,
-                                           const std::complex<double> *x, std::int64_t incx,
-                                           std::complex<double> *a, std::int64_t lda,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<float> alpha,
-                                            const std::complex<float> *x, std::int64_t incx,
-                                            const std::complex<float> *y, std::int64_t incy,
-                                            std::complex<float> *a, std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<double> alpha,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            const std::complex<double> *y, std::int64_t incy,
-                                            std::complex<double> *a, std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_chpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<float> alpha,
-                                            const std::complex<float> *a,
-                                            const std::complex<float> *x, std::int64_t incx,
-                                            std::complex<float> beta, std::complex<float> *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zhpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<double> alpha,
-                                            const std::complex<double> *a,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            std::complex<double> beta, std::complex<double> *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_chpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, float alpha,
-                                           const std::complex<float> *x, std::int64_t incx,
-                                           std::complex<float> *a,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zhpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, double alpha,
-                                           const std::complex<double> *x, std::int64_t incx,
-                                           std::complex<double> *a,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_chpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<float> alpha,
-                                            const std::complex<float> *x, std::int64_t incx,
-                                            const std::complex<float> *y, std::int64_t incy,
-                                            std::complex<float> *a,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zhpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::complex<double> alpha,
-                                            const std::complex<double> *x, std::int64_t incx,
-                                            const std::complex<double> *y, std::int64_t incy,
-                                            std::complex<double> *a,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::int64_t k, float alpha,
-                                            const float *a, std::int64_t lda, const float *x,
-                                            std::int64_t incx, float beta, float *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, std::int64_t k, double alpha,
-                                            const double *a, std::int64_t lda, const double *x,
-                                            std::int64_t incx, double beta, double *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, float alpha, const float *a,
-                                            const float *x, std::int64_t incx, float beta, float *y,
-                                            std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, double alpha, const double *a,
-                                            const double *x, std::int64_t incx, double beta,
-                                            double *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, float alpha, const float *x,
-                                           std::int64_t incx, float *a,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, double alpha, const double *x,
-                                           std::int64_t incx, double *a,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, float alpha, const float *x,
-                                            std::int64_t incx, const float *y, std::int64_t incy,
-                                            float *a, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, double alpha, const double *x,
-                                            std::int64_t incx, const double *y, std::int64_t incy,
-                                            double *a,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, float alpha, const float *a,
-                                            std::int64_t lda, const float *x, std::int64_t incx,
-                                            float beta, float *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, double alpha, const double *a,
-                                            std::int64_t lda, const double *x, std::int64_t incx,
-                                            double beta, double *y, std::int64_t incy,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, float alpha, const float *x,
-                                           std::int64_t incx, float *a, std::int64_t lda,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                           std::int64_t n, double alpha, const double *x,
-                                           std::int64_t incx, double *a, std::int64_t lda,
-                                           const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, float alpha, const float *x,
-                                            std::int64_t incx, const float *y, std::int64_t incy,
-                                            float *a, std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            std::int64_t n, double alpha, const double *x,
-                                            std::int64_t incx, const double *y, std::int64_t incy,
-                                            double *a, std::int64_t lda,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_stbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const float *a, std::int64_t lda,
-                                            float *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const double *a, std::int64_t lda,
-                                            double *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const std::complex<float> *a,
-                                            std::int64_t lda, std::complex<float> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const std::complex<double> *a,
-                                            std::int64_t lda, std::complex<double> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_stbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const float *a, std::int64_t lda,
-                                            float *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const double *a, std::int64_t lda,
-                                            double *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const std::complex<float> *a,
-                                            std::int64_t lda, std::complex<float> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            std::int64_t k, const std::complex<double> *a,
-                                            std::int64_t lda, std::complex<double> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_stpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const float *a, float *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const double *a, double *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<float> *a, std::complex<float> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<double> *a, std::complex<double> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_stpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const float *a, float *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const double *a, double *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<float> *a, std::complex<float> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<double> *a, std::complex<double> *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_strmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const float *a, std::int64_t lda, float *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const double *a, std::int64_t lda, double *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<float> *a, std::int64_t lda,
-                                            std::complex<float> *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            std::complex<double> *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_strsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const float *a, std::int64_t lda, float *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const double *a, std::int64_t lda, double *x,
-                                            std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<float> *a, std::int64_t lda,
-                                            std::complex<float> *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t n,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            std::complex<double> *x, std::int64_t incx,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                            oneapi::mkl::transpose transb, std::int64_t m,
-                                            std::int64_t n, std::int64_t k, float alpha,
-                                            const float *a, std::int64_t lda, const float *b,
-                                            std::int64_t ldb, float beta, float *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                            oneapi::mkl::transpose transb, std::int64_t m,
-                                            std::int64_t n, std::int64_t k, double alpha,
-                                            const double *a, std::int64_t lda, const double *b,
-                                            std::int64_t ldb, double beta, double *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                            oneapi::mkl::transpose transb, std::int64_t m,
-                                            std::int64_t n, std::int64_t k,
-                                            std::complex<float> alpha, const std::complex<float> *a,
-                                            std::int64_t lda, const std::complex<float> *b,
-                                            std::int64_t ldb, std::complex<float> beta,
-                                            std::complex<float> *c, std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgemm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-        std::int64_t ldb, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_hgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                            oneapi::mkl::transpose transb, std::int64_t m,
-                                            std::int64_t n, std::int64_t k, sycl::half alpha,
-                                            const sycl::half *a, std::int64_t lda,
-                                            const sycl::half *b, std::int64_t ldb, sycl::half beta,
-                                            sycl::half *c, std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_f16f16f32_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-        std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c,
-        std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_bf16bf16f32_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const oneapi::mkl::bfloat16 *a,
-        std::int64_t lda, const oneapi::mkl::bfloat16 *b, std::int64_t ldb, float beta, float *c,
-        std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_chemm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zhemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                            std::int64_t n, std::complex<double> alpha,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            const std::complex<double> *b, std::int64_t ldb,
-                                            std::complex<double> beta, std::complex<double> *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans, std::int64_t n,
-                                            std::int64_t k, float alpha,
-                                            const std::complex<float> *a, std::int64_t lda,
-                                            float beta, std::complex<float> *c, std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans, std::int64_t n,
-                                            std::int64_t k, double alpha,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            double beta, std::complex<double> *c, std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cher2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                             oneapi::mkl::transpose trans, std::int64_t n,
-                                             std::int64_t k, std::complex<float> alpha,
-                                             const std::complex<float> *a, std::int64_t lda,
-                                             const std::complex<float> *b, std::int64_t ldb,
-                                             float beta, std::complex<float> *c, std::int64_t ldc,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zher2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                             oneapi::mkl::transpose trans, std::int64_t n,
-                                             std::int64_t k, std::complex<double> alpha,
-                                             const std::complex<double> *a, std::int64_t lda,
-                                             const std::complex<double> *b, std::int64_t ldb,
-                                             double beta, std::complex<double> *c, std::int64_t ldc,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                            std::int64_t n, float alpha, const float *a,
-                                            std::int64_t lda, const float *b, std::int64_t ldb,
-                                            float beta, float *c, std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                            std::int64_t n, double alpha, const double *a,
-                                            std::int64_t lda, const double *b, std::int64_t ldb,
-                                            double beta, double *c, std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_csymm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower, std::int64_t m,
-                                            std::int64_t n, std::complex<double> alpha,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            const std::complex<double> *b, std::int64_t ldb,
-                                            std::complex<double> beta, std::complex<double> *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans, std::int64_t n,
-                                            std::int64_t k, float alpha, const float *a,
-                                            std::int64_t lda, float beta, float *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans, std::int64_t n,
-                                            std::int64_t k, double alpha, const double *a,
-                                            std::int64_t lda, double beta, double *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_csyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans, std::int64_t n,
-                                            std::int64_t k, std::complex<float> alpha,
-                                            const std::complex<float> *a, std::int64_t lda,
-                                            std::complex<float> beta, std::complex<float> *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans, std::int64_t n,
-                                            std::int64_t k, std::complex<double> alpha,
-                                            const std::complex<double> *a, std::int64_t lda,
-                                            std::complex<double> beta, std::complex<double> *c,
-                                            std::int64_t ldc,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda,
-        float *beta, float **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda,
-        double *beta, double **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_csyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, std::complex<float> *alpha, const std::complex<float> **a,
-        std::int64_t *lda, std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-        std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zsyrk_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans,
-        std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-        const std::complex<double> **a, std::int64_t *lda, std::complex<double> *beta,
-        std::complex<double> **c, std::int64_t *ldc, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda,
-        std::int64_t stride_a, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda,
-        std::int64_t stride_a, double beta, double *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_csyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<float> beta, std::complex<float> *c,
-        std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zsyrk_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<double> alpha, const std::complex<double> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<double> beta, std::complex<double> *c,
-        std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ssyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                             oneapi::mkl::transpose trans, std::int64_t n,
-                                             std::int64_t k, float alpha, const float *a,
-                                             std::int64_t lda, const float *b, std::int64_t ldb,
-                                             float beta, float *c, std::int64_t ldc,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                             oneapi::mkl::transpose trans, std::int64_t n,
-                                             std::int64_t k, double alpha, const double *a,
-                                             std::int64_t lda, const double *b, std::int64_t ldb,
-                                             double beta, double *c, std::int64_t ldc,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_csyr2k_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-        std::int64_t n, std::int64_t k, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, const std::complex<float> *b, std::int64_t ldb, std::complex<float> beta,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                             oneapi::mkl::transpose trans, std::int64_t n,
-                                             std::int64_t k, std::complex<double> alpha,
-                                             const std::complex<double> *a, std::int64_t lda,
-                                             const std::complex<double> *b, std::int64_t ldb,
-                                             std::complex<double> beta, std::complex<double> *c,
-                                             std::int64_t ldc,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_strmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t m,
-                                            std::int64_t n, float alpha, const float *a,
-                                            std::int64_t lda, float *b, std::int64_t ldb,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtrmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t m,
-                                            std::int64_t n, double alpha, const double *a,
-                                            std::int64_t lda, double *b, std::int64_t ldb,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctrmm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::complex<float> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztrmm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::complex<double> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_strsm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t m,
-                                            std::int64_t n, float alpha, const float *a,
-                                            std::int64_t lda, float *b, std::int64_t ldb,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtrsm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right,
-                                            oneapi::mkl::uplo upper_lower,
-                                            oneapi::mkl::transpose trans,
-                                            oneapi::mkl::diag unit_diag, std::int64_t m,
-                                            std::int64_t n, double alpha, const double *a,
-                                            std::int64_t lda, double *b, std::int64_t ldb,
-                                            const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctrsm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::complex<float> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztrsm_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::complex<double> *b, std::int64_t ldb, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_strsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtrsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctrsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztrsm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower,
-        oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_strsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b,
-        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dtrsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, double **b,
-        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ctrsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, std::complex<float> *alpha, const std::complex<float> **a,
-        std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_ztrsm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower,
-        oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m,
-        std::int64_t *n, std::complex<double> *alpha, const std::complex<double> **a,
-        std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *group_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const float **a,
-        std::int64_t *lda, const float **b, std::int64_t *ldb, float *beta, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, const double **a,
-        std::int64_t *lda, const double **b, std::int64_t *ldb, double *beta, double **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<float> *alpha,
-        const std::complex<float> **a, std::int64_t *lda, const std::complex<float> **b,
-        std::int64_t *ldb, std::complex<float> *beta, std::complex<float> **c, std::int64_t *ldc,
-        std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex<double> *alpha,
-        const std::complex<double> **a, std::int64_t *lda, const std::complex<double> **b,
-        std::int64_t *ldb, std::complex<double> *beta, std::complex<double> **c, std::int64_t *ldc,
-        std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_hgemm_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, sycl::half *alpha, const sycl::half **a,
-        std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, sycl::half *beta,
-        sycl::half **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_f16f16f32_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const sycl::half **a,
-        std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, float *beta, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_s8s8f32_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a,
-        std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, float **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_s8s8s32_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb,
-        std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a,
-        std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c,
-        std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a,
-        std::int64_t lda, std::int64_t stride_a, const float *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a,
-        std::int64_t lda, std::int64_t stride_a, const double *b, std::int64_t ldb,
-        std::int64_t stride_b, double beta, double *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        const std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-        const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<float> beta, std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        const std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-        const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<double> beta, std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_hgemm_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a,
-        std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-        std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc,
-        std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_f16f16f32_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a,
-        std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_s8s8f32_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-        std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_s8s8s32_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a,
-        std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb,
-        std::int64_t stride_b, float beta, std::int32_t *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_sgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                             oneapi::mkl::transpose transa,
-                                             oneapi::mkl::transpose transb, std::int64_t n,
-                                             std::int64_t k, float alpha, const float *a,
-                                             std::int64_t lda, const float *b, std::int64_t ldb,
-                                             float beta, float *c, std::int64_t ldc,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower,
-                                             oneapi::mkl::transpose transa,
-                                             oneapi::mkl::transpose transb, std::int64_t n,
-                                             std::int64_t k, double alpha, const double *a,
-                                             std::int64_t lda, const double *b, std::int64_t ldb,
-                                             double beta, double *c, std::int64_t ldc,
-                                             const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cgemmt_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-        oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<float> alpha,
-        const std::complex<float> *a, std::int64_t lda, const std::complex<float> *b,
-        std::int64_t ldb, std::complex<float> beta, std::complex<float> *c, std::int64_t ldc,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zgemmt_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-        oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex<double> alpha,
-        const std::complex<double> *a, std::int64_t lda, const std::complex<double> *b,
-        std::int64_t ldb, std::complex<double> beta, std::complex<double> *c, std::int64_t ldc,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_s8u8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::uint8_t *b,
-        std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_s8s8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::int8_t *b,
-        std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_u8s8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::int8_t *b,
-        std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_gemm_u8u8s32_bias_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha,
-        const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::uint8_t *b,
-        std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc,
-        const std::int32_t *co, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_somatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_domatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b,
-        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_comatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, const std::complex<float> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zomatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, const std::complex<double> *a, std::int64_t lda,
-        std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_simatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        float alpha, float *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dimatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        double alpha, double *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cimatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<float> alpha, std::complex<float> *ab, std::int64_t lda, std::int64_t ldb,
-        std::int64_t stride, std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zimatcopy_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-        std::complex<double> alpha, std::complex<double> *ab, std::int64_t lda, std::int64_t ldb,
-        std::int64_t stride, std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_somatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda,
-        std::int64_t stride_a, float beta, const float *b, std::int64_t ldb, std::int64_t stride_b,
-        float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_domatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda,
-        std::int64_t stride_a, double beta, const double *b, std::int64_t ldb,
-        std::int64_t stride_b, double *c, std::int64_t ldc, std::int64_t stride_c,
-        std::int64_t batch_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_comatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<float> beta,
-        const std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<float> *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zomatadd_batch_strided_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<double> alpha, const std::complex<double> *a,
-        std::int64_t lda, std::int64_t stride_a, std::complex<double> beta,
-        const std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::complex<double> *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_somatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n, float alpha,
-                                                const float *a, std::int64_t lda, float *b,
-                                                std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_domatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n, double alpha,
-                                                const double *a, std::int64_t lda, double *b,
-                                                std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_comatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n,
-                                                std::complex<float> alpha,
-                                                const std::complex<float> *a, std::int64_t lda,
-                                                std::complex<float> *b, std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zomatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n,
-                                                std::complex<double> alpha,
-                                                const std::complex<double> *a, std::int64_t lda,
-                                                std::complex<double> *b, std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_somatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t m, std::int64_t n, float alpha,
-                                                 const float *a, std::int64_t lda,
-                                                 std::int64_t stridea, float *b, std::int64_t ldb,
-                                                 std::int64_t strideb,
-                                                 const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_domatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t m, std::int64_t n, double alpha,
-                                                 const double *a, std::int64_t lda,
-                                                 std::int64_t stridea, double *b, std::int64_t ldb,
-                                                 std::int64_t strideb,
-                                                 const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_comatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t m, std::int64_t n,
-                                                 std::complex<float> alpha,
-                                                 const std::complex<float> *a, std::int64_t lda,
-                                                 std::int64_t stridea, std::complex<float> *b,
-                                                 std::int64_t ldb, std::int64_t strideb,
-                                                 const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zomatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t m, std::int64_t n,
-                                                 std::complex<double> alpha,
-                                                 const std::complex<double> *a, std::int64_t lda,
-                                                 std::int64_t stridea, std::complex<double> *b,
-                                                 std::int64_t ldb, std::int64_t strideb,
-                                                 const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_simatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n, float alpha,
-                                                float *ab, std::int64_t lda, std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n, double alpha,
-                                                double *ab, std::int64_t lda, std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n,
-                                                std::complex<float> alpha, std::complex<float> *ab,
-                                                std::int64_t lda, std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t m, std::int64_t n,
-                                                std::complex<double> alpha,
-                                                std::complex<double> *ab, std::int64_t lda,
-                                                std::int64_t ldb,
-                                                const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_somatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, float alpha, const float *a,
-                                               std::int64_t lda, float beta, const float *b,
-                                               std::int64_t ldb, float *c, std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_domatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, double alpha, const double *a,
-                                               std::int64_t lda, double beta, const double *b,
-                                               std::int64_t ldb, double *c, std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_comatadd_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb,
-        std::int64_t m, std::int64_t n, std::complex<float> alpha, const std::complex<float> *a,
-        std::int64_t lda, std::complex<float> beta, const std::complex<float> *b, std::int64_t ldb,
-        std::complex<float> *c, std::int64_t ldc, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zomatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa,
-                                               oneapi::mkl::transpose transb, std::int64_t m,
-                                               std::int64_t n, std::complex<double> alpha,
-                                               const std::complex<double> *a, std::int64_t lda,
-                                               std::complex<double> beta,
-                                               const std::complex<double> *b, std::int64_t ldb,
-                                               std::complex<double> *c, std::int64_t ldc,
-                                               const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_somatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        float *alpha, const float **a, std::int64_t *lda, float **b, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_domatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        double *alpha, const double **a, std::int64_t *lda, double **b, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_comatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<float> *alpha, const std::complex<float> **a, std::int64_t *lda,
-        std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zomatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<double> *alpha, const std::complex<double> **a, std::int64_t *lda,
-        std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_simatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        float *alpha, float **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_dimatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        double *alpha, double **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-        std::int64_t *groupsize, const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_cimatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<float> *alpha, std::complex<float> **ab, std::int64_t *lda, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*row_major_zimatcopy_batch_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n,
-        std::complex<double> *alpha, std::complex<double> **ab, std::int64_t *lda,
-        std::int64_t *ldb, std::int64_t group_count, std::int64_t *groupsize,
-        const std::vector<sycl::event> &dependencies);
-
-} blas_function_table_t;
-
-#endif //_BLAS_FUNCTION_TABLE_HPP_
diff --git a/src/config.hpp.in b/src/config.hpp.in
deleted file mode 100644
index 5698abf9b..000000000
--- a/src/config.hpp.in
+++ /dev/null
@@ -1,44 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef ONEMKL_CONFIG_H
-#define ONEMKL_CONFIG_H
-
-#cmakedefine ENABLE_CUBLAS_BACKEND
-#cmakedefine ENABLE_CUFFT_BACKEND
-#cmakedefine ENABLE_CURAND_BACKEND
-#cmakedefine ENABLE_CUSOLVER_BACKEND
-#cmakedefine ENABLE_MKLCPU_BACKEND
-#cmakedefine ENABLE_MKLGPU_BACKEND
-#cmakedefine ENABLE_NETLIB_BACKEND
-#cmakedefine ENABLE_PORTBLAS_BACKEND
-#cmakedefine ENABLE_PORTBLAS_BACKEND_AMD_GPU
-#cmakedefine ENABLE_PORTBLAS_BACKEND_INTEL_CPU
-#cmakedefine ENABLE_PORTBLAS_BACKEND_INTEL_GPU
-#cmakedefine ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU
-#cmakedefine ENABLE_PORTFFT_BACKEND
-#cmakedefine ENABLE_ROCBLAS_BACKEND
-#cmakedefine ENABLE_ROCFFT_BACKEND
-#cmakedefine ENABLE_ROCRAND_BACKEND
-#cmakedefine ENABLE_ROCSOLVER_BACKEND
-#cmakedefine BUILD_SHARED_LIBS
-#cmakedefine REF_BLAS_LIBNAME "@REF_BLAS_LIBNAME@"
-#cmakedefine REF_CBLAS_LIBNAME "@REF_CBLAS_LIBNAME@"
-
-#endif
diff --git a/src/dft/CMakeLists.txt b/src/dft/CMakeLists.txt
deleted file mode 100644
index e3b373645..000000000
--- a/src/dft/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build backends
-add_subdirectory(backends)
-
-# Recipe for DFT loader object
-if(BUILD_SHARED_LIBS)
-add_library(onemkl_dft OBJECT)
-target_sources(onemkl_dft PRIVATE backends/descriptor.cpp dft_loader.cpp)
-target_include_directories(onemkl_dft
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-          $<TARGET_FILE_DIR:onemkl>
-)
-
-target_compile_options(onemkl_dft PRIVATE ${ONEMKL_BUILD_COPT})
-
-set_target_properties(onemkl_dft PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET onemkl_dft SOURCES backends/descriptor.cxx dft_loader.cpp)
-else()
-  target_link_libraries(onemkl_dft PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
-include(WarningsUtils)
-target_link_libraries(onemkl_dft PRIVATE onemkl_warnings)
-
-endif()
diff --git a/src/dft/backends/CMakeLists.txt b/src/dft/backends/CMakeLists.txt
deleted file mode 100644
index b03a63e8a..000000000
--- a/src/dft/backends/CMakeLists.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_custom_target(onemkl_backend_libs_dft)
-add_dependencies(onemkl_backend_libs onemkl_backend_libs_dft)
-
-if(ENABLE_MKLGPU_BACKEND)
-  add_subdirectory(mklgpu)
-endif()
-
-if(ENABLE_MKLCPU_BACKEND)
-  add_subdirectory(mklcpu)
-endif()
-
-if(ENABLE_CUFFT_BACKEND)
-  add_subdirectory(cufft)
-endif()
-
-if(ENABLE_ROCFFT_BACKEND)
-  add_subdirectory(rocfft)
-endif()
-
-if(ENABLE_PORTFFT_BACKEND)
-  add_subdirectory(portfft)
-endif()
diff --git a/src/dft/backends/backend_backward_instantiations.cxx b/src/dft/backends/backend_backward_instantiations.cxx
deleted file mode 100644
index a6aeaf71b..000000000
--- a/src/dft/backends/backend_backward_instantiations.cxx
+++ /dev/null
@@ -1,58 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-using desc_rf_t =
-    dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::REAL>;
-using desc_cf_t =
-    dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>;
-using desc_rd_t =
-    dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>;
-using desc_cd_t =
-    dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>;
-using depends_vec_t = const std::vector<sycl::event> &;
-
-#define ONEMKL_DFT_BACKWARD_INSTANTIATIONS(DESCRIPTOR_T, SCALAR_T, FORWARD_T, BACKWARD_T)          \
-    /* Buffer API */                                                                               \
-    template ONEMKL_EXPORT void compute_backward<DESCRIPTOR_T>(DESCRIPTOR_T &,                     \
-                                                               sycl::buffer<FORWARD_T> &);         \
-    template ONEMKL_EXPORT void compute_backward<DESCRIPTOR_T>(                                    \
-        DESCRIPTOR_T &, sycl::buffer<SCALAR_T> &, sycl::buffer<SCALAR_T> &);                       \
-    template ONEMKL_EXPORT void compute_backward<DESCRIPTOR_T>(                                    \
-        DESCRIPTOR_T &, sycl::buffer<BACKWARD_T> &, sycl::buffer<FORWARD_T> &);                    \
-    template ONEMKL_EXPORT void compute_backward<DESCRIPTOR_T>(                                    \
-        DESCRIPTOR_T &, sycl::buffer<SCALAR_T> &, sycl::buffer<SCALAR_T> &,                        \
-        sycl::buffer<SCALAR_T> &, sycl::buffer<SCALAR_T> &);                                       \
-                                                                                                   \
-    /* USM API */                                                                                  \
-    template ONEMKL_EXPORT sycl::event compute_backward<DESCRIPTOR_T>(DESCRIPTOR_T &, FORWARD_T *, \
-                                                                      depends_vec_t);              \
-    template ONEMKL_EXPORT sycl::event compute_backward<DESCRIPTOR_T>(DESCRIPTOR_T &, SCALAR_T *,  \
-                                                                      SCALAR_T *, depends_vec_t);  \
-    template ONEMKL_EXPORT sycl::event compute_backward<DESCRIPTOR_T>(                             \
-        DESCRIPTOR_T &, BACKWARD_T *, FORWARD_T *, depends_vec_t);                                 \
-    template ONEMKL_EXPORT sycl::event compute_backward<DESCRIPTOR_T>(                             \
-        DESCRIPTOR_T &, SCALAR_T *, SCALAR_T *, SCALAR_T *, SCALAR_T *, depends_vec_t);
-
-ONEMKL_DFT_BACKWARD_INSTANTIATIONS(desc_rf_t, float, float, std::complex<float>)
-ONEMKL_DFT_BACKWARD_INSTANTIATIONS(desc_cf_t, float, std::complex<float>, std::complex<float>)
-ONEMKL_DFT_BACKWARD_INSTANTIATIONS(desc_rd_t, double, double, std::complex<double>)
-ONEMKL_DFT_BACKWARD_INSTANTIATIONS(desc_cd_t, double, std::complex<double>, std::complex<double>)
-
-#undef ONEMKL_DFT_BACKWARD_INSTANTIATIONS
-#undef ONEMKL_DFT_BACKWARD_INSTANTIATIONS_REAL_ONLY
diff --git a/src/dft/backends/backend_compute_signature.cxx b/src/dft/backends/backend_compute_signature.cxx
deleted file mode 100644
index d011cb995..000000000
--- a/src/dft/backends/backend_compute_signature.cxx
+++ /dev/null
@@ -1,137 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-repetitive definitions from commit.cpp.
-
-This file should be included for each backend, with <BACKEND> defined to match
-the namespace of the backend's implementation.
-*/
-
-using fwd_type = typename dft::detail::commit_impl<prec, dom>::fwd_type;
-using bwd_type = typename dft::detail::commit_impl<prec, dom>::bwd_type;
-using descriptor_type = typename dft::detail::descriptor<prec, dom>;
-
-// forward inplace COMPLEX_COMPLEX
-void forward_ip_cc(descriptor_type& desc, sycl::buffer<fwd_type, 1>& inout) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<fwd_type, 1>>(
-        "compute_forward");
-    oneapi::mkl::dft::BACKEND::compute_forward(desc, inout);
-}
-sycl::event forward_ip_cc(descriptor_type& desc, fwd_type* inout,
-                          const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<fwd_type*>("compute_forward");
-    return oneapi::mkl::dft::BACKEND::compute_forward(desc, inout, dependencies);
-}
-
-// forward inplace REAL_REAL
-void forward_ip_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>& inout_re,
-                   sycl::buffer<scalar_type, 1>& inout_im) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-        "compute_forward");
-    oneapi::mkl::dft::BACKEND::compute_forward(desc, inout_re, inout_im);
-}
-sycl::event forward_ip_rr(descriptor_type& desc, scalar_type* inout_re, scalar_type* inout_im,
-                          const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>("compute_forward");
-    return oneapi::mkl::dft::BACKEND::compute_forward(desc, inout_re, inout_im, dependencies);
-}
-
-// forward out-of-place COMPLEX_COMPLEX
-void forward_op_cc(descriptor_type& desc, sycl::buffer<fwd_type, 1>& in,
-                   sycl::buffer<bwd_type, 1>& out) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<fwd_type, 1>>(
-        "compute_forward");
-    oneapi::mkl::dft::BACKEND::compute_forward<descriptor_type>(desc, in, out);
-}
-sycl::event forward_op_cc(descriptor_type& desc, fwd_type* in, bwd_type* out,
-                          const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<fwd_type*>("compute_forward");
-    return oneapi::mkl::dft::BACKEND::compute_forward<descriptor_type>(desc, in, out, dependencies);
-}
-
-// forward out-of-place REAL_REAL
-void forward_op_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>& in_re,
-                   sycl::buffer<scalar_type, 1>& in_im, sycl::buffer<scalar_type, 1>& out_re,
-                   sycl::buffer<scalar_type, 1>& out_im) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-        "compute_forward");
-    oneapi::mkl::dft::BACKEND::compute_forward(desc, in_re, in_im, out_re, out_im);
-}
-sycl::event forward_op_rr(descriptor_type& desc, scalar_type* in_re, scalar_type* in_im,
-                          scalar_type* out_re, scalar_type* out_im,
-                          const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>("compute_forward");
-    return oneapi::mkl::dft::BACKEND::compute_forward(desc, in_re, in_im, out_re, out_im,
-                                                      dependencies);
-}
-
-// backward inplace COMPLEX_COMPLEX
-void backward_ip_cc(descriptor_type& desc, sycl::buffer<fwd_type, 1>& inout) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<fwd_type, 1>>(
-        "compute_backward");
-    oneapi::mkl::dft::BACKEND::compute_backward(desc, inout);
-}
-sycl::event backward_ip_cc(descriptor_type& desc, fwd_type* inout,
-                           const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<fwd_type*>("compute_backward");
-    return oneapi::mkl::dft::BACKEND::compute_backward(desc, inout, dependencies);
-}
-
-// backward inplace REAL_REAL
-void backward_ip_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>& inout_re,
-                    sycl::buffer<scalar_type, 1>& inout_im) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-        "compute_backward");
-    oneapi::mkl::dft::BACKEND::compute_backward(desc, inout_re, inout_im);
-}
-sycl::event backward_ip_rr(descriptor_type& desc, scalar_type* inout_re, scalar_type* inout_im,
-                           const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>("compute_backward");
-    return oneapi::mkl::dft::BACKEND::compute_backward(desc, inout_re, inout_im, dependencies);
-}
-
-// backward out-of-place COMPLEX_COMPLEX
-void backward_op_cc(descriptor_type& desc, sycl::buffer<bwd_type, 1>& in,
-                    sycl::buffer<fwd_type, 1>& out) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<bwd_type, 1>>(
-        "compute_backward");
-    oneapi::mkl::dft::BACKEND::compute_backward(desc, in, out);
-}
-sycl::event backward_op_cc(descriptor_type& desc, bwd_type* in, fwd_type* out,
-                           const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<bwd_type*>("compute_backward");
-    return oneapi::mkl::dft::BACKEND::compute_backward(desc, in, out, dependencies);
-}
-
-// backward out-of-place REAL_REAL
-void backward_op_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>& in_re,
-                    sycl::buffer<scalar_type, 1>& in_im, sycl::buffer<scalar_type, 1>& out_re,
-                    sycl::buffer<scalar_type, 1>& out_im) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-        "compute_backward");
-    oneapi::mkl::dft::BACKEND::compute_backward(desc, in_re, in_im, out_re, out_im);
-}
-sycl::event backward_op_rr(descriptor_type& desc, scalar_type* in_re, scalar_type* in_im,
-                           scalar_type* out_re, scalar_type* out_im,
-                           const std::vector<sycl::event>& dependencies) override {
-    dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>("compute_backward");
-    return oneapi::mkl::dft::BACKEND::compute_backward(desc, in_re, in_im, out_re, out_im,
-                                                       dependencies);
-}
diff --git a/src/dft/backends/backend_forward_instantiations.cxx b/src/dft/backends/backend_forward_instantiations.cxx
deleted file mode 100644
index a6ed371d5..000000000
--- a/src/dft/backends/backend_forward_instantiations.cxx
+++ /dev/null
@@ -1,58 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-using desc_rf_t =
-    dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::REAL>;
-using desc_cf_t =
-    dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>;
-using desc_rd_t =
-    dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>;
-using desc_cd_t =
-    dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>;
-using depends_vec_t = const std::vector<sycl::event> &;
-
-#define ONEMKL_DFT_FORWARD_INSTANTIATIONS(DESCRIPTOR_T, SCALAR_T, FORWARD_T, BACKWARD_T)           \
-    /* Buffer API */                                                                               \
-    template ONEMKL_EXPORT void compute_forward<DESCRIPTOR_T>(DESCRIPTOR_T &,                      \
-                                                              sycl::buffer<FORWARD_T> &);          \
-    template ONEMKL_EXPORT void compute_forward<DESCRIPTOR_T>(                                     \
-        DESCRIPTOR_T &, sycl::buffer<SCALAR_T> &, sycl::buffer<SCALAR_T> &);                       \
-    template ONEMKL_EXPORT void compute_forward<DESCRIPTOR_T>(                                     \
-        DESCRIPTOR_T &, sycl::buffer<FORWARD_T> &, sycl::buffer<BACKWARD_T> &);                    \
-    template ONEMKL_EXPORT void compute_forward<DESCRIPTOR_T>(                                     \
-        DESCRIPTOR_T &, sycl::buffer<SCALAR_T> &, sycl::buffer<SCALAR_T> &,                        \
-        sycl::buffer<SCALAR_T> &, sycl::buffer<SCALAR_T> &);                                       \
-                                                                                                   \
-    /* USM API */                                                                                  \
-    template ONEMKL_EXPORT sycl::event compute_forward<DESCRIPTOR_T>(DESCRIPTOR_T &, FORWARD_T *,  \
-                                                                     depends_vec_t);               \
-    template ONEMKL_EXPORT sycl::event compute_forward<DESCRIPTOR_T>(DESCRIPTOR_T &, SCALAR_T *,   \
-                                                                     SCALAR_T *, depends_vec_t);   \
-    template ONEMKL_EXPORT sycl::event compute_forward<DESCRIPTOR_T>(DESCRIPTOR_T &, FORWARD_T *,  \
-                                                                     BACKWARD_T *, depends_vec_t); \
-    template ONEMKL_EXPORT sycl::event compute_forward<DESCRIPTOR_T>(                              \
-        DESCRIPTOR_T &, SCALAR_T *, SCALAR_T *, SCALAR_T *, SCALAR_T *, depends_vec_t);
-
-ONEMKL_DFT_FORWARD_INSTANTIATIONS(desc_rf_t, float, float, std::complex<float>)
-ONEMKL_DFT_FORWARD_INSTANTIATIONS(desc_cf_t, float, std::complex<float>, std::complex<float>)
-ONEMKL_DFT_FORWARD_INSTANTIATIONS(desc_rd_t, double, double, std::complex<double>)
-ONEMKL_DFT_FORWARD_INSTANTIATIONS(desc_cd_t, double, std::complex<double>, std::complex<double>)
-
-#undef ONEMKL_DFT_FORWARD_INSTANTIATIONS
-#undef ONEMKL_DFT_FORWARD_INSTANTIATIONS_REAL_ONLY
diff --git a/src/dft/backends/backend_wrappers.cxx b/src/dft/backends/backend_wrappers.cxx
deleted file mode 100644
index 5d0d2bddc..000000000
--- a/src/dft/backends/backend_wrappers.cxx
+++ /dev/null
@@ -1,46 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-This file lists functions matching those required by dft_function_table_t in 
-src/dft/function_table.hpp.
-
-To use this:
-
-#define WRAPPER_VERSION <Wrapper version number>
-#define BACKEND         <Backend name eg. mklgpu>
-
-extern "C" dft_function_table_t mkl_dft_table = {
-    WRAPPER_VERSION,
-#include "dft/backends/backend_wrappers.cxx"
-};
-
-Changes to this file should be matched to changes in function_table.hpp. The required 
-function template instantiations must be added to backend_backward_instantiations.cxx 
-and backend_forward_instantiations.cxx.
-*/
-
-// clang-format off
-oneapi::mkl::dft::BACKEND::create_commit,
-oneapi::mkl::dft::BACKEND::create_commit,
-oneapi::mkl::dft::BACKEND::create_commit,
-oneapi::mkl::dft::BACKEND::create_commit,
-// clang-format on
-
-#undef ONEAPI_MKL_DFT_BACKEND_SIGNATURES
diff --git a/src/dft/backends/cufft/CMakeLists.txt b/src/dft/backends/cufft/CMakeLists.txt
deleted file mode 100644
index 010905546..000000000
--- a/src/dft/backends/cufft/CMakeLists.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-#===============================================================================
-# Copyright Codeplay Software Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_dft_cufft)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  descriptor.cpp
-  commit.cpp
-  forward.cpp
-  backward.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_dft_cufft_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_dft ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_NAME}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if (${CMAKE_VERSION} VERSION_LESS "3.17.0")
-  find_package(CUDA REQUIRED)
-  target_include_directories(${LIB_OBJ} PRIVATE ${CUDA_INCLUDE_DIRS})
-  target_link_libraries(${LIB_OBJ} PRIVATE cuda ${CUDA_CUFFT_LIBRARIES})
-else()
-  find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(${LIB_OBJ} PRIVATE CUDA::cufft CUDA::cuda_driver)
-endif()
-
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL)
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/dft/backends/cufft/backward.cpp b/src/dft/backends/cufft/backward.cpp
deleted file mode 100644
index aea9f232f..000000000
--- a/src/dft/backends/cufft/backward.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-
-#include "execute_helper.hpp"
-
-#include <cufft.h>
-
-namespace oneapi::mkl::dft::cufft {
-namespace detail {
-//forward declaration
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_bwd(dft::detail::commit_impl<prec, dom> *commit);
-
-template <dft::precision prec, dft::domain dom>
-cufftHandle get_bwd_plan(dft::detail::commit_impl<prec, dom> *commit) {
-    return static_cast<std::optional<cufftHandle> *>(commit->get_handle())[1].value();
-}
-} // namespace detail
-// BUFFER version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    const std::string func_name = "compute_backward(desc, inout)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        offsets[0] *= 2; // offset is supplied in complex but we offset scalar pointer
-        if (offsets[1] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-    }
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto inout_acc = inout.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            auto inout_native = reinterpret_cast<fwd<descriptor_type> *>(
-                ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(inout_acc));
-            detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
-                func_name, stream, plan, reinterpret_cast<void *>(inout_native + offsets[0]),
-                reinterpret_cast<void *>(inout_native + offsets[1]));
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &, sycl::buffer<scalar<descriptor_type>, 1> &,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &) {
-    throw oneapi::mkl::unimplemented("DFT", "compute_backward(desc, inout_re, inout_im)",
-                                     "cuFFT does not support real-real complex storage.");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<bwd<descriptor_type>, 1> &in,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &out) {
-    const std::string func_name = "compute_backward(desc, in, out)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        if (offsets[1] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-    }
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto in_acc = in.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_acc = out.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            auto in_native = reinterpret_cast<void *>(
-                reinterpret_cast<bwd<descriptor_type> *>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(in_acc)) +
-                offsets[0]);
-            auto out_native = reinterpret_cast<void *>(
-                reinterpret_cast<fwd<descriptor_type> *>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(out_acc)) +
-                offsets[1]);
-            detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
-                func_name, stream, plan, in_native, out_native);
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &, sycl::buffer<scalar<descriptor_type>, 1> &,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &) {
-    throw oneapi::mkl::unimplemented("DFT", "compute_backward(desc, in_re, in_im, out_re, out_im)",
-                                     "cuFFT does not support real-real complex storage.");
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                           const std::vector<sycl::event> &dependencies) {
-    const std::string func_name = "compute_backward(desc, inout, dependencies)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        offsets[0] *= 2; // offset is supplied in complex but we offset scalar pointer
-        if (offsets[1] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-    }
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
-                func_name, stream, plan, inout + offsets[0], inout + offsets[1]);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &, scalar<descriptor_type> *,
-                                           scalar<descriptor_type> *,
-                                           const std::vector<sycl::event> &) {
-    throw oneapi::mkl::unimplemented("DFT",
-                                     "compute_backward(desc, inout_re, inout_im, dependencies)",
-                                     "cuFFT does not support real-real complex storage.");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd<descriptor_type> *in,
-                                           fwd<descriptor_type> *out,
-                                           const std::vector<sycl::event> &dependencies) {
-    const std::string func_name = "compute_backward(desc, in, out, dependencies)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        if (offsets[1] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-    }
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            detail::cufft_execute<detail::Direction::Backward, fwd<descriptor_type>>(
-                func_name, stream, plan, in + offsets[0], out + offsets[1]);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &, scalar<descriptor_type> *,
-                                           scalar<descriptor_type> *, scalar<descriptor_type> *,
-                                           scalar<descriptor_type> *,
-                                           const std::vector<sycl::event> &) {
-    throw oneapi::mkl::unimplemented("DFT",
-                                     "compute_backward(desc, in_re, in_im, out_re, out_im, deps)",
-                                     "cuFFT does not support real-real complex storage.");
-}
-
-// Template function instantiations
-#include "dft/backends/backend_backward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::cufft
diff --git a/src/dft/backends/cufft/commit.cpp b/src/dft/backends/cufft/commit.cpp
deleted file mode 100644
index faf4332c0..000000000
--- a/src/dft/backends/cufft/commit.cpp
+++ /dev/null
@@ -1,462 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <array>
-#include <algorithm>
-#include <optional>
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-#include "oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-
-#include "../stride_helper.hpp"
-
-#include <cufft.h>
-#include <cuda.h>
-
-namespace oneapi::mkl::dft::cufft {
-namespace detail {
-
-/// Commit impl class specialization for cuFFT.
-template <dft::precision prec, dft::domain dom>
-class cufft_commit final : public dft::detail::commit_impl<prec, dom> {
-private:
-    using scalar_type = typename dft::detail::commit_impl<prec, dom>::scalar_type;
-
-    // For real to complex transforms, the "type" arg also encodes the direction (e.g. CUFFT_R2C vs CUFFT_C2R) in the plan so we must have one for each direction.
-    // We also need this because oneMKL uses a directionless "FWD_DISTANCE" and "BWD_DISTANCE" while cuFFT uses a directional "idist" and "odist".
-    // plans[0] is forward, plans[1] is backward
-    std::array<std::optional<cufftHandle>, 2> plans = { std::nullopt, std::nullopt };
-    std::int64_t offset_fwd_in, offset_fwd_out, offset_bwd_in, offset_bwd_out;
-
-public:
-    cufft_commit(sycl::queue& queue, const dft::detail::dft_values<prec, dom>& config_values)
-            : oneapi::mkl::dft::detail::commit_impl<prec, dom>(queue, backend::cufft,
-                                                               config_values) {
-        if constexpr (prec == dft::detail::precision::DOUBLE) {
-            if (!queue.get_device().has(sycl::aspect::fp64)) {
-                throw mkl::exception("DFT", "commit", "Device does not support double precision.");
-            }
-        }
-    }
-
-    void clean_plans() {
-        auto fix_context = plans[0].has_value() || plans[1].has_value();
-        if (plans[0]) {
-            if (cufftDestroy(plans[0].value()) != CUFFT_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", __FUNCTION__,
-                                     "Failed to destroy forward cuFFT plan.");
-            }
-            plans[0] = std::nullopt;
-        }
-        if (plans[1]) {
-            if (cufftDestroy(plans[1].value()) != CUFFT_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", __FUNCTION__,
-                                     "Failed to destroy backward cuFFT plan.");
-            }
-            plans[1] = std::nullopt;
-        }
-        if (fix_context) {
-            // cufftDestroy changes the context so change it back.
-            CUdevice interopDevice =
-                sycl::get_native<sycl::backend::ext_oneapi_cuda>(this->get_queue().get_device());
-            CUcontext interopContext;
-            if (cuDevicePrimaryCtxRetain(&interopContext, interopDevice) != CUDA_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", __FUNCTION__,
-                                     "Failed to change cuda context.");
-            }
-        }
-    }
-
-    void commit(const dft::detail::dft_values<prec, dom>& config_values) override {
-        // this could be a recommit
-        this->external_workspace_helper_ =
-            oneapi::mkl::dft::detail::external_workspace_helper<prec, dom>(
-                config_values.workspace_placement ==
-                oneapi::mkl::dft::detail::config_value::WORKSPACE_EXTERNAL);
-        clean_plans();
-
-        if (config_values.fwd_scale != 1.0 || config_values.bwd_scale != 1.0) {
-            throw mkl::unimplemented(
-                "dft/backends/cufft", __FUNCTION__,
-                "cuFFT does not support values other than 1 for FORWARD/BACKWARD_SCALE");
-        }
-
-        // The cudaStream for the plan is set at execution time so the interop handler can pick the stream.
-        constexpr cufftType fwd_type = [] {
-            if constexpr (dom == dft::domain::COMPLEX) {
-                if constexpr (prec == dft::precision::SINGLE) {
-                    return CUFFT_C2C;
-                }
-                else {
-                    return CUFFT_Z2Z;
-                }
-            }
-            else {
-                if constexpr (prec == dft::precision::SINGLE) {
-                    return CUFFT_R2C;
-                }
-                else {
-                    return CUFFT_D2Z;
-                }
-            }
-        }();
-        constexpr cufftType bwd_type = [] {
-            if constexpr (dom == dft::domain::COMPLEX) {
-                if constexpr (prec == dft::precision::SINGLE) {
-                    return CUFFT_C2C;
-                }
-                else {
-                    return CUFFT_Z2Z;
-                }
-            }
-            else {
-                if constexpr (prec == dft::precision::SINGLE) {
-                    return CUFFT_C2R;
-                }
-                else {
-                    return CUFFT_Z2D;
-                }
-            }
-        }();
-
-        constexpr std::size_t max_supported_dims = 3;
-        std::array<int, max_supported_dims> n_copy;
-        std::copy(config_values.dimensions.begin(), config_values.dimensions.end(), n_copy.data());
-        const int rank = static_cast<int>(config_values.dimensions.size());
-
-        auto stride_api_choice = dft::detail::get_stride_api(config_values);
-        dft::detail::throw_on_invalid_stride_api("CUFFT commit", stride_api_choice);
-        dft::detail::stride_vectors<int> stride_vecs(config_values, stride_api_choice);
-        offset_fwd_in = stride_vecs.offset_fwd_in;
-        offset_fwd_out = stride_vecs.offset_fwd_out;
-        offset_bwd_in = stride_vecs.offset_bwd_in;
-        offset_bwd_out = stride_vecs.offset_bwd_out;
-
-        // cufft ignores the first value in inembed and onembed, so there is no harm in putting offset there
-        auto a_min = std::min_element(stride_vecs.vec_a.begin() + 1, stride_vecs.vec_a.end());
-        auto b_min = std::min_element(stride_vecs.vec_b.begin() + 1, stride_vecs.vec_b.end());
-        if constexpr (dom == dft::domain::REAL) {
-            if ((a_min != stride_vecs.vec_a.begin() + rank) ||
-                (b_min != stride_vecs.vec_b.begin() + rank)) {
-                throw mkl::unimplemented(
-                    "dft/backends/cufft", __FUNCTION__,
-                    "cufft requires the last stride to be the the smallest one for real transforms!");
-            }
-        }
-        else {
-            if (a_min - stride_vecs.vec_a.begin() != b_min - stride_vecs.vec_b.begin()) {
-                throw mkl::unimplemented(
-                    "dft/backends/cufft", __FUNCTION__,
-                    "cufft requires that if ordered by stride length, the order of strides is the same for input and output strides!");
-            }
-        }
-        const int a_stride = static_cast<int>(*a_min);
-        const int b_stride = static_cast<int>(*b_min);
-        stride_vecs.vec_a.erase(a_min);
-        stride_vecs.vec_b.erase(b_min);
-        int fwd_istride = a_stride;
-        int fwd_ostride = b_stride;
-        int bwd_istride =
-            stride_api_choice == dft::detail::stride_api::FB_STRIDES ? b_stride : a_stride;
-        int bwd_ostride =
-            stride_api_choice == dft::detail::stride_api::FB_STRIDES ? a_stride : b_stride;
-        if (a_min - stride_vecs.vec_a.begin() != rank) {
-            // swap dimensions to have the last one have the smallest stride
-            std::swap(n_copy[a_min - stride_vecs.vec_a.begin() - 1], n_copy[rank - 1]);
-        }
-        for (int i = 1; i < rank; i++) {
-            if ((stride_vecs.vec_a[i] % a_stride != 0) || (stride_vecs.vec_b[i] % b_stride != 0)) {
-                throw mkl::unimplemented(
-                    "dft/backends/cufft", __FUNCTION__,
-                    "cufft requires a stride to be divisible by all smaller strides!");
-            }
-            stride_vecs.vec_a[i] /= a_stride;
-            stride_vecs.vec_b[i] /= b_stride;
-        }
-        if (rank > 2) {
-            if (stride_vecs.vec_a[1] > stride_vecs.vec_a[2] &&
-                stride_vecs.vec_b[1] < stride_vecs.vec_b[2]) {
-                throw mkl::unimplemented(
-                    "dft/backends/cufft", __FUNCTION__,
-                    "cufft requires that if ordered by stride length, the order of strides is the same for input and output strides!");
-            }
-            else if (stride_vecs.vec_a[1] < stride_vecs.vec_a[2] &&
-                     stride_vecs.vec_b[1] < stride_vecs.vec_b[2]) {
-                // swap dimensions to have the first one have the biggest stride
-                std::swap(stride_vecs.vec_a[1], stride_vecs.vec_a[2]);
-                std::swap(stride_vecs.vec_b[1], stride_vecs.vec_b[2]);
-                std::swap(n_copy[0], n_copy[1]);
-            }
-            if ((stride_vecs.vec_a[1] % stride_vecs.vec_a[2] != 0) ||
-                (stride_vecs.vec_b[1] % stride_vecs.vec_b[2] != 0)) {
-                throw mkl::unimplemented(
-                    "dft/backends/cufft", __FUNCTION__,
-                    "cufft requires a stride to be divisible by all smaller strides!");
-            }
-            stride_vecs.vec_a[1] /= stride_vecs.vec_a[2];
-            stride_vecs.vec_b[1] /= stride_vecs.vec_b[2];
-        }
-        const int batch = static_cast<int>(config_values.number_of_transforms);
-        const int fwd_dist = static_cast<int>(config_values.fwd_dist);
-        const int bwd_dist = static_cast<int>(config_values.bwd_dist);
-
-        // When creating real-complex descriptions, the strides will always be wrong for one of the directions.
-        // This is because the least significant dimension is symmetric.
-        // If the strides are invalid (too small to fit) then just don't bother creating the plan
-        auto check_stride_validity = [&](auto strides_fwd, auto strides_bwd) {
-            int inner_nfwd = n_copy[rank - 1]; // inner dimensions of DFT
-            // Complex data is stored conjugate even for real domains
-            int inner_nbwd = dom == dft::domain::REAL ? inner_nfwd / 2 + 1 : inner_nfwd;
-            int inner_sfwd = strides_fwd.back(); // inner strides of DFT
-            int inner_sbwd = strides_bwd.back();
-            bool valid = true;
-            for (int r = 1; r < rank; ++r) {
-                valid = valid && (inner_nfwd <= inner_sfwd) && (inner_nbwd <= inner_sbwd);
-                inner_nfwd *= n_copy[rank - r - 1];
-                inner_nbwd *= n_copy[rank - r - 1];
-                inner_sfwd *= strides_fwd[rank - r - 1];
-                inner_sbwd *= strides_bwd[rank - r - 1];
-            }
-            return valid;
-        };
-
-        bool valid_forward = check_stride_validity(stride_vecs.fwd_in, stride_vecs.fwd_out);
-        bool valid_backward = stride_api_choice == dft::detail::stride_api::FB_STRIDES
-                                  ? valid_forward
-                                  : check_stride_validity(stride_vecs.bwd_out, stride_vecs.bwd_in);
-
-        if (!valid_forward && !valid_backward) {
-            throw mkl::exception("dft/backends/cufft", __FUNCTION__, "Invalid strides.");
-        }
-
-        if (valid_forward) {
-            cufftHandle fwd_plan;
-            auto res = cufftCreate(&fwd_plan);
-            if (res != CUFFT_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", __FUNCTION__, "cufftCreate failed.");
-            }
-            apply_external_workspace_setting(fwd_plan, config_values.workspace_placement);
-            res = cufftPlanMany(&fwd_plan, // plan
-                                rank, // rank
-                                n_copy.data(), // n
-                                stride_vecs.fwd_in.data(), // inembed
-                                fwd_istride, // istride
-                                fwd_dist, // idist
-                                stride_vecs.fwd_out.data(), // onembed
-                                fwd_ostride, // ostride
-                                bwd_dist, // odist
-                                fwd_type, // type
-                                batch // batch
-            );
-
-            if (res != CUFFT_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", __FUNCTION__,
-                                     "Failed to create forward cuFFT plan.");
-            }
-
-            plans[0] = fwd_plan;
-        }
-
-        if (valid_backward) {
-            cufftHandle bwd_plan;
-            auto res = cufftCreate(&bwd_plan);
-            if (res != CUFFT_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", __FUNCTION__, "cufftCreate failed.");
-            }
-            apply_external_workspace_setting(bwd_plan, config_values.workspace_placement);
-            // flip fwd_distance and bwd_distance because cuFFt uses input distance and output distance.
-            res = cufftPlanMany(&bwd_plan, // plan
-                                rank, // rank
-                                n_copy.data(), // n
-                                stride_vecs.bwd_in.data(), // inembed
-                                bwd_istride, // istride
-                                bwd_dist, // idist
-                                stride_vecs.bwd_out.data(), // onembed
-                                bwd_ostride, // ostride
-                                fwd_dist, // odist
-                                bwd_type, // type
-                                batch // batch
-            );
-            if (res != CUFFT_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", __FUNCTION__,
-                                     "Failed to create backward cuFFT plan.");
-            }
-            plans[1] = bwd_plan;
-        }
-    }
-
-    ~cufft_commit() override {
-        clean_plans();
-    }
-
-    static void apply_external_workspace_setting(cufftHandle handle,
-                                                 config_value workspace_setting) {
-        if (workspace_setting == config_value::WORKSPACE_EXTERNAL) {
-            auto res = cufftSetAutoAllocation(handle, 0);
-            if (res != CUFFT_SUCCESS) {
-                throw mkl::exception("dft/backends/cufft", "commit",
-                                     "cufftSetAutoAllocation(plan, 0) failed.");
-            }
-        }
-    }
-
-    void* get_handle() noexcept override {
-        return plans.data();
-    }
-
-    std::array<std::int64_t, 2> get_offsets_fwd() noexcept {
-        return { offset_fwd_in, offset_fwd_out };
-    }
-
-    std::array<std::int64_t, 2> get_offsets_bwd() noexcept {
-        return { offset_bwd_in, offset_bwd_out };
-    }
-
-    virtual void set_workspace(scalar_type* usm_workspace) override {
-        this->external_workspace_helper_.set_workspace_throw(*this, usm_workspace);
-        if (plans[0]) {
-            cufftSetWorkArea(*plans[0], usm_workspace);
-        }
-        if (plans[1]) {
-            cufftSetWorkArea(*plans[1], usm_workspace);
-        }
-    }
-
-    void set_buffer_workspace(cufftHandle plan, sycl::buffer<scalar_type>& buffer_workspace) {
-        this->get_queue()
-            .submit([&](sycl::handler& cgh) {
-                auto workspace_acc =
-                    buffer_workspace.template get_access<sycl::access::mode::read_write>(cgh);
-                cgh.host_task([=](sycl::interop_handle ih) {
-                    auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
-                    auto result = cufftSetStream(plan, stream);
-                    if (result != CUFFT_SUCCESS) {
-                        throw oneapi::mkl::exception(
-                            "dft/backends/cufft", "set_workspace",
-                            "cufftSetStream returned " + std::to_string(result));
-                    }
-                    auto workspace_native = reinterpret_cast<scalar_type*>(
-                        ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(workspace_acc));
-                    cufftSetWorkArea(plan, workspace_native);
-                });
-            })
-            .wait_and_throw();
-    }
-
-    virtual void set_workspace(sycl::buffer<scalar_type>& buffer_workspace) override {
-        this->external_workspace_helper_.set_workspace_throw(*this, buffer_workspace);
-        if (plans[0]) {
-            set_buffer_workspace(*plans[0], buffer_workspace);
-        }
-        if (plans[1]) {
-            set_buffer_workspace(*plans[1], buffer_workspace);
-        }
-    }
-
-    std::int64_t get_plan_workspace_size_bytes(cufftHandle handle) {
-        std::size_t size = 0;
-        cufftGetSize(handle, &size);
-        std::int64_t padded_size = static_cast<int64_t>(size);
-        return padded_size;
-    }
-
-    virtual std::int64_t get_workspace_external_bytes_impl() override {
-        std::int64_t size0 = plans[0] ? get_plan_workspace_size_bytes(*plans[0]) : 0;
-        std::int64_t size1 = plans[1] ? get_plan_workspace_size_bytes(*plans[1]) : 0;
-        return std::max(size0, size1);
-    };
-
-#define BACKEND cufft
-#include "../backend_compute_signature.cxx"
-#undef BACKEND
-};
-} // namespace detail
-
-template <dft::precision prec, dft::domain dom>
-dft::detail::commit_impl<prec, dom>* create_commit(const dft::detail::descriptor<prec, dom>& desc,
-                                                   sycl::queue& sycl_queue) {
-    return new detail::cufft_commit<prec, dom>(sycl_queue, desc.get_values());
-}
-
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-
-namespace detail {
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_fwd(dft::detail::commit_impl<prec, dom>* commit) {
-    return static_cast<cufft_commit<prec, dom>*>(commit)->get_offsets_fwd();
-}
-
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_bwd(dft::detail::commit_impl<prec, dom>* commit) {
-    return static_cast<cufft_commit<prec, dom>*>(commit)->get_offsets_bwd();
-}
-
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::SINGLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*);
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*);
-
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::SINGLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*);
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*);
-} //namespace detail
-
-} // namespace oneapi::mkl::dft::cufft
diff --git a/src/dft/backends/cufft/descriptor.cpp b/src/dft/backends/cufft/descriptor.cpp
deleted file mode 100644
index d102164c2..000000000
--- a/src/dft/backends/cufft/descriptor.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "../../descriptor.cxx"
-
-#include "oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::commit(backend_selector<backend::cufft> selector) {
-    if (!pimpl_ || pimpl_->get_queue() != selector.get_queue()) {
-        if (pimpl_) {
-            pimpl_->get_queue().wait();
-        }
-        pimpl_.reset(cufft::create_commit(*this, selector.get_queue()));
-    }
-    pimpl_->commit(values_);
-}
-
-template void descriptor<precision::SINGLE, domain::COMPLEX>::commit(
-    backend_selector<backend::cufft>);
-template void descriptor<precision::SINGLE, domain::REAL>::commit(backend_selector<backend::cufft>);
-template void descriptor<precision::DOUBLE, domain::COMPLEX>::commit(
-    backend_selector<backend::cufft>);
-template void descriptor<precision::DOUBLE, domain::REAL>::commit(backend_selector<backend::cufft>);
-
-} //namespace dft
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/dft/backends/cufft/execute_helper.hpp b/src/dft/backends/cufft/execute_helper.hpp
deleted file mode 100644
index 776f0f254..000000000
--- a/src/dft/backends/cufft/execute_helper.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_SRC_CUFFT_EXECUTE_HPP_
-#define _ONEMKL_DFT_SRC_CUFFT_EXECUTE_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-
-#include <cuda.h>
-#include <cufft.h>
-
-namespace oneapi::mkl::dft::cufft::detail {
-
-template <dft::precision prec, dft::domain dom>
-inline dft::detail::commit_impl<prec, dom> *checked_get_commit(
-    dft::detail::descriptor<prec, dom> &desc) {
-    auto commit_handle = dft::detail::get_commit(desc);
-    if (commit_handle == nullptr || commit_handle->get_backend() != backend::cufft) {
-        throw mkl::invalid_argument("dft/backends/cufft", "get_commit",
-                                    "DFT descriptor has not been commited for cuFFT");
-    }
-    return commit_handle;
-}
-
-/// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match
-/// the expected value.
-template <dft::config_param Param, dft::config_value Expected, typename DescT>
-inline auto expect_config(DescT &desc, const char *message) {
-    dft::config_value actual{ 0 };
-    desc.get_value(Param, &actual);
-    if (actual != Expected) {
-        throw mkl::invalid_argument("dft/backends/cufft", "expect_config", message);
-    }
-}
-
-enum class Direction { Forward = CUFFT_FORWARD, Backward = CUFFT_INVERSE };
-
-template <Direction dir, typename forward_data_type>
-void cufft_execute(const std::string &func, CUstream stream, cufftHandle plan, void *input,
-                   void *output) {
-    constexpr bool is_real = std::is_floating_point_v<forward_data_type>;
-    using single_type = std::conditional_t<is_real, float, std::complex<float>>;
-    constexpr bool is_single = std::is_same_v<forward_data_type, single_type>;
-
-    if constexpr (is_real) {
-        if constexpr (dir == Direction::Forward) {
-            if constexpr (is_single) {
-                auto result = cufftExecR2C(plan, reinterpret_cast<cufftReal *>(input),
-                                           reinterpret_cast<cufftComplex *>(output));
-                if (result != CUFFT_SUCCESS) {
-                    throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                                 "cufftExecR2C returned " + std::to_string(result));
-                }
-            }
-            else {
-                auto result = cufftExecD2Z(plan, reinterpret_cast<cufftDoubleReal *>(input),
-                                           reinterpret_cast<cufftDoubleComplex *>(output));
-                if (result != CUFFT_SUCCESS) {
-                    throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                                 "cufftExecD2Z returned " + std::to_string(result));
-                }
-            }
-        }
-        else {
-            if constexpr (is_single) {
-                auto result = cufftExecC2R(plan, reinterpret_cast<cufftComplex *>(input),
-                                           reinterpret_cast<cufftReal *>(output));
-                if (result != CUFFT_SUCCESS) {
-                    throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                                 "cufftExecC2R returned " + std::to_string(result));
-                }
-            }
-            else {
-                auto result = cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex *>(input),
-                                           reinterpret_cast<cufftDoubleReal *>(output));
-                if (result != CUFFT_SUCCESS) {
-                    throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                                 "cufftExecZ2D returned " + std::to_string(result));
-                }
-            }
-        }
-    }
-    else {
-        if constexpr (is_single) {
-            auto result =
-                cufftExecC2C(plan, reinterpret_cast<cufftComplex *>(input),
-                             reinterpret_cast<cufftComplex *>(output), static_cast<int>(dir));
-            if (result != CUFFT_SUCCESS) {
-                throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                             "cufftExecC2C returned " + std::to_string(result));
-            }
-        }
-        else {
-            auto result =
-                cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex *>(input),
-                             reinterpret_cast<cufftDoubleComplex *>(output), static_cast<int>(dir));
-            if (result != CUFFT_SUCCESS) {
-                throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                             "cufftExecZ2Z returned " + std::to_string(result));
-            }
-        }
-    }
-
-    auto result = cuStreamSynchronize(stream);
-    if (result != CUDA_SUCCESS) {
-        throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                     "cuStreamSynchronize returned " + std::to_string(result));
-    }
-}
-
-inline CUstream setup_stream(const std::string &func, sycl::interop_handle ih, cufftHandle plan) {
-    auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
-    auto result = cufftSetStream(plan, stream);
-    if (result != CUFFT_SUCCESS) {
-        throw oneapi::mkl::exception("dft/backends/cufft", func,
-                                     "cufftSetStream returned " + std::to_string(result));
-    }
-    return stream;
-}
-
-} // namespace oneapi::mkl::dft::cufft::detail
-
-#endif
diff --git a/src/dft/backends/cufft/forward.cpp b/src/dft/backends/cufft/forward.cpp
deleted file mode 100644
index fb323c085..000000000
--- a/src/dft/backends/cufft/forward.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <type_traits>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-
-#include "execute_helper.hpp"
-
-#include <cufft.h>
-
-namespace oneapi::mkl::dft::cufft {
-
-namespace detail {
-//forward declaration
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_fwd(dft::detail::commit_impl<prec, dom> *commit);
-
-template <dft::precision prec, dft::domain dom>
-cufftHandle get_fwd_plan(dft::detail::commit_impl<prec, dom> *commit) {
-    return static_cast<std::optional<cufftHandle> *>(commit->get_handle())[0].value();
-}
-} // namespace detail
-
-// BUFFER version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    const std::string func_name = "compute_forward(desc, inout)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        if (offsets[0] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-        offsets[1] *= 2; // offset is supplied in complex but we offset scalar pointer
-    }
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto inout_acc = inout.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            auto inout_native = reinterpret_cast<fwd<descriptor_type> *>(
-                ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(inout_acc));
-            detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
-                func_name, stream, plan, reinterpret_cast<void *>(inout_native + offsets[0]),
-                reinterpret_cast<void *>(inout_native + offsets[1]));
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &, sycl::buffer<scalar<descriptor_type>, 1> &,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &) {
-    throw oneapi::mkl::unimplemented("DFT", "compute_forward(desc, inout_re, inout_im)",
-                                     "cuFFT does not support real-real complex storage.");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer<fwd<descriptor_type>, 1> &in,
-                                   sycl::buffer<bwd<descriptor_type>, 1> &out) {
-    const std::string func_name = "compute_forward(desc, in, out)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        if (offsets[0] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-    }
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto in_acc = in.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_acc = out.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            auto in_native = reinterpret_cast<void *>(
-                reinterpret_cast<fwd<descriptor_type> *>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(in_acc)) +
-                offsets[0]);
-            auto out_native = reinterpret_cast<void *>(
-                reinterpret_cast<bwd<descriptor_type> *>(
-                    ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(out_acc)) +
-                offsets[1]);
-            detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
-                func_name, stream, plan, in_native, out_native);
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &, sycl::buffer<scalar<descriptor_type>, 1> &,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &) {
-    throw oneapi::mkl::unimplemented("DFT", "compute_forward(desc, in_re, in_im, out_re, out_im)",
-                                     "cuFFT does not support real-real complex storage.");
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                          const std::vector<sycl::event> &dependencies) {
-    const std::string func_name = "compute_forward(desc, inout, dependencies)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        if (offsets[0] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-        offsets[1] *= 2; // offset is supplied in complex but we offset scalar pointer
-    }
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
-                func_name, stream, plan, inout + offsets[0], inout + offsets[1]);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &, scalar<descriptor_type> *,
-                                          scalar<descriptor_type> *,
-                                          const std::vector<sycl::event> &) {
-    throw oneapi::mkl::unimplemented("DFT",
-                                     "compute_forward(desc, inout_re, inout_im, dependencies)",
-                                     "cuFFT does not support real-real complex storage.");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *in,
-                                          bwd<descriptor_type> *out,
-                                          const std::vector<sycl::event> &dependencies) {
-    const std::string func_name = "compute_forward(desc, in, out, dependencies)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        if (offsets[0] % 2 != 0) {
-            throw oneapi::mkl::unimplemented(
-                "DFT", func_name,
-                "cuFFT requires offset (first value in strides) to be multiple of 2!");
-        }
-    }
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, plan);
-
-            detail::cufft_execute<detail::Direction::Forward, fwd<descriptor_type>>(
-                func_name, stream, plan, in + offsets[0], out + offsets[1]);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &, scalar<descriptor_type> *,
-                                          scalar<descriptor_type> *, scalar<descriptor_type> *,
-                                          scalar<descriptor_type> *,
-                                          const std::vector<sycl::event> &) {
-    throw oneapi::mkl::unimplemented(
-        "DFT", "compute_forward(desc, in_re, in_im, out_re, out_im, dependencies)",
-        "cuFFT does not support real-real complex storage.");
-}
-
-// Template function instantiations
-#include "dft/backends/backend_forward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::cufft
diff --git a/src/dft/backends/cufft/mkl_dft_cufft_wrappers.cpp b/src/dft/backends/cufft/mkl_dft_cufft_wrappers.cpp
deleted file mode 100644
index 93d3aae11..000000000
--- a/src/dft/backends/cufft/mkl_dft_cufft_wrappers.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/detail/cufft/onemkl_dft_cufft.hpp"
-#include "dft/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-#define BACKEND         cufft
-
-extern "C" dft_function_table_t mkl_dft_table = {
-    WRAPPER_VERSION,
-#include "dft/backends/backend_wrappers.cxx"
-};
-
-#undef WRAPPER_VERSION
-#undef BACKEND
diff --git a/src/dft/backends/descriptor.cpp b/src/dft/backends/descriptor.cpp
deleted file mode 100644
index aa4cded9c..000000000
--- a/src/dft/backends/descriptor.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "oneapi/mkl/dft/detail/dft_loader.hpp"
-
-#include "../descriptor.cxx"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::commit(sycl::queue &queue) {
-    if (!pimpl_ || pimpl_->get_queue() != queue) {
-        if (pimpl_) {
-            pimpl_->get_queue().wait();
-        }
-        pimpl_.reset(detail::create_commit(*this, queue));
-    }
-    pimpl_->commit(values_);
-}
-template void descriptor<precision::SINGLE, domain::COMPLEX>::commit(sycl::queue &);
-template void descriptor<precision::SINGLE, domain::REAL>::commit(sycl::queue &);
-template void descriptor<precision::DOUBLE, domain::COMPLEX>::commit(sycl::queue &);
-template void descriptor<precision::DOUBLE, domain::REAL>::commit(sycl::queue &);
-
-} //namespace dft
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/dft/backends/mklcpu/CMakeLists.txt b/src/dft/backends/mklcpu/CMakeLists.txt
deleted file mode 100644
index 6d0f1276d..000000000
--- a/src/dft/backends/mklcpu/CMakeLists.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_dft_mklcpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-include(WarningsUtils)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  commit.cpp
-  descriptor.cpp
-  forward.cpp
-  backward.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_dft_cpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_dft ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_NAME}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-
-if(TARGET MKL::MKL_SYCL::DFT)
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_SYCL::DFT
-    PRIVATE onemkl_warnings
-  )
-else()
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_DPCPP
-    PRIVATE onemkl_warnings
-  )
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/dft/backends/mklcpu/backward.cpp b/src/dft/backends/mklcpu/backward.cpp
deleted file mode 100644
index fe7186630..000000000
--- a/src/dft/backends/mklcpu/backward.cpp
+++ /dev/null
@@ -1,330 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-
-#include "dft/backends/mklcpu/commit_derived_impl.hpp"
-
-// MKLCPU header
-#include "mkl_dfti.h"
-
-namespace oneapi::mkl::dft::mklcpu {
-namespace detail {
-
-// BUFFER version
-// backward a MKLCPU DFT call to the backend, checking that the commit impl is valid.
-template <dft::precision prec, dft::domain dom>
-inline void check_bwd_commit(dft::descriptor<prec, dom> &desc) {
-    auto commit_handle = dft::detail::get_commit(desc);
-    if (commit_handle == nullptr || commit_handle->get_backend() != backend::mklcpu) {
-        throw mkl::invalid_argument("DFT", "computer_backward",
-                                    "DFT descriptor has not been commited for MKLCPU");
-    }
-
-    auto mklcpu_desc = reinterpret_cast<detail::mklcpu_desc_t *>(commit_handle->get_handle());
-    MKL_LONG commit_status{ DFTI_UNCOMMITTED };
-    DftiGetValue(mklcpu_desc[1], DFTI_COMMIT_STATUS, &commit_status);
-    if (commit_status != DFTI_COMMITTED) {
-        throw mkl::invalid_argument("DFT", "compute_backward",
-                                    "MKLCPU DFT descriptor was not successfully committed.");
-    }
-}
-
-// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match
-// the expected value.
-template <dft::detail::config_param Param, dft::detail::config_value Expected, typename DescT>
-inline auto expect_config(DescT &desc, const char *message) {
-    dft::detail::config_value actual{ 0 };
-    desc.get_value(Param, &actual);
-    if (actual != Expected) {
-        throw mkl::invalid_argument("DFT", "compute_backward", message);
-    }
-}
-// convert the base commit class to derived cpu commit class
-template <dft::precision prec, dft::domain dom>
-auto get_buffer(commit_t<prec, dom> *commit_handle) {
-    commit_derived_t<prec, dom> *derived_commit =
-        static_cast<commit_derived_t<prec, dom> *>(commit_handle);
-    return derived_commit->get_handle_buffer();
-}
-} // namespace detail
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto inout_acc = inout.template get_access<sycl::access::mode::read_write>(cgh);
-        detail::host_task<class host_kernel_back_inplace>(cgh, [=]() {
-            DFT_ERROR status =
-                DftiComputeBackward(desc_acc[detail::DIR::bwd], detail::acc_to_ptr(inout_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &inout_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &inout_im) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto re_acc = inout_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto im_acc = inout_im.template get_access<sycl::access::mode::read_write>(cgh);
-
-        detail::host_task<class host_kernel_split_back_inplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeBackward(
-                desc_acc[detail::DIR::bwd], detail::acc_to_ptr(re_acc), detail::acc_to_ptr(im_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<bwd<descriptor_type>, 1> &in,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &out) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto in_acc = in.template get_access<sycl::access::mode::read>(cgh);
-        auto out_acc = out.template get_access<sycl::access::mode::write>(cgh);
-
-        detail::host_task<class host_kernel_back_outofplace>(cgh, [=]() {
-            auto in_ptr = const_cast<bwd<descriptor_type> *>(detail::acc_to_ptr(in_acc));
-            DFT_ERROR status = DftiComputeBackward(desc_acc[detail::DIR::bwd], in_ptr,
-                                                   detail::acc_to_ptr(out_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &in_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &in_im,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &out_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &out_im) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto inre_acc = in_re.template get_access<sycl::access::mode::read>(cgh);
-        auto inim_acc = in_im.template get_access<sycl::access::mode::read>(cgh);
-        auto outre_acc = out_re.template get_access<sycl::access::mode::write>(cgh);
-        auto outim_acc = out_im.template get_access<sycl::access::mode::write>(cgh);
-
-        detail::host_task<class host_kernel_split_back_outofplace>(cgh, [=]() {
-            auto inre_ptr = const_cast<scalar<descriptor_type> *>(detail::acc_to_ptr(inre_acc));
-            auto inim_ptr = const_cast<scalar<descriptor_type> *>(detail::acc_to_ptr(inim_acc));
-            DFT_ERROR status =
-                DftiComputeBackward(desc_acc[detail::DIR::bwd], inre_ptr, inim_ptr,
-                                    detail::acc_to_ptr(outre_acc), detail::acc_to_ptr(outim_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                           const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_back_inplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeBackward(desc_acc[detail::DIR::bwd], inout);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar<descriptor_type> *inout_re,
-                                           scalar<descriptor_type> *inout_im,
-                                           const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_split_back_inplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeBackward(desc_acc[detail::DIR::bwd], inout_re, inout_im);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd<descriptor_type> *in,
-                                           fwd<descriptor_type> *out,
-                                           const std::vector<sycl::event> &dependencies) {
-    // Check: inplace, complex storage
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_back_outofplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeBackward(desc_acc[detail::DIR::bwd], in, out);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar<descriptor_type> *in_re,
-                                           scalar<descriptor_type> *in_im,
-                                           scalar<descriptor_type> *out_re,
-                                           scalar<descriptor_type> *out_im,
-                                           const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_bwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_split_back_outofplace>(cgh, [=]() {
-            DFT_ERROR status =
-                DftiComputeBackward(desc_acc[detail::DIR::bwd], in_re, in_im, out_re, out_im);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/backends/mklcpu", "compute_backward",
-                    std::string("DftiComputeBackward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-// Template function instantiations
-#include "dft/backends/backend_backward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::mklcpu
diff --git a/src/dft/backends/mklcpu/commit.cpp b/src/dft/backends/mklcpu/commit.cpp
deleted file mode 100644
index 1ec8aef9c..000000000
--- a/src/dft/backends/mklcpu/commit.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-#include "oneapi/mkl/dft/descriptor.hpp"
-
-#include "oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp"
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-
-#include "dft/backends/mklcpu/commit_derived_impl.hpp"
-#include "../stride_helper.hpp"
-#include "mkl_service.h"
-#include "mkl_dfti.h"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace mklcpu {
-namespace detail {
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-commit_derived_impl<prec, dom>::commit_derived_impl(
-    sycl::queue queue, const dft::detail::dft_values<prec, dom>& config_values)
-        : oneapi::mkl::dft::detail::commit_impl<prec, dom>(queue, backend::mklcpu, config_values) {
-    // create the descriptor once for the lifetime of the descriptor class
-    DFT_ERROR status[2] = { DFTI_BAD_DESCRIPTOR, DFTI_BAD_DESCRIPTOR };
-
-    for (auto dir : { DIR::fwd, DIR::bwd }) {
-        const auto rank = static_cast<std::int64_t>(config_values.dimensions.size());
-        if (config_values.dimensions.size() == 1) {
-            status[dir] = DftiCreateDescriptor(&bidirection_handle[dir], mklcpu_prec, mklcpu_dom, 1,
-                                               config_values.dimensions[0]);
-        }
-        else {
-            status[dir] = DftiCreateDescriptor(&bidirection_handle[dir], mklcpu_prec, mklcpu_dom,
-                                               rank, config_values.dimensions.data());
-        }
-    }
-
-    if (status[0] != DFTI_NO_ERROR || status[1] != DFTI_NO_ERROR) {
-        std::string err = std::string("DftiCreateDescriptor failed with status : ") +
-                          DftiErrorMessage(status[0]) + std::string(", ") +
-                          DftiErrorMessage(status[1]);
-        throw oneapi::mkl::exception("dft/backends/mklcpu", "create_descriptor", err);
-    }
-}
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-commit_derived_impl<prec, dom>::~commit_derived_impl() {
-    for (auto dir : { DIR::fwd, DIR::bwd }) {
-        DftiFreeDescriptor(&bidirection_handle[dir]);
-    }
-}
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-void commit_derived_impl<prec, dom>::commit(
-    const dft::detail::dft_values<prec, dom>& config_values) {
-    this->external_workspace_helper_ =
-        oneapi::mkl::dft::detail::external_workspace_helper<prec, dom>(
-            config_values.workspace_placement ==
-            oneapi::mkl::dft::detail::config_value::WORKSPACE_EXTERNAL);
-    set_value(bidirection_handle.data(), config_values);
-
-    this->get_queue()
-        .submit([&](sycl::handler& cgh) {
-            auto bidir_handle_obj =
-                bidirection_buffer.get_access<sycl::access::mode::read_write>(cgh);
-
-            host_task<detail::kernel_name<mklcpu_desc_t>>(cgh, [=]() {
-                DFT_ERROR status[2] = { DFTI_BAD_DESCRIPTOR, DFTI_BAD_DESCRIPTOR };
-
-                for (auto dir : { DIR::fwd, DIR::bwd })
-                    status[dir] = DftiCommitDescriptor(bidir_handle_obj[dir]);
-
-                // this is important for real-batched transforms, as the backward transform would
-                // be inconsistent based on the stride setup, but once recommited before backward
-                // it should work just fine. so we error out only if there is a issue with both.
-                if (status[0] != DFTI_NO_ERROR && status[1] != DFTI_NO_ERROR) {
-                    std::string err = std::string("DftiCommitDescriptor failed with status : ") +
-                                      DftiErrorMessage(status[0]) + std::string(", ") +
-                                      DftiErrorMessage(status[1]);
-                    throw oneapi::mkl::exception("dft/backends/mklcpu", "commit", err);
-                }
-            });
-        })
-        .wait();
-}
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-void* commit_derived_impl<prec, dom>::get_handle() noexcept {
-    return reinterpret_cast<void*>(bidirection_handle.data());
-}
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-template <typename... Args>
-void commit_derived_impl<prec, dom>::set_value_item(mklcpu_desc_t hand, enum DFTI_CONFIG_PARAM name,
-                                                    Args... args) {
-    DFT_ERROR value_err = DftiSetValue(hand, name, args...);
-    if (value_err != DFTI_NO_ERROR) {
-        throw oneapi::mkl::exception("dft/backends/mklcpu", "set_value_item",
-                                     DftiErrorMessage(value_err));
-    }
-}
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-void commit_derived_impl<prec, dom>::set_value(mklcpu_desc_t* descHandle,
-                                               const dft::detail::dft_values<prec, dom>& config) {
-    auto stride_choice = dft::detail::get_stride_api(config);
-    dft::detail::throw_on_invalid_stride_api("MKLCPU commit", stride_choice);
-    for (auto dir : { DIR::fwd, DIR::bwd }) {
-        if (stride_choice == dft::detail::stride_api::IO_STRIDES) {
-            set_value_item(descHandle[dir], DFTI_INPUT_STRIDES, config.input_strides.data());
-            set_value_item(descHandle[dir], DFTI_OUTPUT_STRIDES, config.output_strides.data());
-        }
-        else { // Forward / backward strides
-            if (dir == DIR::fwd) {
-                set_value_item(descHandle[dir], DFTI_INPUT_STRIDES, config.fwd_strides.data());
-                set_value_item(descHandle[dir], DFTI_OUTPUT_STRIDES, config.bwd_strides.data());
-            }
-            else {
-                set_value_item(descHandle[dir], DFTI_INPUT_STRIDES, config.bwd_strides.data());
-                set_value_item(descHandle[dir], DFTI_OUTPUT_STRIDES, config.fwd_strides.data());
-            }
-        }
-        set_value_item(descHandle[dir], DFTI_BACKWARD_SCALE, config.bwd_scale);
-        set_value_item(descHandle[dir], DFTI_FORWARD_SCALE, config.fwd_scale);
-        set_value_item(descHandle[dir], DFTI_NUMBER_OF_TRANSFORMS, config.number_of_transforms);
-        set_value_item(descHandle[dir], DFTI_INPUT_DISTANCE,
-                       (dir == detail::DIR::fwd) ? config.fwd_dist : config.bwd_dist);
-        set_value_item(descHandle[dir], DFTI_OUTPUT_DISTANCE,
-                       (dir == detail::DIR::fwd) ? config.bwd_dist : config.fwd_dist);
-        set_value_item(descHandle[dir], DFTI_COMPLEX_STORAGE,
-                       to_mklcpu<config_param::COMPLEX_STORAGE>(config.complex_storage));
-        set_value_item(descHandle[dir], DFTI_REAL_STORAGE,
-                       to_mklcpu<config_param::REAL_STORAGE>(config.real_storage));
-        set_value_item(descHandle[dir], DFTI_CONJUGATE_EVEN_STORAGE,
-                       to_mklcpu<config_param::CONJUGATE_EVEN_STORAGE>(config.conj_even_storage));
-        set_value_item(descHandle[dir], DFTI_PLACEMENT,
-                       to_mklcpu<config_param::PLACEMENT>(config.placement));
-        set_value_item(descHandle[dir], DFTI_PACKED_FORMAT,
-                       to_mklcpu<config_param::PACKED_FORMAT>(config.packed_format));
-        // Setting the workspace causes an FFT_INVALID_DESCRIPTOR.
-        if (config.workspace != config_value::ALLOW) {
-            throw mkl::invalid_argument("dft/backends/mklcpu", "commit",
-                                        "MKLCPU only supports workspace set to allow");
-        }
-        // Setting the ordering causes an FFT_INVALID_DESCRIPTOR. Check that default is used:
-        if (config.ordering != dft::detail::config_value::ORDERED) {
-            throw mkl::invalid_argument("dft/backends/mklcpu", "commit",
-                                        "MKLCPU only supports ordered ordering.");
-        }
-        // Setting the transpose causes an FFT_INVALID_DESCRIPTOR. Check that default is used:
-        if (config.transpose != false) {
-            throw mkl::invalid_argument("dft/backends/mklcpu", "commit",
-                                        "MKLCPU only supports non-transposed.");
-        }
-    }
-}
-} // namespace detail
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-dft::detail::commit_impl<prec, dom>* create_commit(const dft::detail::descriptor<prec, dom>& desc,
-                                                   sycl::queue& sycl_queue) {
-    return new detail::commit_derived_impl<prec, dom>(sycl_queue, desc.get_values());
-}
-
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-
-} // namespace mklcpu
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/dft/backends/mklcpu/commit_derived_impl.hpp b/src/dft/backends/mklcpu/commit_derived_impl.hpp
deleted file mode 100644
index 3551758a0..000000000
--- a/src/dft/backends/mklcpu/commit_derived_impl.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_COMMIT_DERIVED_IMPL_HPP_
-#define _ONEMKL_DFT_COMMIT_DERIVED_IMPL_HPP_
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-#include "dft/backends/mklcpu/mklcpu_helpers.hpp"
-
-// MKLCPU header
-#include "mkl_dfti.h"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace mklcpu {
-namespace detail {
-
-// this is used for indexing bidirectional_handle
-enum DIR { fwd = 0, bwd = 1 };
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-class commit_derived_impl final : public dft::detail::commit_impl<prec, dom> {
-private:
-    using scalar_type = typename dft::detail::commit_impl<prec, dom>::scalar_type;
-    static constexpr DFTI_CONFIG_VALUE mklcpu_prec = to_mklcpu(prec);
-    static constexpr DFTI_CONFIG_VALUE mklcpu_dom = to_mklcpu(dom);
-    using mklcpu_desc_t = DFTI_DESCRIPTOR_HANDLE;
-
-public:
-    commit_derived_impl(sycl::queue queue, const dft::detail::dft_values<prec, dom>& config_values);
-
-    virtual void commit(const dft::detail::dft_values<prec, dom>& config_values) override;
-
-    virtual void* get_handle() noexcept override;
-
-    virtual ~commit_derived_impl() override;
-
-    sycl::buffer<mklcpu_desc_t, 1> get_handle_buffer() noexcept {
-        return bidirection_buffer;
-    };
-
-#define BACKEND mklcpu
-#include "../backend_compute_signature.cxx"
-#undef BACKEND
-
-private:
-    // bidirectional_handle[0] is the forward handle, bidirectional_handle[1] is the backward handle
-    std::array<mklcpu_desc_t, 2> bidirection_handle{ nullptr, nullptr };
-    sycl::buffer<mklcpu_desc_t, 1> bidirection_buffer{ bidirection_handle.data(),
-                                                       sycl::range<1>{ 2 } };
-
-    template <typename... Args>
-    void set_value_item(mklcpu_desc_t hand, enum DFTI_CONFIG_PARAM name, Args... args);
-
-    void set_value(mklcpu_desc_t* descHandle, const dft::detail::dft_values<prec, dom>& config);
-};
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-using commit_t = dft::detail::commit_impl<prec, dom>;
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-using commit_derived_t = detail::commit_derived_impl<prec, dom>;
-
-} // namespace detail
-} // namespace mklcpu
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_DFT_COMMIT_DERIVED_IMPL_HPP_
diff --git a/src/dft/backends/mklcpu/descriptor.cpp b/src/dft/backends/mklcpu/descriptor.cpp
deleted file mode 100644
index 2bb0e2835..000000000
--- a/src/dft/backends/mklcpu/descriptor.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "../../descriptor.cxx"
-
-#include "oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::commit(backend_selector<backend::mklcpu> selector) {
-    if (!pimpl_ || pimpl_->get_queue() != selector.get_queue()) {
-        if (pimpl_) {
-            pimpl_->get_queue().wait();
-        }
-        pimpl_.reset(mklcpu::create_commit(*this, selector.get_queue()));
-    }
-    pimpl_->commit(values_);
-}
-
-template void descriptor<precision::SINGLE, domain::COMPLEX>::commit(
-    backend_selector<backend::mklcpu>);
-template void descriptor<precision::SINGLE, domain::REAL>::commit(
-    backend_selector<backend::mklcpu>);
-template void descriptor<precision::DOUBLE, domain::COMPLEX>::commit(
-    backend_selector<backend::mklcpu>);
-template void descriptor<precision::DOUBLE, domain::REAL>::commit(
-    backend_selector<backend::mklcpu>);
-
-} //namespace dft
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/dft/backends/mklcpu/forward.cpp b/src/dft/backends/mklcpu/forward.cpp
deleted file mode 100644
index 2e5e2fa88..000000000
--- a/src/dft/backends/mklcpu/forward.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-
-#include "dft/backends/mklcpu/commit_derived_impl.hpp"
-
-// MKLCPU header
-#include "mkl_dfti.h"
-
-namespace oneapi::mkl::dft::mklcpu {
-namespace detail {
-
-// BUFFER version
-// Forward a MKLCPU DFT call to the backend, checking that the commit impl is valid.
-template <dft::precision prec, dft::domain dom>
-inline void check_fwd_commit(dft::descriptor<prec, dom> &desc) {
-    auto commit_handle = dft::detail::get_commit(desc);
-    if (commit_handle == nullptr || commit_handle->get_backend() != backend::mklcpu) {
-        throw mkl::invalid_argument("DFT", "computer_forward",
-                                    "DFT descriptor has not been commited for MKLCPU");
-    }
-
-    auto mklcpu_desc = reinterpret_cast<detail::mklcpu_desc_t *>(commit_handle->get_handle());
-    MKL_LONG commit_status{ DFTI_UNCOMMITTED };
-    DftiGetValue(mklcpu_desc[0], DFTI_COMMIT_STATUS, &commit_status);
-    if (commit_status != DFTI_COMMITTED) {
-        throw mkl::invalid_argument("DFT", "compute_forward",
-                                    "MKLCPU DFT descriptor was not successfully committed.");
-    }
-}
-
-// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match
-// the expected value.
-template <dft::detail::config_param Param, dft::detail::config_value Expected, typename DescT>
-inline auto expect_config(DescT &desc, const char *message) {
-    dft::detail::config_value actual{ 0 };
-    desc.get_value(Param, &actual);
-    if (actual != Expected) {
-        throw mkl::invalid_argument("DFT", "compute_forward", message);
-    }
-}
-
-// convert the base commit class to derived cpu commit class
-template <dft::precision prec, dft::domain dom>
-auto get_buffer(commit_t<prec, dom> *commit_handle) {
-    commit_derived_t<prec, dom> *derived_commit =
-        static_cast<commit_derived_t<prec, dom> *>(commit_handle);
-    return derived_commit->get_handle_buffer();
-}
-} // namespace detail
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto inout_acc = inout.template get_access<sycl::access::mode::read_write>(cgh);
-        detail::host_task<class host_kernel_inplace>(cgh, [=]() {
-            DFT_ERROR status =
-                DftiComputeForward(desc_acc[detail::DIR::fwd], detail::acc_to_ptr(inout_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &inout_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &inout_im) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto re_acc = inout_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto im_acc = inout_im.template get_access<sycl::access::mode::read_write>(cgh);
-
-        detail::host_task<class host_kernel_split_inplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeForward(
-                desc_acc[detail::DIR::fwd], detail::acc_to_ptr(re_acc), detail::acc_to_ptr(im_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer<fwd<descriptor_type>, 1> &in,
-                                   sycl::buffer<bwd<descriptor_type>, 1> &out) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto in_acc = in.template get_access<sycl::access::mode::read>(cgh);
-        auto out_acc = out.template get_access<sycl::access::mode::write>(cgh);
-
-        detail::host_task<class host_kernel_outofplace>(cgh, [=]() {
-            auto in_ptr = const_cast<fwd<descriptor_type> *>(detail::acc_to_ptr(in_acc));
-            DFT_ERROR status =
-                DftiComputeForward(desc_acc[detail::DIR::fwd], in_ptr, detail::acc_to_ptr(out_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &in_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &in_im,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &out_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &out_im) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-        auto inre_acc = in_re.template get_access<sycl::access::mode::read>(cgh);
-        auto inim_acc = in_im.template get_access<sycl::access::mode::read>(cgh);
-        auto outre_acc = out_re.template get_access<sycl::access::mode::write>(cgh);
-        auto outim_acc = out_im.template get_access<sycl::access::mode::write>(cgh);
-
-        detail::host_task<class host_kernel_split_outofplace>(cgh, [=]() {
-            auto inre_ptr = const_cast<scalar<descriptor_type> *>(detail::acc_to_ptr(inre_acc));
-            auto inim_ptr = const_cast<scalar<descriptor_type> *>(detail::acc_to_ptr(inim_acc));
-            DFT_ERROR status =
-                DftiComputeForward(desc_acc[detail::DIR::fwd], inre_ptr, inim_ptr,
-                                   detail::acc_to_ptr(outre_acc), detail::acc_to_ptr(outim_acc));
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                          const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_inplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeForward(desc_acc[detail::DIR::fwd], inout);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar<descriptor_type> *inout_re,
-                                          scalar<descriptor_type> *inout_im,
-                                          const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_split_inplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeForward(desc_acc[detail::DIR::fwd], inout_re, inout_im);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *in,
-                                          bwd<descriptor_type> *out,
-                                          const std::vector<sycl::event> &dependencies) {
-    // Check: inplace
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_outofplace>(cgh, [=]() {
-            DFT_ERROR status = DftiComputeForward(desc_acc[detail::DIR::fwd], in, out);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar<descriptor_type> *in_re,
-                                          scalar<descriptor_type> *in_im,
-                                          scalar<descriptor_type> *out_re,
-                                          scalar<descriptor_type> *out_im,
-                                          const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-
-    auto commit_handle = dft::detail::get_commit(desc);
-    detail::check_fwd_commit(desc);
-    sycl::queue &cpu_queue{ commit_handle->get_queue() };
-
-    auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) };
-
-    return cpu_queue.submit([&](sycl::handler &cgh) {
-        auto desc_acc = mklcpu_desc_buffer.template get_access<sycl::access::mode::read>(cgh);
-
-        cgh.depends_on(dependencies);
-        detail::host_task<class host_usm_kernel_split_outofplace>(cgh, [=]() {
-            DFT_ERROR status =
-                DftiComputeForward(desc_acc[detail::DIR::fwd], in_re, in_im, out_re, out_im);
-            if (status != DFTI_NO_ERROR) {
-                throw oneapi::mkl::exception(
-                    "dft/forward/mklcpu", "compute_forward",
-                    std::string("DftiComputeForward failed : ") + DftiErrorMessage(status));
-            }
-        });
-    });
-}
-
-// Template function instantiations
-#include "dft/backends/backend_forward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::mklcpu
diff --git a/src/dft/backends/mklcpu/mkl_dft_cpu_wrappers.cpp b/src/dft/backends/mklcpu/mkl_dft_cpu_wrappers.cpp
deleted file mode 100644
index 463ab80f4..000000000
--- a/src/dft/backends/mklcpu/mkl_dft_cpu_wrappers.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/detail/mklcpu/onemkl_dft_mklcpu.hpp"
-#include "dft/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-#define BACKEND         mklcpu
-
-extern "C" dft_function_table_t mkl_dft_table = {
-    WRAPPER_VERSION,
-#include "dft/backends/backend_wrappers.cxx"
-};
diff --git a/src/dft/backends/mklcpu/mklcpu_helpers.hpp b/src/dft/backends/mklcpu/mklcpu_helpers.hpp
deleted file mode 100644
index 55a8345c2..000000000
--- a/src/dft/backends/mklcpu/mklcpu_helpers.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_SRC_MKLCPU_HELPERS_HPP_
-#define _ONEMKL_DFT_SRC_MKLCPU_HELPERS_HPP_
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-// MKLCPU header
-#include "mkl_dfti.h"
-
-namespace oneapi::mkl::dft::mklcpu::detail {
-
-template <typename K, typename H, typename F>
-static inline auto host_task_internal(H& cgh, F f, int) -> decltype(cgh.host_task(f)) {
-    return cgh.host_task(f);
-}
-
-template <typename K, typename H, typename F>
-static inline void host_task(H& cgh, F f) {
-    (void)host_task_internal<K>(cgh, f, 0);
-}
-
-template <typename Desc>
-class kernel_name {};
-
-/// Convert domain to equivalent backend native value.
-inline constexpr DFTI_CONFIG_VALUE to_mklcpu(dft::detail::domain dom) {
-    if (dom == dft::detail::domain::REAL) {
-        return DFTI_REAL;
-    }
-    else {
-        return DFTI_COMPLEX;
-    }
-}
-
-/// Convert precision to equivalent backend native value.
-inline constexpr DFTI_CONFIG_VALUE to_mklcpu(dft::detail::precision dom) {
-    if (dom == dft::detail::precision::SINGLE) {
-        return DFTI_SINGLE;
-    }
-    else {
-        return DFTI_DOUBLE;
-    }
-}
-
-/// Convert a config_param to equivalent backend native value.
-inline constexpr DFTI_CONFIG_PARAM to_mklcpu(dft::detail::config_param param) {
-    using iparam = dft::detail::config_param;
-    switch (param) {
-        case iparam::FORWARD_DOMAIN: return DFTI_FORWARD_DOMAIN;
-        case iparam::DIMENSION: return DFTI_DIMENSION;
-        case iparam::LENGTHS: return DFTI_LENGTHS;
-        case iparam::PRECISION: return DFTI_PRECISION;
-        case iparam::FORWARD_SCALE: return DFTI_FORWARD_SCALE;
-        case iparam::NUMBER_OF_TRANSFORMS: return DFTI_NUMBER_OF_TRANSFORMS;
-        case iparam::COMPLEX_STORAGE: return DFTI_COMPLEX_STORAGE;
-        case iparam::REAL_STORAGE: return DFTI_REAL_STORAGE;
-        case iparam::CONJUGATE_EVEN_STORAGE: return DFTI_CONJUGATE_EVEN_STORAGE;
-        case iparam::FWD_DISTANCE: return DFTI_FWD_DISTANCE;
-        case iparam::BWD_DISTANCE: return DFTI_BWD_DISTANCE;
-        case iparam::WORKSPACE: return DFTI_WORKSPACE;
-        case iparam::ORDERING: return DFTI_ORDERING;
-        case iparam::TRANSPOSE: return DFTI_TRANSPOSE;
-        case iparam::PACKED_FORMAT: return DFTI_PACKED_FORMAT;
-        case iparam::COMMIT_STATUS: return DFTI_COMMIT_STATUS;
-        default:
-            throw mkl::invalid_argument("dft", "MKLCPU descriptor set_value()",
-                                        "Invalid config param.");
-            return static_cast<DFTI_CONFIG_PARAM>(0);
-    }
-}
-
-/** Convert a config_value to the backend's native value. Throw on invalid input.
- * @tparam Param The config param the value is for.
- * @param value The config value to convert.
-**/
-template <dft::detail::config_param Param>
-inline constexpr int to_mklcpu(dft::detail::config_value value);
-
-template <>
-inline constexpr int to_mklcpu<dft::detail::config_param::COMPLEX_STORAGE>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::COMPLEX_COMPLEX) {
-        return DFTI_COMPLEX_COMPLEX;
-    }
-    else if (value == dft::detail::config_value::REAL_REAL) {
-        return DFTI_REAL_REAL;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLCPU descriptor set_value()",
-                                    "Invalid config value for complex storage.");
-        return 0;
-    }
-}
-
-template <>
-inline constexpr int to_mklcpu<dft::detail::config_param::REAL_STORAGE>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::REAL_REAL) {
-        return DFTI_REAL_REAL;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLCPU descriptor set_value()",
-                                    "Invalid config value for real storage.");
-        return 0;
-    }
-}
-template <>
-inline constexpr int to_mklcpu<dft::detail::config_param::CONJUGATE_EVEN_STORAGE>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::COMPLEX_COMPLEX) {
-        return DFTI_COMPLEX_COMPLEX;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLCPU descriptor set_value()",
-                                    "Invalid config value for conjugate even storage.");
-        return 0;
-    }
-}
-
-template <>
-inline constexpr int to_mklcpu<dft::detail::config_param::PLACEMENT>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::INPLACE) {
-        return DFTI_INPLACE;
-    }
-    else if (value == dft::detail::config_value::NOT_INPLACE) {
-        return DFTI_NOT_INPLACE;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLCPU descriptor set_value()",
-                                    "Invalid config value for inplace.");
-        return 0;
-    }
-}
-
-template <>
-inline constexpr int to_mklcpu<dft::detail::config_param::PACKED_FORMAT>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::CCE_FORMAT) {
-        return DFTI_CCE_FORMAT;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLCPU descriptor set_value()",
-                                    "Invalid config value for packed format.");
-        return 0;
-    }
-}
-
-using mklcpu_desc_t = DFTI_DESCRIPTOR_HANDLE;
-
-template <typename AccType>
-typename AccType::value_type* acc_to_ptr(AccType acc) {
-    // no need to decorate the pointer with the address space for mklcpu since its just getting passed to the a host function.
-    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
-}
-
-} // namespace oneapi::mkl::dft::mklcpu::detail
-
-#endif // _ONEMKL_DFT_SRC_MKLCPU_HELPERS_HPP_
diff --git a/src/dft/backends/mklgpu/CMakeLists.txt b/src/dft/backends/mklgpu/CMakeLists.txt
deleted file mode 100644
index 7e88a23d9..000000000
--- a/src/dft/backends/mklgpu/CMakeLists.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_dft_mklgpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-include(WarningsUtils)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  descriptor.cpp
-  commit.cpp
-  forward.cpp
-  backward.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_dft_gpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_dft ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_NAME}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::DFT)
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_SYCL::DFT
-    PRIVATE onemkl_warnings
-  )
-else()
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_DPCPP
-    PRIVATE onemkl_warnings
-  )
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/dft/backends/mklgpu/backward.cpp b/src/dft/backends/mklgpu/backward.cpp
deleted file mode 100644
index 6c4896c66..000000000
--- a/src/dft/backends/mklgpu/backward.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-
-#include "mklgpu_helpers.hpp"
-
-// MKLGPU header
-#include "oneapi/mkl/dfti.hpp"
-
-namespace oneapi::mkl::dft::mklgpu {
-namespace detail {
-
-/// Forward a MKLGPU DFT call to the backend, checking that the commit impl is valid.
-/// Assumes backend descriptor values match those of the frontend.
-template <dft::detail::precision prec, dft::detail::domain dom, typename... ArgTs>
-inline auto compute_backward(dft::detail::descriptor<prec, dom> &desc, ArgTs &&... args) {
-    using mklgpu_desc_t = dft::descriptor<to_mklgpu(prec), to_mklgpu(dom)>;
-    using desc_shptr_t = std::shared_ptr<mklgpu_desc_t>;
-    using handle_t = std::pair<desc_shptr_t, desc_shptr_t>;
-    auto commit_handle = dft::detail::get_commit(desc);
-    if (commit_handle == nullptr || commit_handle->get_backend() != backend::mklgpu) {
-        throw mkl::invalid_argument("DFT", "compute_backward",
-                                    "DFT descriptor has not been commited for MKLGPU");
-    }
-    auto handle = reinterpret_cast<handle_t *>(commit_handle->get_handle());
-    auto mklgpu_desc = handle->second; // Second because backward DFT.
-    int commit_status{ DFTI_UNCOMMITTED };
-    mklgpu_desc->get_value(dft::config_param::COMMIT_STATUS, &commit_status);
-    if (commit_status != DFTI_COMMITTED) {
-        throw mkl::invalid_argument("DFT", "compute_backward",
-                                    "MKLGPU DFT descriptor was not successfully committed.");
-    }
-    // The MKLGPU backend's iterface contains fewer function signatures than in this
-    // open-source library. Consequently, it is not required to forward template arguments
-    // to resolve to the correct function.
-    return dft::compute_backward(*mklgpu_desc, std::forward<ArgTs>(args)...);
-}
-
-/// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match
-/// the expected value.
-template <dft::detail::config_param Param, dft::detail::config_value Expected, typename DescT>
-inline auto expect_config(DescT &desc, const char *message) {
-    dft::detail::config_value actual{ 0 };
-    desc.get_value(Param, &actual);
-    if (actual != Expected) {
-        throw mkl::invalid_argument("DFT", "compute_backward", message);
-    }
-}
-} // namespace detail
-
-// BUFFER version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    return detail::compute_backward(desc, inout);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type & /*desc*/,
-                                    sycl::buffer<scalar<descriptor_type>, 1> & /*inout_re*/,
-                                    sycl::buffer<scalar<descriptor_type>, 1> & /*inout_im*/) {
-    throw mkl::unimplemented("DFT", "compute_backward",
-                             "MKLGPU does not support compute_backward(desc, inout_re, inout_im).");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<bwd<descriptor_type>, 1> &in,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &out) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-    return detail::compute_backward(desc, in, out);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<scalar<descriptor_type>, 1> & /*in_re*/,
-                                    sycl::buffer<scalar<descriptor_type>, 1> & /*in_im*/,
-                                    sycl::buffer<scalar<descriptor_type>, 1> & /*out_re*/,
-                                    sycl::buffer<scalar<descriptor_type>, 1> & /*out_im*/) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-    throw oneapi::mkl::unimplemented(
-        "DFT", "compute_backward(desc, in_re, in_im, out_re, out_im)",
-        "MKLGPU does not support out-of-place FFT with real-real complex storage.");
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                           const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    return detail::compute_backward(desc, inout, dependencies);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type & /*desc*/,
-                                           scalar<descriptor_type> * /*inout_re*/,
-                                           scalar<descriptor_type> * /*inout_im*/,
-                                           const std::vector<sycl::event> & /*dependencies*/) {
-    throw mkl::unimplemented(
-        "DFT", "compute_backward",
-        "MKLGPU does not support compute_backward(desc, inout_re, inout_im, dependencies).");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd<descriptor_type> *in,
-                                           fwd<descriptor_type> *out,
-                                           const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-    return detail::compute_backward(desc, in, out, dependencies);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc,
-                                           scalar<descriptor_type> * /*in_re*/,
-                                           scalar<descriptor_type> * /*in_im*/,
-                                           scalar<descriptor_type> * /*out_re*/,
-                                           scalar<descriptor_type> * /*out_im*/,
-                                           const std::vector<sycl::event> & /*dependencies*/) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-    throw oneapi::mkl::unimplemented(
-        "DFT", "compute_backward(desc, in_re, in_im, out_re, out_im, deps)",
-        "MKLGPU does not support out-of-place FFT with real-real complex storage.");
-}
-
-// Template function instantiations
-#include "dft/backends/backend_backward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::mklgpu
diff --git a/src/dft/backends/mklgpu/commit.cpp b/src/dft/backends/mklgpu/commit.cpp
deleted file mode 100644
index d3a3f1cd6..000000000
--- a/src/dft/backends/mklgpu/commit.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/detail/backends.hpp"
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-#include "oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp"
-
-#include "dft/backends/mklgpu/mklgpu_helpers.hpp"
-#include "../stride_helper.hpp"
-
-// MKLGPU header
-#include "oneapi/mkl/dfti.hpp"
-
-// MKL 2024.1 deprecates input/output strides.
-#include "mkl_version.h"
-#if INTEL_MKL_VERSION < 20240001
-#error MKLGPU requires oneMKL 2024.1 or later
-#endif
-
-/**
-Note that in this file, the Intel oneMKL closed-source library's interface mirrors the interface
-of this OneMKL open-source library. Consequently, the types under dft::TYPE are closed-source oneMKL types,
-and types under dft::detail::TYPE are from this library.
-**/
-
-namespace oneapi::mkl::dft::mklgpu {
-namespace detail {
-
-/// Commit impl class specialization for MKLGPU.
-template <dft::detail::precision prec, dft::detail::domain dom>
-class mklgpu_commit final : public dft::detail::commit_impl<prec, dom> {
-private:
-    // Equivalent MKLGPU precision and domain from OneMKL's precision / domain.
-    static constexpr dft::precision mklgpu_prec = to_mklgpu(prec);
-    static constexpr dft::domain mklgpu_dom = to_mklgpu(dom);
-
-    // A pair of descriptors are needed because of the [[deprecated]]IN/OUTPUT_STRIDES vs F/BWD_STRIDES API.
-    // Of the pair [0] is fwd DFT, [1] is backward DFT. If possible, the pointers refer to the same desciptor.
-    // Both pointers must be valid.
-    using mklgpu_descriptor_t = dft::descriptor<mklgpu_prec, mklgpu_dom>;
-    using descriptor_shptr_t = std::shared_ptr<mklgpu_descriptor_t>;
-    using handle_t = std::pair<descriptor_shptr_t, descriptor_shptr_t>;
-
-    using scalar_type = typename dft::detail::commit_impl<prec, dom>::scalar_type;
-
-public:
-    mklgpu_commit(sycl::queue queue, const dft::detail::dft_values<prec, dom>& config_values)
-            : oneapi::mkl::dft::detail::commit_impl<prec, dom>(queue, backend::mklgpu,
-                                                               config_values),
-              handle(std::make_shared<mklgpu_descriptor_t>(config_values.dimensions), nullptr) {
-        handle.second = handle.first; // Make sure the bwd pointer is valid.
-        // MKLGPU does not throw an informative exception for the following:
-        if constexpr (prec == dft::detail::precision::DOUBLE) {
-            if (!queue.get_device().has(sycl::aspect::fp64)) {
-                throw mkl::exception("dft/backends/mklgpu", "commit",
-                                     "Device does not support double precision.");
-            }
-        }
-    }
-
-    virtual void commit(const dft::detail::dft_values<prec, dom>& config_values) override {
-        this->external_workspace_helper_ =
-            oneapi::mkl::dft::detail::external_workspace_helper<prec, dom>(
-                config_values.workspace_placement ==
-                oneapi::mkl::dft::detail::config_value::WORKSPACE_EXTERNAL);
-
-        auto stride_choice = dft::detail::get_stride_api(config_values);
-        throw_on_invalid_stride_api("MKLGPU commit", stride_choice);
-        // A separate descriptor for each direction may not be required.
-        bool one_descriptor = (stride_choice == dft::detail::stride_api::FB_STRIDES) ||
-                              (config_values.input_strides == config_values.output_strides);
-        bool forward_good = true;
-        // Make sure that second is always pointing to something new if this is a recommit.
-        handle.second = handle.first;
-
-        // Generate forward DFT descriptor. If using FWD/BWD_STRIDES API, only
-        // one descriptor is needed.
-        set_value(*handle.first, config_values, true, stride_choice);
-        try {
-            handle.first->commit(this->get_queue());
-        }
-        catch (const std::exception& mkl_exception) {
-            // Catching the real Intel oneMKL exception causes headaches with naming
-            forward_good = false;
-            if (one_descriptor) {
-                throw mkl::exception("dft/backends/mklgpu"
-                                     "commit",
-                                     mkl_exception.what());
-            }
-        }
-
-        // Generate backward DFT descriptor only if required.
-        if (!one_descriptor) {
-            handle.second = std::make_shared<mklgpu_descriptor_t>(config_values.dimensions);
-            set_value(*handle.second, config_values, false, stride_choice);
-            try {
-                handle.second->commit(this->get_queue());
-            }
-            catch (const std::exception& mkl_exception) {
-                // Catching the real Intel oneMKL exception causes headaches with naming.
-                if (!forward_good) {
-                    throw mkl::exception("dft/backends/mklgpu"
-                                         "commit",
-                                         mkl_exception.what());
-                }
-            }
-        }
-    }
-
-    void* get_handle() noexcept override {
-        return &handle;
-    }
-
-    ~mklgpu_commit() override = default;
-
-    virtual void set_workspace(scalar_type* usm_workspace) override {
-        this->external_workspace_helper_.set_workspace_throw(*this, usm_workspace);
-        handle.first->set_workspace(usm_workspace);
-        if (handle.first != handle.second) {
-            handle.second->set_workspace(usm_workspace);
-        }
-    }
-
-    virtual void set_workspace(sycl::buffer<scalar_type>& buffer_workspace) override {
-        this->external_workspace_helper_.set_workspace_throw(*this, buffer_workspace);
-        handle.first->set_workspace(buffer_workspace);
-        if (handle.first != handle.second) {
-            handle.second->set_workspace(buffer_workspace);
-        }
-    }
-
-#define BACKEND mklgpu
-#include "../backend_compute_signature.cxx"
-#undef BACKEND
-
-private:
-    // The native MKLGPU class.
-    handle_t handle;
-
-    void set_value(mklgpu_descriptor_t& desc, const dft::detail::dft_values<prec, dom>& config,
-                   bool assume_fwd_dft, dft::detail::stride_api stride_choice) {
-        using onemkl_param = dft::detail::config_param;
-        using backend_param = dft::config_param;
-
-        // The following are read-only:
-        // Dimension, forward domain, precision, commit status.
-        // Lengths are supplied at descriptor construction time.
-        desc.set_value(backend_param::FORWARD_SCALE, config.fwd_scale);
-        desc.set_value(backend_param::BACKWARD_SCALE, config.bwd_scale);
-        desc.set_value(backend_param::NUMBER_OF_TRANSFORMS, config.number_of_transforms);
-        desc.set_value(backend_param::COMPLEX_STORAGE,
-                       to_mklgpu<onemkl_param::COMPLEX_STORAGE>(config.complex_storage));
-        if (config.real_storage != dft::detail::config_value::REAL_REAL) {
-            throw mkl::invalid_argument("dft/backends/mklgpu", "commit",
-                                        "MKLGPU only supports real-real real storage.");
-        }
-        desc.set_value(backend_param::CONJUGATE_EVEN_STORAGE,
-                       to_mklgpu<onemkl_param::CONJUGATE_EVEN_STORAGE>(config.conj_even_storage));
-        desc.set_value(backend_param::PLACEMENT,
-                       to_mklgpu<onemkl_param::PLACEMENT>(config.placement));
-
-        if (stride_choice == dft::detail::stride_api::FB_STRIDES) {
-            if (config.fwd_strides[0] != 0 || config.fwd_strides[0] != 0) {
-                throw mkl::unimplemented("dft/backends/mklgpu", "commit",
-                                         "MKLGPU does not support nonzero offsets.");
-            }
-            desc.set_value(backend_param::FWD_STRIDES, config.fwd_strides.data());
-            desc.set_value(backend_param::BWD_STRIDES, config.bwd_strides.data());
-        }
-        else {
-            if (config.input_strides[0] != 0 || config.output_strides[0] != 0) {
-                throw mkl::unimplemented("dft/backends/mklgpu", "commit",
-                                         "MKLGPU does not support nonzero offsets.");
-            }
-            if (assume_fwd_dft) {
-                desc.set_value(backend_param::FWD_STRIDES, config.input_strides.data());
-                desc.set_value(backend_param::BWD_STRIDES, config.output_strides.data());
-            }
-            else {
-                desc.set_value(backend_param::FWD_STRIDES, config.output_strides.data());
-                desc.set_value(backend_param::BWD_STRIDES, config.input_strides.data());
-            }
-        }
-        desc.set_value(backend_param::FWD_DISTANCE, config.fwd_dist);
-        desc.set_value(backend_param::BWD_DISTANCE, config.bwd_dist);
-        if (config.workspace_placement == dft::detail::config_value::WORKSPACE_EXTERNAL) {
-            // Setting WORKSPACE_INTERNAL (default) causes FFT_INVALID_DESCRIPTOR.
-            desc.set_value(backend_param::WORKSPACE,
-                           to_mklgpu_config_value<onemkl_param::WORKSPACE_PLACEMENT>(
-                               config.workspace_placement));
-        }
-        // Setting the ordering causes an FFT_INVALID_DESCRIPTOR. Check that default is used:
-        if (config.ordering != dft::detail::config_value::ORDERED) {
-            throw mkl::invalid_argument("dft/backends/mklgpu", "commit",
-                                        "MKLGPU only supports ordered ordering.");
-        }
-        // Setting the transpose causes an FFT_INVALID_DESCRIPTOR. Check that default is used:
-        if (config.transpose != false) {
-            throw mkl::invalid_argument("dft/backends/mklgpu", "commit",
-                                        "MKLGPU only supports non-transposed.");
-        }
-    }
-
-    // This is called by the workspace_helper, and is not part of the user API.
-    virtual std::int64_t get_workspace_external_bytes_impl() override {
-        std::size_t workspaceSizeFwd = 0, workspaceSizeBwd = 0;
-        handle.first->get_value(dft::config_param::WORKSPACE_BYTES, &workspaceSizeFwd);
-        handle.second->get_value(dft::config_param::WORKSPACE_BYTES, &workspaceSizeBwd);
-        return static_cast<std::int64_t>(std::max(workspaceSizeFwd, workspaceSizeFwd));
-    }
-};
-} // namespace detail
-
-template <dft::detail::precision prec, dft::detail::domain dom>
-dft::detail::commit_impl<prec, dom>* create_commit(const dft::detail::descriptor<prec, dom>& desc,
-                                                   sycl::queue& sycl_queue) {
-    return new detail::mklgpu_commit<prec, dom>(sycl_queue, desc.get_values());
-}
-
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-
-} // namespace oneapi::mkl::dft::mklgpu
diff --git a/src/dft/backends/mklgpu/descriptor.cpp b/src/dft/backends/mklgpu/descriptor.cpp
deleted file mode 100644
index d2d2fee7a..000000000
--- a/src/dft/backends/mklgpu/descriptor.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "../../descriptor.cxx"
-
-#include "oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::commit(backend_selector<backend::mklgpu> selector) {
-    if (!pimpl_ || pimpl_->get_queue() != selector.get_queue()) {
-        if (pimpl_) {
-            pimpl_->get_queue().wait();
-        }
-        pimpl_.reset(mklgpu::create_commit(*this, selector.get_queue()));
-    }
-    pimpl_->commit(values_);
-}
-
-template void descriptor<precision::SINGLE, domain::COMPLEX>::commit(
-    backend_selector<backend::mklgpu>);
-template void descriptor<precision::SINGLE, domain::REAL>::commit(
-    backend_selector<backend::mklgpu>);
-template void descriptor<precision::DOUBLE, domain::COMPLEX>::commit(
-    backend_selector<backend::mklgpu>);
-template void descriptor<precision::DOUBLE, domain::REAL>::commit(
-    backend_selector<backend::mklgpu>);
-
-} //namespace dft
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/dft/backends/mklgpu/forward.cpp b/src/dft/backends/mklgpu/forward.cpp
deleted file mode 100644
index 39da42e45..000000000
--- a/src/dft/backends/mklgpu/forward.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <type_traits>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-
-#include "mklgpu_helpers.hpp"
-
-// MKLGPU header
-#include "oneapi/mkl/dfti.hpp"
-
-/**
-Note that in this file, the Intel oneMKL-GPU library's interface mirrors the
-interface of this OneMKL library. Consequently, the types under dft::TYPE are
-closed-source Intel oneMKL types, and types under dft::detail::TYPE are from
-this library.
-**/
-
-namespace oneapi::mkl::dft::mklgpu {
-namespace detail {
-/// Forward a MKLGPU DFT call to the backend, checking that the commit impl is valid.
-/// Assumes backend descriptor values match those of the frontend.
-template <dft::detail::precision prec, dft::detail::domain dom, typename... ArgTs>
-inline auto compute_forward(dft::detail::descriptor<prec, dom> &desc, ArgTs &&... args) {
-    using mklgpu_desc_t = dft::descriptor<to_mklgpu(prec), to_mklgpu(dom)>;
-    using desc_shptr_t = std::shared_ptr<mklgpu_desc_t>;
-    using handle_t = std::pair<desc_shptr_t, desc_shptr_t>;
-    auto commit_handle = dft::detail::get_commit(desc);
-    if (commit_handle == nullptr || commit_handle->get_backend() != backend::mklgpu) {
-        throw mkl::invalid_argument("DFT", "compute_forward",
-                                    "DFT descriptor has not been commited for MKLGPU");
-    }
-    auto handle = reinterpret_cast<handle_t *>(commit_handle->get_handle());
-    auto mklgpu_desc = handle->first; // First because forward DFT.
-    int commit_status{ DFTI_UNCOMMITTED };
-    mklgpu_desc->get_value(dft::config_param::COMMIT_STATUS, &commit_status);
-    if (commit_status != DFTI_COMMITTED) {
-        throw mkl::invalid_argument("DFT", "compute_forward",
-                                    "MKLGPU DFT descriptor was not successfully committed.");
-    }
-    // The MKLGPU backend's iterface contains fewer function signatures than in this
-    // open-source library. Consequently, it is not required to forward template arguments
-    // to resolve to the correct function.
-    return dft::compute_forward(*mklgpu_desc, std::forward<ArgTs>(args)...);
-}
-
-/// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match
-/// the expected value.
-template <dft::detail::config_param Param, dft::detail::config_value Expected, typename DescT>
-inline auto expect_config(DescT &desc, const char *message) {
-    dft::detail::config_value actual{ 0 };
-    desc.get_value(Param, &actual);
-    if (actual != Expected) {
-        throw mkl::invalid_argument("DFT", "compute_forward", message);
-    }
-}
-} // namespace detail
-
-// BUFFER version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    return detail::compute_forward(desc, inout);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type & /*desc*/,
-                                   sycl::buffer<scalar<descriptor_type>, 1> & /*inout_re*/,
-                                   sycl::buffer<scalar<descriptor_type>, 1> & /*inout_im*/) {
-    throw mkl::unimplemented("DFT", "compute_forward",
-                             "MKLGPU does not support compute_forward(desc, inout_re, inout_im).");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer<fwd<descriptor_type>, 1> &in,
-                                   sycl::buffer<bwd<descriptor_type>, 1> &out) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-    return detail::compute_forward(desc, in, out);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<scalar<descriptor_type>, 1> & /*in_re*/,
-                                   sycl::buffer<scalar<descriptor_type>, 1> & /*in_im*/,
-                                   sycl::buffer<scalar<descriptor_type>, 1> & /*out_re*/,
-                                   sycl::buffer<scalar<descriptor_type>, 1> & /*out_im*/) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-    throw oneapi::mkl::unimplemented(
-        "DFT", "compute_forward(desc, in_re, in_im, out_re, out_im)",
-        "MKLGPU does not support out-of-place FFT with real-real complex storage.");
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                          const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT, dft::detail::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    return detail::compute_forward(desc, inout, dependencies);
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type & /*desc*/,
-                                          scalar<descriptor_type> * /*inout_re*/,
-                                          scalar<descriptor_type> * /*inout_im*/,
-                                          const std::vector<sycl::event> & /*dependencies*/) {
-    throw mkl::unimplemented(
-        "DFT", "compute_forward",
-        "MKLGPU does not support compute_forward(desc, inout_re, inout_im, dependencies).");
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *in,
-                                          bwd<descriptor_type> *out,
-                                          const std::vector<sycl::event> &dependencies) {
-    detail::expect_config<dft::detail::config_param::PLACEMENT,
-                          dft::detail::config_value::NOT_INPLACE>(desc,
-                                                                  "Unexpected value for placement");
-    return detail::compute_forward(desc, in, out, dependencies);
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc,
-                                          scalar<descriptor_type> * /*in_re*/,
-                                          scalar<descriptor_type> * /*in_im*/,
-                                          scalar<descriptor_type> * /*out_re*/,
-                                          scalar<descriptor_type> * /*out_im*/,
-                                          const std::vector<sycl::event> & /*dependencies*/) {
-    detail::expect_config<dft::detail::config_param::COMPLEX_STORAGE,
-                          dft::detail::config_value::REAL_REAL>(
-        desc, "Unexpected value for complex storage");
-    throw oneapi::mkl::unimplemented(
-        "DFT", "compute_forward(desc, in_re, in_im, out_re, out_im, dependencies)",
-        "MKLGPU does not support out-of-place FFT with real-real complex storage.");
-}
-
-// Template function instantiations
-#include "dft/backends/backend_forward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::mklgpu
diff --git a/src/dft/backends/mklgpu/mkl_dft_gpu_wrappers.cpp b/src/dft/backends/mklgpu/mkl_dft_gpu_wrappers.cpp
deleted file mode 100644
index 8d2fa111d..000000000
--- a/src/dft/backends/mklgpu/mkl_dft_gpu_wrappers.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/detail/mklgpu/onemkl_dft_mklgpu.hpp"
-#include "dft/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-#define BACKEND         mklgpu
-
-extern "C" dft_function_table_t mkl_dft_table = {
-    WRAPPER_VERSION,
-#include "dft/backends/backend_wrappers.cxx"
-};
diff --git a/src/dft/backends/mklgpu/mklgpu_helpers.hpp b/src/dft/backends/mklgpu/mklgpu_helpers.hpp
deleted file mode 100644
index 6813297ea..000000000
--- a/src/dft/backends/mklgpu/mklgpu_helpers.hpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_SRC_MKLGPU_HELPERS_HPP_
-#define _ONEMKL_DFT_SRC_MKLGPU_HELPERS_HPP_
-
-#include "oneapi/mkl/detail/exceptions.hpp"
-#include "oneapi/mkl/dft/detail/types_impl.hpp"
-
-// MKLGPU header
-#include "oneapi/mkl/dfti.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace mklgpu {
-namespace detail {
-
-/// Convert domain to equivalent backend native value.
-inline constexpr dft::domain to_mklgpu(dft::detail::domain dom) {
-    if (dom == dft::detail::domain::REAL) {
-        return dft::domain::REAL;
-    }
-    else {
-        return dft::domain::COMPLEX;
-    }
-}
-
-/// Convert precision to equivalent backend native value.
-inline constexpr dft::precision to_mklgpu(dft::detail::precision dom) {
-    if (dom == dft::detail::precision::SINGLE) {
-        return dft::precision::SINGLE;
-    }
-    else {
-        return dft::precision::DOUBLE;
-    }
-}
-
-/// Convert a config_param to equivalent backend native value.
-inline constexpr dft::config_param to_mklgpu(dft::detail::config_param param) {
-    using iparam = dft::detail::config_param;
-    using oparam = dft::config_param;
-    switch (param) {
-        case iparam::FORWARD_DOMAIN: return oparam::FORWARD_DOMAIN;
-        case iparam::DIMENSION: return oparam::DIMENSION;
-        case iparam::LENGTHS: return oparam::LENGTHS;
-        case iparam::PRECISION: return oparam::PRECISION;
-        case iparam::FORWARD_SCALE: return oparam::FORWARD_SCALE;
-        case iparam::NUMBER_OF_TRANSFORMS: return oparam::NUMBER_OF_TRANSFORMS;
-        case iparam::COMPLEX_STORAGE: return oparam::COMPLEX_STORAGE;
-        case iparam::REAL_STORAGE: return oparam::REAL_STORAGE;
-        case iparam::CONJUGATE_EVEN_STORAGE: return oparam::CONJUGATE_EVEN_STORAGE;
-        case iparam::FWD_DISTANCE: return oparam::FWD_DISTANCE;
-        case iparam::BWD_DISTANCE: return oparam::BWD_DISTANCE;
-        case iparam::WORKSPACE: return oparam::WORKSPACE;
-        case iparam::ORDERING: return oparam::ORDERING;
-        case iparam::TRANSPOSE: return oparam::TRANSPOSE;
-        case iparam::PACKED_FORMAT: return oparam::PACKED_FORMAT;
-        case iparam::WORKSPACE_PLACEMENT: return oparam::WORKSPACE; // Same as WORKSPACE
-        case iparam::WORKSPACE_EXTERNAL_BYTES: return oparam::WORKSPACE_BYTES;
-        case iparam::COMMIT_STATUS: return oparam::COMMIT_STATUS;
-        default:
-            throw mkl::invalid_argument("dft", "MKLGPU descriptor set_value()",
-                                        "Invalid config param.");
-            return static_cast<oparam>(0);
-    }
-}
-
-/** Convert a config_value to the backend's native value. Throw on invalid input.
- * @tparam Param The config param the value is for.
- * @param value The config value to convert.
-**/
-template <dft::detail::config_param Param>
-inline constexpr int to_mklgpu(dft::detail::config_value value);
-
-template <>
-inline constexpr int to_mklgpu<dft::detail::config_param::COMPLEX_STORAGE>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::COMPLEX_COMPLEX) {
-        return DFTI_COMPLEX_COMPLEX;
-    }
-    else {
-        throw mkl::unimplemented("dft", "MKLGPU descriptor set_value()",
-                                 "MKLGPU only supports complex-complex for complex storage.");
-        return 0;
-    }
-}
-
-template <>
-inline constexpr int to_mklgpu<dft::detail::config_param::CONJUGATE_EVEN_STORAGE>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::COMPLEX_COMPLEX) {
-        return DFTI_COMPLEX_COMPLEX;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLGPU descriptor set_value()",
-                                    "Invalid config value for conjugate even storage.");
-        return 0;
-    }
-}
-
-template <>
-inline constexpr int to_mklgpu<dft::detail::config_param::PLACEMENT>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::INPLACE) {
-        return DFTI_INPLACE;
-    }
-    else if (value == dft::detail::config_value::NOT_INPLACE) {
-        return DFTI_NOT_INPLACE;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLGPU descriptor set_value()",
-                                    "Invalid config value for inplace.");
-        return 0;
-    }
-}
-
-template <>
-inline constexpr int to_mklgpu<dft::detail::config_param::PACKED_FORMAT>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::CCE_FORMAT) {
-        return DFTI_CCE_FORMAT;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLGPU descriptor set_value()",
-                                    "Invalid config value for packed format.");
-        return 0;
-    }
-}
-
-/** Convert a config_value to the backend's native value. Throw on invalid input.
- * @tparam Param The config param the value is for.
- * @param value The config value to convert.
-**/
-template <dft::detail::config_param Param>
-inline constexpr dft::config_value to_mklgpu_config_value(dft::detail::config_value value);
-
-template <>
-inline constexpr dft::config_value
-to_mklgpu_config_value<dft::detail::config_param::WORKSPACE_PLACEMENT>(
-    dft::detail::config_value value) {
-    if (value == dft::detail::config_value::WORKSPACE_AUTOMATIC) {
-        // NB: dft::config_value != dft::detail::config_value
-        return dft::config_value::WORKSPACE_INTERNAL;
-    }
-    else if (value == dft::detail::config_value::WORKSPACE_EXTERNAL) {
-        return dft::config_value::WORKSPACE_EXTERNAL;
-    }
-    else {
-        throw mkl::invalid_argument("dft", "MKLGPU descriptor set_value()",
-                                    "Invalid config value for workspace placement.");
-        return dft::config_value::WORKSPACE_INTERNAL;
-    }
-}
-} // namespace detail
-} // namespace mklgpu
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _ONEMKL_DFT_SRC_MKLGPU_HELPERS_HPP_
diff --git a/src/dft/backends/portfft/CMakeLists.txt b/src/dft/backends/portfft/CMakeLists.txt
deleted file mode 100644
index 50e4d30d1..000000000
--- a/src/dft/backends/portfft/CMakeLists.txt
+++ /dev/null
@@ -1,134 +0,0 @@
-#===============================================================================
-# Copyright Codeplay Software Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-
-check_cxx_compiler_flag("-fsycl" IS_DPCPP)
-
-set(FOUND_TARGETS 0)
-
-if (NOT (CMAKE_CXX_FLAGS STREQUAL ""))
-  string(FIND ${CMAKE_CXX_FLAGS} "fsycl-targets" TARGETS_IDX)
-  if (TARGETS_IDX GREATER_EQUAL 0)
-    set(FOUND_TARGETS 1)
-    message(STATUS "fsycl-targets flag found, not setting targets")
-  endif()
-endif()
-
-if (IS_DPCPP AND UNIX AND NOT FOUND_TARGETS)
-  message(WARNING "fsycl-targets flag not found, enabling all backends")
-  set(TARGETS_COMPILE_OPTIONS -fsycl-unnamed-lambda)
-  set(TARGETS_LINK_OPTIONS -fsycl-unnamed-lambda)
-
-  # spir64 must be last in the list due to a bug in dpcpp 2024.0.0
-  set(TARGETS_TRIPLES "spir64")
-  if(dpcpp_supports_nvptx64)
-    set(TARGETS_TRIPLES nvptx64-nvidia-cuda,${TARGETS_TRIPLES})
-  endif()
-
-  if (NOT (HIP_TARGETS STREQUAL ""))
-    set(TARGETS_TRIPLES amdgcn-amd-amdhsa,${TARGETS_TRIPLES})
-    list(APPEND TARGETS_COMPILE_OPTIONS -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${HIP_TARGETS})
-    list(APPEND TARGETS_LINK_OPTIONS -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=${HIP_TARGETS})
-  else()
-    message(WARNING "Can't enable hip backend, HIP_TARGETS has not been set.")
-  endif()
-  
-  message(STATUS "portFFT target triple set to ${TARGETS_TRIPLES}")
-
-  list(APPEND TARGETS_COMPILE_OPTIONS -fsycl-targets=${TARGETS_TRIPLES})
-  list(APPEND TARGETS_LINK_OPTIONS -fsycl-targets=${TARGETS_TRIPLES})
-
-  target_compile_options(ONEMKL::SYCL::SYCL INTERFACE ${TARGETS_COMPILE_OPTIONS})
-  target_link_options(ONEMKL::SYCL::SYCL INTERFACE ${TARGETS_LINK_OPTIONS})
-endif()
-
-set(LIB_NAME onemkl_dft_portfft)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  descriptor.cpp
-  commit.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_dft_portfft_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_dft ${LIB_NAME})
-
-find_package(portfft QUIET)
-if (NOT portfft_FOUND)
-	message(STATUS "portFFT - not found locally, downloading")
-
-	include(FetchContent)
-	set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/deps")
-	FetchContent_Declare(
-		portfft
-		GIT_REPOSITORY https://github.com/codeplaysoftware/portFFT.git
-		GIT_TAG        e4251e8ef89a8ac4d851a4cc08a0577a28f953e0
-	)
-	FetchContent_MakeAvailable(portfft)
-	message(STATUS "portFFT - downloaded")
-	target_link_libraries(${LIB_OBJ} PRIVATE portfft)
-else()
-	message(STATUS "portFFT - found")
-	target_link_libraries(${LIB_OBJ} PRIVATE portfft::portfft)
-endif()
-
-target_link_libraries(${LIB_OBJ} PRIVATE onemkl_warnings)
-
-target_include_directories(${LIB_OBJ}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_NAME}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL)
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/dft/backends/portfft/commit.cpp b/src/dft/backends/portfft/commit.cpp
deleted file mode 100644
index a2c80e91a..000000000
--- a/src/dft/backends/portfft/commit.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <array>
-#include <optional>
-
-#include <portfft/portfft.hpp>
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-#include "oneapi/mkl/dft/detail/portfft/onemkl_dft_portfft.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-
-#include "../stride_helper.hpp"
-
-#include "portfft_helper.hpp"
-
-// alias to avoid ambiguity
-namespace pfft = portfft;
-
-namespace oneapi::mkl::dft::portfft {
-namespace detail {
-
-template <dft::precision prec, dft::domain dom>
-class portfft_commit final : public dft::detail::commit_impl<prec, dom> {
-private:
-    using scalar_type = typename dft::detail::commit_impl<prec, dom>::scalar_type;
-    using fwd_type = typename dft::detail::commit_impl<prec, dom>::fwd_type;
-    using bwd_type = typename dft::detail::commit_impl<prec, dom>::bwd_type;
-    using descriptor_type = typename dft::detail::descriptor<prec, dom>;
-
-    static constexpr pfft::domain domain =
-        dom == dft::domain::REAL ? pfft::domain::REAL : pfft::domain::COMPLEX;
-    // since only complex-to-complex transforms are supported, we expect both directions to be valid or neither.
-    std::array<storage_type<descriptor_type>, 2> committed_descriptors = { std::nullopt,
-                                                                           std::nullopt };
-
-public:
-    portfft_commit(sycl::queue& queue, const dft::detail::dft_values<prec, dom>& config_values)
-            : oneapi::mkl::dft::detail::commit_impl<prec, dom>(queue, backend::portfft,
-                                                               config_values) {
-        if constexpr (prec == dft::detail::precision::DOUBLE) {
-            if (!queue.get_device().has(sycl::aspect::fp64)) {
-                throw mkl::exception("DFT", "commit", "Device does not support double precision.");
-            }
-        }
-    }
-
-    void commit(const dft::detail::dft_values<prec, dom>& config_values) override {
-        // not available in portFFT:
-        this->external_workspace_helper_ =
-            oneapi::mkl::dft::detail::external_workspace_helper<prec, dom>(
-                config_values.workspace_placement ==
-                oneapi::mkl::dft::detail::config_value::WORKSPACE_EXTERNAL);
-        if (config_values.workspace != config_value::ALLOW) {
-            throw mkl::unimplemented("dft/backends/portfft", __FUNCTION__,
-                                     "portFFT only supports ALLOW for the WORKSPACE parameter");
-        }
-        if (config_values.ordering != config_value::ORDERED) {
-            throw mkl::unimplemented("dft/backends/portfft", __FUNCTION__,
-                                     "portFFT only supports ORDERED for the ORDERING parameter");
-        }
-        if (config_values.transpose) {
-            throw mkl::unimplemented("dft/backends/portfft", __FUNCTION__,
-                                     "portFFT does not supported transposed output");
-        }
-
-        auto stride_api_choice = dft::detail::get_stride_api(config_values);
-        dft::detail::throw_on_invalid_stride_api("portFFT commit", stride_api_choice);
-        dft::detail::stride_vectors<std::int64_t> stride_vecs(config_values, stride_api_choice);
-
-        // forward descriptor
-        pfft::descriptor<scalar_type, domain> fwd_desc(
-            { config_values.dimensions.cbegin(), config_values.dimensions.cend() });
-        fwd_desc.forward_scale = config_values.fwd_scale;
-        fwd_desc.backward_scale = config_values.bwd_scale;
-        fwd_desc.number_of_transforms =
-            static_cast<std::size_t>(config_values.number_of_transforms);
-        fwd_desc.complex_storage = config_values.complex_storage == config_value::COMPLEX_COMPLEX
-                                       ? pfft::complex_storage::INTERLEAVED_COMPLEX
-                                       : pfft::complex_storage::SPLIT_COMPLEX;
-        fwd_desc.placement = config_values.placement == config_value::INPLACE
-                                 ? pfft::placement::IN_PLACE
-                                 : pfft::placement::OUT_OF_PLACE;
-        fwd_desc.forward_offset = static_cast<std::size_t>(stride_vecs.offset_fwd_in);
-        fwd_desc.backward_offset = static_cast<std::size_t>(stride_vecs.offset_fwd_out);
-        fwd_desc.forward_strides = { stride_vecs.fwd_in.cbegin() + 1, stride_vecs.fwd_in.cend() };
-        fwd_desc.backward_strides = { stride_vecs.fwd_out.cbegin() + 1,
-                                      stride_vecs.fwd_out.cend() };
-        fwd_desc.forward_distance = static_cast<std::size_t>(config_values.fwd_dist);
-        fwd_desc.backward_distance = static_cast<std::size_t>(config_values.bwd_dist);
-
-        // backward descriptor
-        pfft::descriptor<scalar_type, domain> bwd_desc(
-            { config_values.dimensions.cbegin(), config_values.dimensions.cend() });
-        bwd_desc.forward_scale = config_values.fwd_scale;
-        bwd_desc.backward_scale = config_values.bwd_scale;
-        bwd_desc.number_of_transforms =
-            static_cast<std::size_t>(config_values.number_of_transforms);
-        bwd_desc.complex_storage = config_values.complex_storage == config_value::COMPLEX_COMPLEX
-                                       ? pfft::complex_storage::INTERLEAVED_COMPLEX
-                                       : pfft::complex_storage::SPLIT_COMPLEX;
-        bwd_desc.placement = config_values.placement == config_value::INPLACE
-                                 ? pfft::placement::IN_PLACE
-                                 : pfft::placement::OUT_OF_PLACE;
-        bwd_desc.forward_offset = static_cast<std::size_t>(stride_vecs.offset_bwd_out);
-        bwd_desc.backward_offset = static_cast<std::size_t>(stride_vecs.offset_bwd_in);
-        bwd_desc.forward_strides = { stride_vecs.bwd_out.cbegin() + 1, stride_vecs.bwd_out.cend() };
-        bwd_desc.backward_strides = { stride_vecs.bwd_in.cbegin() + 1, stride_vecs.bwd_in.cend() };
-        bwd_desc.forward_distance = static_cast<std::size_t>(config_values.fwd_dist);
-        bwd_desc.backward_distance = static_cast<std::size_t>(config_values.bwd_dist);
-
-        try {
-            auto q = this->get_queue();
-            committed_descriptors[0] = fwd_desc.commit(q);
-            committed_descriptors[1] = bwd_desc.commit(q);
-        }
-        catch (const pfft::unsupported_configuration& e) {
-            throw oneapi::mkl::unimplemented("dft/backends/portfft", __FUNCTION__, e.what());
-        }
-    }
-
-    ~portfft_commit() override = default;
-
-    void* get_handle() noexcept override {
-        return committed_descriptors.data();
-    }
-
-    // All the compute functions are implementated here so they are in the same translation unit as the commit function.
-    // If the use of the kernel bundle is in a seperate translation unit from the one it was translated in, the runtime can fail to find it.
-
-    // forward inplace COMPLEX_COMPLEX
-    void forward_ip_cc(descriptor_type& desc, sycl::buffer<fwd_type, 1>& inout) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<fwd_type, 1>>(
-            "compute_forward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            detail::get_descriptors(desc)[0]->compute_forward(inout);
-        }
-    }
-    sycl::event forward_ip_cc(descriptor_type& desc, fwd_type* inout,
-                              const std::vector<sycl::event>& dependencies) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<fwd_type*>("compute_forward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            return detail::get_descriptors(desc)[0]->compute_forward(inout, dependencies);
-        }
-        else {
-            return {};
-        }
-    }
-
-    // forward inplace REAL_REAL
-    void forward_ip_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>&,
-                       sycl::buffer<scalar_type, 1>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-            "compute_forward");
-        throw oneapi::mkl::unimplemented("DFT", "compute_forward(desc, inout_re, inout_im)",
-                                         "portFFT does not support real-real complex storage.");
-    }
-    sycl::event forward_ip_rr(descriptor_type& desc, scalar_type*, scalar_type*,
-                              const std::vector<sycl::event>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>("compute_forward");
-        throw oneapi::mkl::unimplemented("DFT",
-                                         "compute_forward(desc, inout_re, inout_im, dependencies)",
-                                         "portFFT does not support real-real complex storage.");
-    }
-
-    // forward out-of-place COMPLEX_COMPLEX
-    void forward_op_cc(descriptor_type& desc, sycl::buffer<fwd_type, 1>& in,
-                       sycl::buffer<bwd_type, 1>& out) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<fwd_type, 1>>(
-            "compute_forward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            detail::get_descriptors(desc)[0]->compute_forward(in, out);
-        }
-    }
-    sycl::event forward_op_cc(descriptor_type& desc, fwd_type* in, bwd_type* out,
-                              const std::vector<sycl::event>& dependencies) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<fwd_type*>("compute_forward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            return detail::get_descriptors(desc)[0]->compute_forward(in, out, dependencies);
-        }
-        else {
-            return {};
-        }
-    }
-
-    // forward out-of-place REAL_REAL
-    void forward_op_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>&,
-                       sycl::buffer<scalar_type, 1>&, sycl::buffer<scalar_type, 1>&,
-                       sycl::buffer<scalar_type, 1>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-            "compute_forward");
-        throw oneapi::mkl::unimplemented("DFT",
-                                         "compute_forward(desc, in_re, in_im, out_re, out_im)",
-                                         "portFFT does not support real-real complex storage.");
-    }
-    sycl::event forward_op_rr(descriptor_type& desc, scalar_type*, scalar_type*, scalar_type*,
-                              scalar_type*, const std::vector<sycl::event>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>("compute_forward");
-        throw oneapi::mkl::unimplemented(
-            "DFT", "compute_forward(desc, in_re, in_im, out_re, out_im, dependencies)",
-            "portFFT does not support real-real complex storage.");
-    }
-
-    // backward inplace COMPLEX_COMPLEX
-    void backward_ip_cc(descriptor_type& desc, sycl::buffer<fwd_type, 1>& inout) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<fwd_type, 1>>(
-            "compute_backward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            detail::get_descriptors(desc)[1]->compute_backward(inout);
-        }
-    }
-    sycl::event backward_ip_cc(descriptor_type& desc, fwd_type* inout,
-                               const std::vector<sycl::event>& dependencies) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<fwd_type*>("compute_backward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            return detail::get_descriptors(desc)[1]->compute_backward(inout, dependencies);
-        }
-        else {
-            return {};
-        }
-    }
-
-    // backward inplace REAL_REAL
-    void backward_ip_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>&,
-                        sycl::buffer<scalar_type, 1>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-            "compute_backward");
-        throw oneapi::mkl::unimplemented("DFT", "compute_backward(desc, inout_re, inout_im)",
-                                         "portFFT does not support real-real complex storage.");
-    }
-    sycl::event backward_ip_rr(descriptor_type& desc, scalar_type*, scalar_type*,
-                               const std::vector<sycl::event>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>(
-            "compute_backward");
-        throw oneapi::mkl::unimplemented("DFT",
-                                         "compute_backward(desc, inout_re, inout_im, dependencies)",
-                                         "portFFT does not support real-real complex storage.");
-    }
-
-    // backward out-of-place COMPLEX_COMPLEX
-    void backward_op_cc(descriptor_type& desc, sycl::buffer<bwd_type, 1>& in,
-                        sycl::buffer<fwd_type, 1>& out) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<bwd_type, 1>>(
-            "compute_backward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            detail::get_descriptors(desc)[1]->compute_backward(in, out);
-        }
-    }
-    sycl::event backward_op_cc(descriptor_type& desc, bwd_type* in, fwd_type* out,
-                               const std::vector<sycl::event>& dependencies) override {
-        constexpr auto pfft_domain = detail::to_pfft_domain<descriptor_type>::type::value;
-        dft::detail::get_commit(desc)->template compute_call_throw<bwd_type*>("compute_backward");
-
-        if constexpr (pfft_domain == pfft::domain::COMPLEX) {
-            return detail::get_descriptors(desc)[1]->compute_backward(in, out, dependencies);
-        }
-        else {
-            return {};
-        }
-    }
-
-    // backward out-of-place REAL_REAL
-    void backward_op_rr(descriptor_type& desc, sycl::buffer<scalar_type, 1>&,
-                        sycl::buffer<scalar_type, 1>&, sycl::buffer<scalar_type, 1>&,
-                        sycl::buffer<scalar_type, 1>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<sycl::buffer<scalar_type, 1>>(
-            "compute_backward");
-        throw oneapi::mkl::unimplemented("DFT",
-                                         "compute_backward(desc, in_re, in_im, out_re, out_im)",
-                                         "portFFT does not support real-real complex storage.");
-    }
-    sycl::event backward_op_rr(descriptor_type& desc, scalar_type*, scalar_type*, scalar_type*,
-                               scalar_type*, const std::vector<sycl::event>&) override {
-        dft::detail::get_commit(desc)->template compute_call_throw<scalar_type*>(
-            "compute_backward");
-        throw oneapi::mkl::unimplemented(
-            "DFT", "compute_backward(desc, in_re, in_im, out_re, out_im, deps)",
-            "portFFT does not support real-real complex storage.");
-    }
-};
-} // namespace detail
-
-template <dft::precision prec, dft::domain dom>
-dft::detail::commit_impl<prec, dom>* create_commit(const dft::detail::descriptor<prec, dom>& desc,
-                                                   sycl::queue& sycl_queue) {
-    return new detail::portfft_commit<prec, dom>(sycl_queue, desc.get_values());
-}
-
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-
-} // namespace oneapi::mkl::dft::portfft
diff --git a/src/dft/backends/portfft/descriptor.cpp b/src/dft/backends/portfft/descriptor.cpp
deleted file mode 100644
index d72d23bb5..000000000
--- a/src/dft/backends/portfft/descriptor.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "../../descriptor.cxx"
-
-#include "oneapi/mkl/dft/detail/portfft/onemkl_dft_portfft.hpp"
-
-namespace oneapi::mkl::dft {
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::commit(backend_selector<backend::portfft> selector) {
-    if (!pimpl_ || pimpl_->get_queue() != selector.get_queue()) {
-        if (pimpl_) {
-            pimpl_->get_queue().wait();
-        }
-        pimpl_.reset(portfft::create_commit(*this, selector.get_queue()));
-    }
-    pimpl_->commit(values_);
-}
-
-template void descriptor<precision::SINGLE, domain::COMPLEX>::commit(
-    backend_selector<backend::portfft>);
-template void descriptor<precision::SINGLE, domain::REAL>::commit(
-    backend_selector<backend::portfft>);
-template void descriptor<precision::DOUBLE, domain::COMPLEX>::commit(
-    backend_selector<backend::portfft>);
-template void descriptor<precision::DOUBLE, domain::REAL>::commit(
-    backend_selector<backend::portfft>);
-
-} // namespace oneapi::mkl::dft
diff --git a/src/dft/backends/portfft/mkl_dft_portfft_wrappers.cpp b/src/dft/backends/portfft/mkl_dft_portfft_wrappers.cpp
deleted file mode 100644
index 28996b0a1..000000000
--- a/src/dft/backends/portfft/mkl_dft_portfft_wrappers.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/detail/portfft/onemkl_dft_portfft.hpp"
-#include "dft/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-#define BACKEND         portfft
-
-extern "C" dft_function_table_t mkl_dft_table = {
-    WRAPPER_VERSION,
-#include "dft/backends/backend_wrappers.cxx"
-};
-
-#undef WRAPPER_VERSION
-#undef BACKEND
diff --git a/src/dft/backends/portfft/portfft_helper.hpp b/src/dft/backends/portfft/portfft_helper.hpp
deleted file mode 100644
index 373865f49..000000000
--- a/src/dft/backends/portfft/portfft_helper.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_SRC_PORTFFT_HELPERS_HPP_
-#define _ONEMKL_DFT_SRC_PORTFFT_HELPERS_HPP_
-
-#include <type_traits>
-
-#include <portfft/portfft.hpp>
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-
-namespace pfft = portfft;
-
-namespace oneapi::mkl::dft::portfft::detail {
-template <dft::precision prec, dft::domain dom>
-inline dft::detail::commit_impl<prec, dom> *checked_get_commit(
-    dft::detail::descriptor<prec, dom> &desc) {
-    auto commit_handle = dft::detail::get_commit(desc);
-    if (commit_handle == nullptr || commit_handle->get_backend() != backend::portfft) {
-        throw mkl::invalid_argument("dft/backends/portfft", "get_commit",
-                                    "DFT descriptor has not been commited for portFFT");
-    }
-    return commit_handle;
-}
-
-template <typename descriptor_type>
-using to_pfft_domain =
-    std::conditional<std::is_floating_point_v<fwd<descriptor_type>>,
-                     std::integral_constant<pfft::domain, pfft::domain::REAL>,
-                     std::integral_constant<pfft::domain, pfft::domain::COMPLEX>>;
-
-template <typename descriptor_type>
-using storage_type =
-    std::optional<pfft::committed_descriptor<scalar<descriptor_type>,
-                                             detail::to_pfft_domain<descriptor_type>::type::value>>;
-
-template <typename descriptor_type>
-auto get_descriptors(descriptor_type &desc) {
-    auto commit = detail::checked_get_commit(desc);
-    return reinterpret_cast<storage_type<descriptor_type> *>(commit->get_handle());
-}
-} // namespace oneapi::mkl::dft::portfft::detail
-
-#endif
diff --git a/src/dft/backends/rocfft/CMakeLists.txt b/src/dft/backends/rocfft/CMakeLists.txt
deleted file mode 100644
index 1380c8f0a..000000000
--- a/src/dft/backends/rocfft/CMakeLists.txt
+++ /dev/null
@@ -1,95 +0,0 @@
-#===============================================================================
-# Copyright Codeplay Software Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_dft_rocfft)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  descriptor.cpp
-  commit.cpp
-  forward.cpp
-  backward.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_dft_rocfft_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_dft ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_NAME}
-  PUBLIC ${ONEMKL_INTERFACE_INCLUDE_DIRS}
-)
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-find_package(HIP REQUIRED)
-# Require the minimum rocFFT version matching with ROCm 5.4.3.
-find_package(rocfft 1.0.21 REQUIRED)
-
-target_link_libraries(${LIB_OBJ} PRIVATE hip::host roc::rocfft)
-
-# Allow to compile for different ROCm versions. See the README for the supported
-# ROCm versions.
-# Starting ROCm >=6.0 the include files are one directory level deeper.
-find_path(
-  rocfft_EXTRA_INCLUDE_DIR
-  rocfft.h
-  PATHS ${rocfft_INCLUDE_DIR}
-  PATH_SUFFIXES rocfft
-  NO_DEFAULT_PATH
-  REQUIRED
-)
-target_include_directories(${LIB_OBJ} PRIVATE ${rocfft_EXTRA_INCLUDE_DIR})
-
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL)
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-# Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/dft/backends/rocfft/backward.cpp b/src/dft/backends/rocfft/backward.cpp
deleted file mode 100644
index 5ff0e2a1f..000000000
--- a/src/dft/backends/rocfft/backward.cpp
+++ /dev/null
@@ -1,357 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp"
-#include "oneapi/mkl/dft/descriptor.hpp"
-
-#include "execute_helper.hpp"
-#include "rocfft_handle.hpp"
-
-#include <rocfft.h>
-#include <hip/hip_runtime_api.h>
-
-namespace oneapi::mkl::dft::rocfft {
-namespace detail {
-//forward declaration
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_bwd(dft::detail::commit_impl<prec, dom> *commit);
-
-template <dft::precision prec, dft::domain dom>
-rocfft_plan get_bwd_plan(dft::detail::commit_impl<prec, dom> *commit) {
-    return static_cast<rocfft_handle *>(commit->get_handle())[1].plan.value();
-}
-
-template <dft::precision prec, dft::domain dom>
-rocfft_execution_info get_bwd_info(dft::detail::commit_impl<prec, dom> *commit) {
-    return static_cast<rocfft_handle *>(commit->get_handle())[1].info.value();
-}
-} // namespace detail
-// BUFFER version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    const std::string func_name = "compute_backward(desc, inout)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        offsets[0] *= 2; // offset is supplied in complex but we offset scalar pointer
-    }
-    if (offsets[0] != offsets[1]) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto inout_acc = inout.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            auto inout_native = reinterpret_cast<void *>(
-                reinterpret_cast<fwd<descriptor_type> *>(detail::native_mem(ih, inout_acc)) +
-                offsets[0]);
-            detail::execute_checked(func_name, plan, &inout_native, nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &inout_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &inout_im) {
-    const std::string func_name = "compute_backward(desc, inout_re, inout_im)";
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if (offsets[0] != offsets[1]) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto inout_re_acc = inout_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto inout_im_acc = inout_im.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            std::array<void *, 2> inout_native{
-                reinterpret_cast<void *>(reinterpret_cast<scalar<descriptor_type> *>(
-                                             detail::native_mem(ih, inout_re_acc)) +
-                                         offsets[0]),
-                reinterpret_cast<void *>(reinterpret_cast<scalar<descriptor_type> *>(
-                                             detail::native_mem(ih, inout_im_acc)) +
-                                         offsets[0])
-            };
-            detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<bwd<descriptor_type>, 1> &in,
-                                    sycl::buffer<fwd<descriptor_type>, 1> &out) {
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto in_acc = in.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_acc = out.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name = "compute_backward(desc, in, out)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            auto in_native = reinterpret_cast<void *>(
-                reinterpret_cast<bwd<descriptor_type> *>(detail::native_mem(ih, in_acc)) +
-                offsets[0]);
-            auto out_native = reinterpret_cast<void *>(
-                reinterpret_cast<fwd<descriptor_type> *>(detail::native_mem(ih, out_acc)) +
-                offsets[1]);
-            detail::execute_checked(func_name, plan, &in_native, &out_native, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_backward(descriptor_type &desc,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &in_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &in_im,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &out_re,
-                                    sycl::buffer<scalar<descriptor_type>, 1> &out_im) {
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto in_re_acc = in_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto in_im_acc = in_im.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_re_acc = out_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_im_acc = out_im.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name = "compute_backward(desc, in_re, in_im, out_re, out_im)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            std::array<void *, 2> in_native{
-                reinterpret_cast<void *>(
-                    reinterpret_cast<scalar<descriptor_type> *>(detail::native_mem(ih, in_re_acc)) +
-                    offsets[0]),
-                reinterpret_cast<void *>(
-                    reinterpret_cast<scalar<descriptor_type> *>(detail::native_mem(ih, in_im_acc)) +
-                    offsets[0])
-            };
-            std::array<void *, 2> out_native{
-                reinterpret_cast<void *>(reinterpret_cast<scalar<descriptor_type> *>(
-                                             detail::native_mem(ih, out_re_acc)) +
-                                         offsets[1]),
-                reinterpret_cast<void *>(reinterpret_cast<scalar<descriptor_type> *>(
-                                             detail::native_mem(ih, out_im_acc)) +
-                                         offsets[1])
-            };
-            detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                           const std::vector<sycl::event> &deps) {
-    const std::string func_name = "compute_backward(desc, inout, deps)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        offsets[0] *= 2; // offset is supplied in complex but we offset scalar pointer
-    }
-    if (offsets[0] != offsets[1]) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-    inout += offsets[0];
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            void *inout_ptr = inout;
-            detail::execute_checked(func_name, plan, &inout_ptr, nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar<descriptor_type> *inout_re,
-                                           scalar<descriptor_type> *inout_im,
-                                           const std::vector<sycl::event> &deps) {
-    const std::string func_name = "compute_backward(desc, inout_re, inout_im, deps)";
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    if (offsets[0] != offsets[1]) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            std::array<void *, 2> inout_native{ inout_re + offsets[0], inout_im + offsets[0] };
-            detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd<descriptor_type> *in,
-                                           fwd<descriptor_type> *out,
-                                           const std::vector<sycl::event> &deps) {
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    in += offsets[0];
-    out += offsets[1];
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name = "compute_backward(desc, in, out, deps)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            void *in_ptr = in;
-            void *out_ptr = out;
-            detail::execute_checked(func_name, plan, &in_ptr, &out_ptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar<descriptor_type> *in_re,
-                                           scalar<descriptor_type> *in_im,
-                                           scalar<descriptor_type> *out_re,
-                                           scalar<descriptor_type> *out_im,
-                                           const std::vector<sycl::event> &deps) {
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_bwd_plan(commit);
-    auto info = detail::get_bwd_info(commit);
-    auto offsets = detail::get_offsets_bwd(commit);
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name =
-                "compute_backward(desc, in_re, in_im, out_re, out_im, deps)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            std::array<void *, 2> in_native{ in_re + offsets[0], in_im + offsets[0] };
-            std::array<void *, 2> out_native{ out_re + offsets[1], out_im + offsets[1] };
-            detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-// Template function instantiations
-#include "dft/backends/backend_backward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::rocfft
diff --git a/src/dft/backends/rocfft/commit.cpp b/src/dft/backends/rocfft/commit.cpp
deleted file mode 100644
index db5a7f965..000000000
--- a/src/dft/backends/rocfft/commit.cpp
+++ /dev/null
@@ -1,640 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include <array>
-#include <algorithm>
-#include <optional>
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-#include "oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-
-#include "../stride_helper.hpp"
-
-#include "rocfft_handle.hpp"
-
-#include <rocfft.h>
-#include <hip/hip_runtime_api.h>
-
-namespace oneapi::mkl::dft::rocfft {
-namespace detail {
-
-// rocfft has global setup and cleanup functions which use some global state internally.
-// Each can be called multiple times in an application, but due to the global nature, they always need to alternate.
-// I don't believe its possible to avoid the user calling rocfft_cleanup in their own code,
-// breaking our code, but we can try avoid it for them.
-// rocfft_cleanup internally uses some singletons, so it is very difficult to decide if this is safe due to
-// the static initialisation order fiasco.
-class rocfft_singleton {
-    rocfft_singleton() {
-        const auto result = rocfft_setup();
-        if (result != rocfft_status_success) {
-            throw mkl::exception(
-                "DFT", "rocfft",
-                "Failed to setup rocfft. returned status " + std::to_string(result));
-        }
-    }
-
-    ~rocfft_singleton() {
-        (void)rocfft_cleanup();
-    }
-
-    // no copies or moves allowed
-    rocfft_singleton(const rocfft_singleton& other) = delete;
-    rocfft_singleton(rocfft_singleton&& other) noexcept = delete;
-    rocfft_singleton& operator=(const rocfft_singleton& other) = delete;
-    rocfft_singleton& operator=(rocfft_singleton&& other) noexcept = delete;
-
-public:
-    static void init() {
-        static rocfft_singleton instance;
-        (void)instance;
-    }
-};
-
-/// Commit impl class specialization for rocFFT.
-template <dft::precision prec, dft::domain dom>
-class rocfft_commit final : public dft::detail::commit_impl<prec, dom> {
-private:
-    using scalar_type = typename dft::detail::commit_impl<prec, dom>::scalar_type;
-    // For real to complex transforms, the "transform_type" arg also encodes the direction (e.g. rocfft_transform_type_*_forward vs rocfft_transform_type_*_backward)
-    // in the plan so we must have one for each direction.
-    // We also need this because oneMKL uses a directionless "FWD_DISTANCE" and "BWD_DISTANCE" while rocFFT uses a directional "in_distance" and "out_distance".
-    // The same is also true for "FORWARD_SCALE" and "BACKWARD_SCALE".
-    // handles[0] is forward, handles[1] is backward
-    std::array<rocfft_handle, 2> handles{};
-    std::int64_t offset_fwd_in, offset_fwd_out, offset_bwd_in, offset_bwd_out;
-
-public:
-    rocfft_commit(sycl::queue& queue, const dft::detail::dft_values<prec, dom>& config_values)
-            : oneapi::mkl::dft::detail::commit_impl<prec, dom>(queue, backend::rocfft,
-                                                               config_values) {
-        if constexpr (prec == dft::detail::precision::DOUBLE) {
-            if (!queue.get_device().has(sycl::aspect::fp64)) {
-                throw mkl::exception("DFT", "commit", "Device does not support double precision.");
-            }
-        }
-        // initialise the rocFFT global state
-        rocfft_singleton::init();
-    }
-
-    void clean_plans() {
-        if (handles[0].plan) {
-            if (rocfft_plan_destroy(handles[0].plan.value()) != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to destroy forward plan.");
-            }
-            handles[0].plan = std::nullopt;
-        }
-        if (handles[1].plan) {
-            if (rocfft_plan_destroy(handles[1].plan.value()) != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to destroy backward plan.");
-            }
-            handles[1].plan = std::nullopt;
-        }
-
-        if (handles[0].info) {
-            if (rocfft_execution_info_destroy(handles[0].info.value()) != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to destroy forward execution info .");
-            }
-            handles[0].info = std::nullopt;
-        }
-        if (handles[1].info) {
-            if (rocfft_execution_info_destroy(handles[1].info.value()) != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to destroy backward execution info .");
-            }
-            handles[1].info = std::nullopt;
-        }
-        free_internal_workspace_if_rqd(handles[0], "clear_plans");
-        free_internal_workspace_if_rqd(handles[1], "clear_plans");
-    }
-
-    void commit(const dft::detail::dft_values<prec, dom>& config_values) override {
-        // this could be a recommit
-        this->external_workspace_helper_ =
-            oneapi::mkl::dft::detail::external_workspace_helper<prec, dom>(
-                config_values.workspace_placement ==
-                oneapi::mkl::dft::detail::config_value::WORKSPACE_EXTERNAL);
-        clean_plans();
-
-        const rocfft_result_placement placement =
-            (config_values.placement == dft::config_value::INPLACE) ? rocfft_placement_inplace
-                                                                    : rocfft_placement_notinplace;
-
-        constexpr rocfft_transform_type fwd_type = [] {
-            if constexpr (dom == dft::domain::COMPLEX) {
-                return rocfft_transform_type_complex_forward;
-            }
-            else {
-                return rocfft_transform_type_real_forward;
-            }
-        }();
-        constexpr rocfft_transform_type bwd_type = [] {
-            if constexpr (dom == dft::domain::COMPLEX) {
-                return rocfft_transform_type_complex_inverse;
-            }
-            else {
-                return rocfft_transform_type_real_inverse;
-            }
-        }();
-
-        constexpr rocfft_precision precision = [] {
-            if constexpr (prec == dft::precision::SINGLE) {
-                return rocfft_precision_single;
-            }
-            else {
-                return rocfft_precision_double;
-            }
-        }();
-
-        const std::size_t dimensions = config_values.dimensions.size();
-
-        constexpr std::size_t max_supported_dims = 3;
-        std::array<std::size_t, max_supported_dims> lengths;
-        // rocfft does dimensions in the reverse order to oneMKL
-        std::copy(config_values.dimensions.crbegin(), config_values.dimensions.crend(),
-                  lengths.data());
-
-        const std::size_t number_of_transforms =
-            static_cast<std::size_t>(config_values.number_of_transforms);
-
-        const std::size_t fwd_distance = static_cast<std::size_t>(config_values.fwd_dist);
-        const std::size_t bwd_distance = static_cast<std::size_t>(config_values.bwd_dist);
-
-        const rocfft_array_type fwd_array_ty = [&config_values]() {
-            if constexpr (dom == dft::domain::COMPLEX) {
-                if (config_values.complex_storage == dft::config_value::COMPLEX_COMPLEX) {
-                    return rocfft_array_type_complex_interleaved;
-                }
-                else {
-                    return rocfft_array_type_complex_planar;
-                }
-            }
-            else {
-                return rocfft_array_type_real;
-            }
-        }();
-        const rocfft_array_type bwd_array_ty = [&config_values]() {
-            if constexpr (dom == dft::domain::COMPLEX) {
-                if (config_values.complex_storage == dft::config_value::COMPLEX_COMPLEX) {
-                    return rocfft_array_type_complex_interleaved;
-                }
-                else {
-                    return rocfft_array_type_complex_planar;
-                }
-            }
-            else {
-                if (config_values.conj_even_storage != dft::config_value::COMPLEX_COMPLEX) {
-                    throw mkl::exception(
-                        "dft/backends/rocfft", __FUNCTION__,
-                        "only COMPLEX_COMPLEX conjugate_even_storage is supported");
-                }
-                return rocfft_array_type_hermitian_interleaved;
-            }
-        }();
-
-        auto stride_api_choice = dft::detail::get_stride_api(config_values);
-        dft::detail::throw_on_invalid_stride_api("ROCFFT commit", stride_api_choice);
-        dft::detail::stride_vectors<size_t> stride_vecs(config_values, stride_api_choice);
-
-        // while rocfft interface accepts offsets, it does not actually handle them
-        offset_fwd_in = stride_vecs.offset_fwd_in;
-        offset_fwd_out = stride_vecs.offset_fwd_out;
-        offset_bwd_in = stride_vecs.offset_bwd_in;
-        offset_bwd_out = stride_vecs.offset_bwd_out;
-
-        auto func = __FUNCTION__;
-        auto check_strides = [&](const auto& strides) {
-            for (int i = 1; i <= dimensions; i++) {
-                for (int j = 1; j <= dimensions; j++) {
-                    std::int64_t cplx_dim = config_values.dimensions[j - 1];
-                    std::int64_t real_dim = (dom == dft::domain::REAL && j == dimensions)
-                                                ? (cplx_dim / 2 + 1)
-                                                : cplx_dim;
-                    if (strides[i] > strides[j] && strides[i] % cplx_dim != 0 &&
-                        strides[i] % real_dim != 0) {
-                        // rocfft does not throw, it just produces wrong results
-                        throw oneapi::mkl::unimplemented(
-                            "DFT", func,
-                            "rocfft requires a stride to be divisible by all dimensions associated with smaller strides!");
-                    }
-                }
-            }
-        };
-        // bwd_in/out alias fwd_in/out, so no need to check everything.
-        check_strides(stride_vecs.vec_a);
-        check_strides(stride_vecs.vec_b);
-
-        // Reformat slides to conform to rocFFT API.
-        std::reverse(stride_vecs.vec_a.begin(), stride_vecs.vec_a.end());
-        stride_vecs.vec_a.pop_back(); // Offset is not included.
-        std::reverse(stride_vecs.vec_b.begin(), stride_vecs.vec_b.end());
-        stride_vecs.vec_b.pop_back(); // Offset is not included.
-
-        rocfft_plan_description plan_desc;
-        if (rocfft_plan_description_create(&plan_desc) != rocfft_status_success) {
-            throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                 "Failed to create plan description.");
-        }
-
-        // plan_description can be destroyed afted plan_create
-        auto description_destroy = [](rocfft_plan_description p) {
-            if (rocfft_plan_description_destroy(p) != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to destroy plan description.");
-            }
-        };
-        std::unique_ptr<rocfft_plan_description_t, decltype(description_destroy)>
-            description_destroyer(plan_desc, description_destroy);
-
-        std::array<std::size_t, 3> stride_a_indices{ 0, 1, 2 };
-        std::sort(&stride_a_indices[0], &stride_a_indices[dimensions],
-                  [&](std::size_t a, std::size_t b) {
-                      return stride_vecs.vec_a[a] < stride_vecs.vec_a[b];
-                  });
-        std::array<std::size_t, 3> stride_b_indices{ 0, 1, 2 };
-        std::sort(&stride_b_indices[0], &stride_b_indices[dimensions],
-                  [&](std::size_t a, std::size_t b) {
-                      return stride_vecs.vec_b[a] < stride_vecs.vec_b[b];
-                  });
-        std::array<std::size_t, max_supported_dims> lengths_cplx = lengths;
-        if (dom == dft::domain::REAL) {
-            lengths_cplx[0] = lengths_cplx[0] / 2 + 1;
-        }
-        // When creating real-complex descriptions, the strides will always be wrong for one of the directions.
-        // This is because the least significant dimension is symmetric.
-        // If the strides are invalid (too small to fit) then just don't bother creating the plan.
-        const bool vec_a_valid_as_reals =
-            dimensions == 1 ||
-            (lengths_cplx[stride_a_indices[0]] <= stride_vecs.vec_a[stride_a_indices[1]] &&
-             (dimensions == 2 ||
-              lengths_cplx[stride_a_indices[0]] * lengths_cplx[stride_a_indices[1]] <=
-                  stride_vecs.vec_a[stride_a_indices[2]]));
-        const bool vec_b_valid_as_reals =
-            dimensions == 1 ||
-            (lengths_cplx[stride_b_indices[0]] <= stride_vecs.vec_b[stride_b_indices[1]] &&
-             (dimensions == 2 ||
-              lengths_cplx[stride_b_indices[0]] * lengths_cplx[stride_b_indices[1]] <=
-                  stride_vecs.vec_b[stride_b_indices[2]]));
-        // Test if the stride vector being used as the fwd domain for each direction has valid strides for that use.
-        bool valid_forward =
-            stride_vecs.fwd_in == stride_vecs.vec_a && vec_a_valid_as_reals || vec_b_valid_as_reals;
-        bool valid_backward = stride_vecs.bwd_out == stride_vecs.vec_a && vec_a_valid_as_reals ||
-                              vec_b_valid_as_reals;
-
-        if (!valid_forward && !valid_backward) {
-            throw mkl::exception("dft/backends/cufft", __FUNCTION__, "Invalid strides.");
-        }
-
-        if (valid_forward) {
-            auto res =
-                rocfft_plan_description_set_data_layout(plan_desc, fwd_array_ty, bwd_array_ty,
-                                                        nullptr, // in offsets
-                                                        nullptr, // out offsets
-                                                        dimensions,
-                                                        stride_vecs.fwd_in.data(), //in strides
-                                                        fwd_distance, // in distance
-                                                        dimensions,
-                                                        stride_vecs.fwd_out.data(), // out strides
-                                                        bwd_distance // out distance
-                );
-            if (res != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to set forward data layout.");
-            }
-
-            if (rocfft_plan_description_set_scale_factor(plan_desc, config_values.fwd_scale) !=
-                rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to set forward scale factor.");
-            }
-
-            rocfft_plan fwd_plan;
-            res = rocfft_plan_create(&fwd_plan, placement, fwd_type, precision, dimensions,
-                                     lengths.data(), number_of_transforms, plan_desc);
-
-            if (res != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to create forward plan.");
-            }
-
-            handles[0].plan = fwd_plan;
-
-            rocfft_execution_info fwd_info;
-            if (rocfft_execution_info_create(&fwd_info) != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to create forward execution info.");
-            }
-            handles[0].info = fwd_info;
-
-            if (config_values.workspace_placement == config_value::WORKSPACE_AUTOMATIC) {
-                std::int64_t work_buf_size = get_rocfft_workspace_bytes(handles[0], "commit");
-                if (work_buf_size != 0) {
-                    void* work_buf;
-                    if (hipMalloc(&work_buf, work_buf_size) != hipSuccess) {
-                        throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                             "Failed to get allocate forward work buffer.");
-                    }
-                    set_workspace_impl(handles[0], reinterpret_cast<scalar_type*>(work_buf),
-                                       work_buf_size, "commit");
-                    handles[0].buffer = work_buf;
-                }
-            }
-        }
-
-        if (valid_backward) {
-            auto res =
-                rocfft_plan_description_set_data_layout(plan_desc, bwd_array_ty, fwd_array_ty,
-                                                        nullptr, // in offsets
-                                                        nullptr, // out offsets
-                                                        dimensions,
-                                                        stride_vecs.bwd_in.data(), //in strides
-                                                        bwd_distance, // in distance
-                                                        dimensions,
-                                                        stride_vecs.bwd_out.data(), // out strides
-                                                        fwd_distance // out distance
-                );
-            if (res != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to set backward data layout.");
-            }
-
-            if (rocfft_plan_description_set_scale_factor(plan_desc, config_values.bwd_scale) !=
-                rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to set backward scale factor.");
-            }
-
-            rocfft_plan bwd_plan;
-            res = rocfft_plan_create(&bwd_plan, placement, bwd_type, precision, dimensions,
-                                     lengths.data(), number_of_transforms, plan_desc);
-            if (res != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to create backward rocFFT plan.");
-            }
-            handles[1].plan = bwd_plan;
-
-            rocfft_execution_info bwd_info;
-            if (rocfft_execution_info_create(&bwd_info) != rocfft_status_success) {
-                throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                     "Failed to create backward execution info.");
-            }
-            handles[1].info = bwd_info;
-
-            if (config_values.workspace_placement == config_value::WORKSPACE_AUTOMATIC) {
-                std::int64_t work_buf_size = get_rocfft_workspace_bytes(handles[1], "commit");
-                if (work_buf_size != 0) {
-                    void* work_buf;
-                    if (hipMalloc(&work_buf, work_buf_size) != hipSuccess) {
-                        throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                             "Failed to get allocate backward work buffer.");
-                    }
-                    set_workspace_impl(handles[1], reinterpret_cast<scalar_type*>(work_buf),
-                                       work_buf_size, "commit");
-                    handles[1].buffer = work_buf;
-                }
-            }
-        }
-    }
-
-    // Destructor: all teardown is delegated to clean_plans().
-    // NOTE(review): presumably clean_plans() releases the rocFFT plans, execution infos and any
-    // internally allocated work buffers created during commit - confirm against its definition.
-    ~rocfft_commit() override {
-        clean_plans();
-    }
-
-    // Rule of three: because the destructor is user-provided, copy construction and copy
-    // assignment are deleted. Copying could lead to memory safety issues.
-    rocfft_commit(const rocfft_commit& other) = delete;
-    rocfft_commit& operator=(const rocfft_commit& other) = delete;
-
-    /// Expose the handle array as an untyped pointer to its first element.
-    void* get_handle() noexcept override {
-        void* type_erased_handles = handles.data();
-        return type_erased_handles;
-    }
-
-    /// Return the offsets used by forward transforms as { input offset, output offset }.
-    std::array<std::int64_t, 2> get_offsets_fwd() noexcept {
-        std::array<std::int64_t, 2> fwd_offsets{ offset_fwd_in, offset_fwd_out };
-        return fwd_offsets;
-    }
-
-    /// Return the offsets used by backward transforms as { input offset, output offset }.
-    std::array<std::int64_t, 2> get_offsets_bwd() noexcept {
-        std::array<std::int64_t, 2> bwd_offsets{ offset_bwd_in, offset_bwd_out };
-        return bwd_offsets;
-    }
-
-    /** Get the required workspace size for a rocFFT plan. Implementation shared by the internal and
-     * external workspace mechanisms.
-     *
-     * @param handle rocfft_handle. Must hold a rocfft_plan; an exception is thrown otherwise.
-     * @param function The name of the function to give when generating exceptions
-     * @return Required space in bytes
-     * @throws mkl::exception if the handle has no plan or the rocFFT size query fails
-    **/
-    std::int64_t get_rocfft_workspace_bytes(rocfft_handle& handle, const char* function) {
-        const bool has_plan = static_cast<bool>(handle.plan);
-        if (!has_plan) {
-            throw mkl::exception("dft/backends/rocfft", function, "Missing internal rocfft plan");
-        }
-
-        std::size_t required_bytes = 0;
-        const auto query_status = rocfft_plan_get_work_buffer_size(*handle.plan, &required_bytes);
-        if (query_status != rocfft_status_success) {
-            throw mkl::exception("dft/backends/rocfft", function,
-                                 "Failed to get rocfft work buffer size.");
-        }
-        return static_cast<std::int64_t>(required_bytes);
-    }
-
-    /** Set the rocFFT workspace. Implementation to be shared by internal workspace allocation and external workspace
-     * mechanisms. Does not set handle.buffer.
-     *
-     * A workspace_bytes of zero is accepted; in that case the execution info is left untouched.
-     *
-     * @param handle rocfft_handle. Expected to have valid rocfft_plan and rocfft_info, but no associated buffer.
-     * @param workspace Pointer to allocation to use as workspace
-     * @param workspace_bytes The size (in bytes) of the given workspace
-     * @param function The name of the function to give when generating exceptions
-     * @throws mkl::exception if the handle has no rocfft_info, if an internal buffer is already set,
-     *         if a null workspace is given with a non-zero size, or if rocFFT rejects the work buffer
-    **/
-    void set_workspace_impl(const rocfft_handle& handle, scalar_type* workspace,
-                            std::int64_t workspace_bytes, const char* function) {
-        if (!handle.info) {
-            throw mkl::exception(
-                "dft/backends/rocfft", function,
-                "Could not set rocFFT workspace - handle has no associated rocfft_info.");
-        }
-        // An internal buffer must be released by the caller before a new workspace is attached.
-        if (handle.buffer) {
-            throw mkl::exception(
-                "dft/backends/rocfft", function,
-                "Could not set rocFFT workspace - an internal buffer is already set.");
-        }
-        if (workspace_bytes && workspace == nullptr) {
-            // Message fixed: the original text ("Trying to nullptr workspace.") was missing its verb.
-            throw mkl::exception("dft/backends/rocfft", function,
-                                 "Trying to set nullptr workspace.");
-        }
-        auto info = *handle.info;
-        // Only call into rocFFT when there is actually a workspace to register.
-        if (workspace_bytes &&
-            rocfft_execution_info_set_work_buffer(info, static_cast<void*>(workspace),
-                                                  static_cast<std::size_t>(workspace_bytes)) !=
-                rocfft_status_success) {
-            throw mkl::exception("dft/backends/rocfft", function, "Failed to set work buffer.");
-        }
-    }
-
-    /** Free the work buffer recorded in handle.buffer, if any, and clear the field.
-     *
-     * @param handle rocfft_handle whose buffer is released with hipFree
-     * @param function The name of the function to give when generating exceptions
-     * @throws mkl::exception if hipFree fails; handle.buffer is left set in that case
-    **/
-    void free_internal_workspace_if_rqd(rocfft_handle& handle, const char* function) {
-        if (!handle.buffer) {
-            return; // No buffer recorded for this handle; nothing to release.
-        }
-        const auto free_status = hipFree(*handle.buffer);
-        if (free_status != hipSuccess) {
-            throw mkl::exception("dft/backends/rocfft", function,
-                                 "Failed to free internal buffer.");
-        }
-        handle.buffer = std::nullopt;
-    }
-
-    /** Use a user-provided USM allocation as the rocFFT work buffer.
-     *
-     * The pointer is first handed to external_workspace_helper_.set_workspace_throw(). Then, for
-     * each direction (index 0, then index 1) that has a plan, any buffer recorded in the handle is
-     * freed and the USM pointer is registered with the size from get_workspace_external_bytes().
-     *
-     * @param usm_workspace Pointer to the USM allocation to use as workspace
-    **/
-    virtual void set_workspace(scalar_type* usm_workspace) override {
-        const std::int64_t required_bytes{ this->get_workspace_external_bytes() };
-        this->external_workspace_helper_.set_workspace_throw(*this, usm_workspace);
-        for (std::size_t direction = 0; direction < 2; ++direction) {
-            auto& handle = handles[direction];
-            if (!handle.plan) {
-                continue; // No plan for this direction.
-            }
-            free_internal_workspace_if_rqd(handle, "set_workspace");
-            set_workspace_impl(handle, usm_workspace, required_bytes, "set_workspace");
-        }
-    }
-
-    /** Register the native memory behind a SYCL buffer as the rocFFT work buffer of one handle.
-     *
-     * Does nothing for an empty buffer. Otherwise submits a host task that obtains the HIP native
-     * pointer of the buffer and forwards it to set_workspace_impl(), then waits on the queue so that
-     * any exception raised by the task is rethrown here.
-     *
-     * @param handle rocfft_handle to attach the workspace to (copied into the host task)
-     * @param buffer_workspace Buffer to use as workspace
-    **/
-    void set_buffer_workspace(rocfft_handle& handle, sycl::buffer<scalar_type>& buffer_workspace) {
-        const auto element_count = buffer_workspace.size();
-        if (element_count == 0) {
-            return; // Nothing to do.
-        }
-        auto workspace_bytes = element_count * sizeof(scalar_type);
-
-        this->get_queue().submit([&](sycl::handler& cgh) {
-            auto workspace_acc =
-                buffer_workspace.template get_access<sycl::access::mode::read_write>(cgh);
-            cgh.host_task([=](sycl::interop_handle ih) {
-                void* native_ptr = ih.get_native_mem<sycl::backend::ext_oneapi_hip>(workspace_acc);
-                auto workspace_native = reinterpret_cast<scalar_type*>(native_ptr);
-                set_workspace_impl(handle, workspace_native, workspace_bytes, "set_workspace");
-            });
-        });
-        this->get_queue().wait_and_throw();
-    }
-
-    /** Use a user-provided SYCL buffer as the rocFFT work buffer.
-     *
-     * The buffer is first handed to external_workspace_helper_.set_workspace_throw(). Then, for each
-     * direction that has a plan, any buffer recorded in the handle is freed and the buffer's native
-     * memory is registered through set_buffer_workspace().
-     *
-     * The required workspace size is not queried here: set_buffer_workspace() derives the byte
-     * count from the buffer itself, so the previously computed (and unused) element count was
-     * removed.
-     *
-     * @param buffer_workspace Buffer to use as workspace
-    **/
-    virtual void set_workspace(sycl::buffer<scalar_type>& buffer_workspace) override {
-        this->external_workspace_helper_.set_workspace_throw(*this, buffer_workspace);
-        if (handles[0].plan) {
-            free_internal_workspace_if_rqd(handles[0], "set_workspace");
-            set_buffer_workspace(handles[0], buffer_workspace);
-        }
-        if (handles[1].plan) {
-            free_internal_workspace_if_rqd(handles[1], "set_workspace");
-            set_buffer_workspace(handles[1], buffer_workspace);
-        }
-    }
-
-    /** Query rocFFT for the work buffer size of a plan.
-     *
-     * @param plan rocFFT plan to query; must not be null
-     * @return Required work buffer size in bytes
-     * @throws mkl::exception if plan is null or the rocFFT query fails
-    **/
-    std::int64_t get_plan_workspace_size_bytes(rocfft_plan_t* plan) {
-        const bool plan_missing = (plan == nullptr);
-        if (plan_missing) {
-            throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                 "Missing internal rocFFT plan.");
-        }
-
-        std::size_t required_bytes;
-        const auto query_status = rocfft_plan_get_work_buffer_size(plan, &required_bytes);
-        if (query_status != rocfft_status_success) {
-            throw mkl::exception("dft/backends/rocfft", __FUNCTION__,
-                                 "Failed to get work buffer size.");
-        }
-        return static_cast<std::int64_t>(required_bytes);
-    }
-
-    /// Return the larger of the forward (index 0) and backward (index 1) plan work buffer sizes
-    /// in bytes; a direction without a plan counts as 0.
-    virtual std::int64_t get_workspace_external_bytes_impl() override {
-        const auto bytes_for = [this](const auto& handle) -> std::int64_t {
-            if (handle.plan) {
-                return get_plan_workspace_size_bytes(*handle.plan);
-            }
-            return 0;
-        };
-        const std::int64_t fwd_bytes = bytes_for(handles[0]);
-        const std::int64_t bwd_bytes = bytes_for(handles[1]);
-        return std::max(fwd_bytes, bwd_bytes);
-    }
-
-#define BACKEND rocfft
-#include "../backend_compute_signature.cxx"
-#undef BACKEND
-};
-} // namespace detail
-
-/// Allocate a rocFFT commit implementation for the given descriptor on the given queue.
-/// The object is created with new and returned as a raw pointer to its commit_impl base.
-template <dft::precision prec, dft::domain dom>
-dft::detail::commit_impl<prec, dom>* create_commit(const dft::detail::descriptor<prec, dom>& desc,
-                                                   sycl::queue& sycl_queue) {
-    using backend_commit = detail::rocfft_commit<prec, dom>;
-    auto* impl = new backend_commit(sycl_queue, desc.get_values());
-    return impl;
-}
-
-// Explicit instantiations of create_commit for the four precision (SINGLE, DOUBLE) x
-// domain (REAL, COMPLEX) combinations.
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>&,
-    sycl::queue&);
-template dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*
-create_commit(
-    const dft::detail::descriptor<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>&,
-    sycl::queue&);
-
-namespace detail {
-/// Read the forward offsets from a commit object.
-/// The pointer is downcast to rocfft_commit without a runtime check, so the caller must pass a
-/// commit that was created by this backend.
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_fwd(dft::detail::commit_impl<prec, dom>* commit) {
-    auto* rocfft_impl = static_cast<rocfft_commit<prec, dom>*>(commit);
-    return rocfft_impl->get_offsets_fwd();
-}
-
-/// Read the backward offsets from a commit object.
-/// The pointer is downcast to rocfft_commit without a runtime check, so the caller must pass a
-/// commit that was created by this backend.
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_bwd(dft::detail::commit_impl<prec, dom>* commit) {
-    auto* rocfft_impl = static_cast<rocfft_commit<prec, dom>*>(commit);
-    return rocfft_impl->get_offsets_bwd();
-}
-
-// Explicit instantiations of get_offsets_fwd for the four precision x domain combinations.
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::SINGLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*);
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_fwd<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*);
-
-// Explicit instantiations of get_offsets_bwd for the same four combinations.
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::SINGLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::SINGLE, dft::detail::domain::COMPLEX>*);
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::REAL>*);
-template std::array<std::int64_t, 2>
-get_offsets_bwd<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>(
-    dft::detail::commit_impl<dft::detail::precision::DOUBLE, dft::detail::domain::COMPLEX>*);
-
-} //namespace detail
-
-} // namespace oneapi::mkl::dft::rocfft
diff --git a/src/dft/backends/rocfft/descriptor.cpp b/src/dft/backends/rocfft/descriptor.cpp
deleted file mode 100644
index 83fdbe1dc..000000000
--- a/src/dft/backends/rocfft/descriptor.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-#include "../../descriptor.cxx"
-
-#include "oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-
-// Commit the descriptor for the rocFFT backend.
-// A new backend implementation is created when there is none yet or when the existing one is
-// bound to a different queue; in the latter case the old queue is waited on first. The current
-// descriptor values are then committed to the implementation.
-template <precision prec, domain dom>
-void descriptor<prec, dom>::commit(backend_selector<backend::rocfft> selector) {
-    const bool needs_new_impl = !pimpl_ || pimpl_->get_queue() != selector.get_queue();
-    if (needs_new_impl) {
-        const bool replacing_existing = static_cast<bool>(pimpl_);
-        if (replacing_existing) {
-            pimpl_->get_queue().wait();
-        }
-        pimpl_.reset(rocfft::create_commit(*this, selector.get_queue()));
-    }
-    pimpl_->commit(values_);
-}
-
-// Explicit instantiations of descriptor::commit for the rocFFT backend selector, covering the
-// four precision (SINGLE, DOUBLE) x domain (COMPLEX, REAL) combinations.
-template void descriptor<precision::SINGLE, domain::COMPLEX>::commit(
-    backend_selector<backend::rocfft>);
-template void descriptor<precision::SINGLE, domain::REAL>::commit(
-    backend_selector<backend::rocfft>);
-template void descriptor<precision::DOUBLE, domain::COMPLEX>::commit(
-    backend_selector<backend::rocfft>);
-template void descriptor<precision::DOUBLE, domain::REAL>::commit(
-    backend_selector<backend::rocfft>);
-
-} //namespace dft
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/dft/backends/rocfft/execute_helper.hpp b/src/dft/backends/rocfft/execute_helper.hpp
deleted file mode 100644
index 4dff6831d..000000000
--- a/src/dft/backends/rocfft/execute_helper.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_SRC_ROCFFT_EXECUTE_HELPER_HPP_
-#define _ONEMKL_DFT_SRC_ROCFFT_EXECUTE_HELPER_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/dft/detail/commit_impl.hpp"
-#include "oneapi/mkl/dft/detail/descriptor_impl.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-
-#include <hip/hip_runtime.h>
-#include <rocfft.h>
-
-namespace oneapi::mkl::dft::rocfft::detail {
-
-/// Return the commit object of a descriptor after checking it belongs to this backend.
-/// @throws mkl::invalid_argument if the descriptor has no commit object or the commit object
-/// reports a backend other than rocfft.
-template <dft::precision prec, dft::domain dom>
-inline dft::detail::commit_impl<prec, dom> *checked_get_commit(
-    dft::detail::descriptor<prec, dom> &desc) {
-    auto commit_handle = dft::detail::get_commit(desc);
-    const bool committed_for_rocfft =
-        commit_handle != nullptr && commit_handle->get_backend() == backend::rocfft;
-    if (committed_for_rocfft) {
-        return commit_handle;
-    }
-    throw mkl::invalid_argument("dft/backends/rocfft", "get_commit",
-                                "DFT descriptor has not been commited for rocFFT");
-}
-
-/// Check a runtime configuration value of a descriptor.
-/// Reads Param from the descriptor and throws an mkl::invalid_argument carrying the given
-/// message if the value differs from Expected.
-template <dft::config_param Param, dft::config_value Expected, typename DescT>
-inline auto expect_config(DescT &desc, const char *message) {
-    dft::config_value actual{ 0 };
-    desc.get_value(Param, &actual);
-    const bool matches = !(actual != Expected);
-    if (matches) {
-        return;
-    }
-    throw mkl::invalid_argument("dft/backends/rocfft", "expect_config", message);
-}
-
-/// Return the HIP native memory behind an accessor as an untyped pointer.
-template <typename Acc>
-inline void *native_mem(sycl::interop_handle &ih, Acc &buf) {
-    void *device_ptr = ih.get_native_mem<sycl::backend::ext_oneapi_hip>(buf);
-    return device_ptr;
-}
-
-/// Attach the HIP stream of the interop handle's native queue to a rocFFT execution info.
-/// @param func Name reported in the exception on failure
-/// @return The native HIP stream that was attached
-/// @throws oneapi::mkl::exception if rocfft_execution_info_set_stream does not succeed
-inline hipStream_t setup_stream(const std::string &func, sycl::interop_handle &ih,
-                                rocfft_execution_info info) {
-    auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
-    const auto status = rocfft_execution_info_set_stream(info, stream);
-    if (status == rocfft_status_success) {
-        return stream;
-    }
-    throw oneapi::mkl::exception(
-        "dft/backends/rocfft", func,
-        "rocfft_execution_info_set_stream returned " + std::to_string(status));
-}
-
-/// Call hipStreamSynchronize on the given stream and turn a failure into an exception.
-/// @param func Name reported in the exception on failure
-/// @throws oneapi::mkl::exception if hipStreamSynchronize does not return hipSuccess
-inline void sync_checked(const std::string &func, hipStream_t stream) {
-    const auto status = hipStreamSynchronize(stream);
-    if (status == hipSuccess) {
-        return;
-    }
-    throw oneapi::mkl::exception("dft/backends/rocfft", func,
-                                 "hipStreamSynchronize returned " + std::to_string(status));
-}
-
-/// Run rocfft_execute with the given plan, buffers and execution info, and turn a failure into
-/// an exception.
-/// @param func Name reported in the exception on failure
-/// @throws oneapi::mkl::exception if rocfft_execute does not return rocfft_status_success
-inline void execute_checked(const std::string &func, const rocfft_plan plan, void *in_buffer[],
-                            void *out_buffer[], rocfft_execution_info info) {
-    const auto status = rocfft_execute(plan, in_buffer, out_buffer, info);
-    if (status == rocfft_status_success) {
-        return;
-    }
-    throw oneapi::mkl::exception("dft/backends/rocfft", func,
-                                 "rocfft_execute returned " + std::to_string(status));
-}
-
-} // namespace oneapi::mkl::dft::rocfft::detail
-
-#endif
diff --git a/src/dft/backends/rocfft/forward.cpp b/src/dft/backends/rocfft/forward.cpp
deleted file mode 100644
index 70d3d0f97..000000000
--- a/src/dft/backends/rocfft/forward.cpp
+++ /dev/null
@@ -1,358 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <type_traits>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/exceptions.hpp"
-
-#include "oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp"
-#include "oneapi/mkl/dft/descriptor.hpp"
-
-#include "execute_helper.hpp"
-#include "rocfft_handle.hpp"
-
-#include <rocfft.h>
-#include <hip/hip_runtime_api.h>
-
-namespace oneapi::mkl::dft::rocfft {
-
-namespace detail {
-//forward declaration
-template <dft::precision prec, dft::domain dom>
-std::array<std::int64_t, 2> get_offsets_fwd(dft::detail::commit_impl<prec, dom> *commit);
-
-/// Fetch the forward plan, i.e. the plan stored in element 0 of the commit's handle array.
-/// Uses .value(), so a missing plan is reported by that accessor rather than returned.
-template <dft::precision prec, dft::domain dom>
-rocfft_plan get_fwd_plan(dft::detail::commit_impl<prec, dom> *commit) {
-    auto *handle_array = static_cast<rocfft_handle *>(commit->get_handle());
-    return handle_array[0].plan.value();
-}
-
-/// Fetch the forward execution info, i.e. the info stored in element 0 of the commit's handle
-/// array. Uses .value(), so a missing info is reported by that accessor rather than returned.
-template <dft::precision prec, dft::domain dom>
-rocfft_execution_info get_fwd_info(dft::detail::commit_impl<prec, dom> *commit) {
-    auto *handle_array = static_cast<rocfft_handle *>(commit->get_handle());
-    return handle_array[0].info.value();
-}
-} // namespace detail
-
-// BUFFER version
-
-//In-place transform
-// Checks that the descriptor is configured INPLACE and committed for rocFFT, then submits a host
-// task that runs the forward plan on the buffer's native memory and synchronizes the HIP stream.
-// Throws oneapi::mkl::unimplemented when the input and output offsets differ.
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<fwd<descriptor_type>, 1> &inout) {
-    using fwd_elem_t = fwd<descriptor_type>;
-
-    const std::string func_name = "compute_forward(desc, inout)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    // offsets[1] is supplied in complex elements, but for a real forward type the pointer below
-    // is offset in scalar elements, so bring both offsets to the same unit before comparing.
-    if constexpr (std::is_floating_point_v<fwd_elem_t>) {
-        offsets[1] *= 2;
-    }
-    const bool offsets_agree = (offsets[0] == offsets[1]);
-    if (!offsets_agree) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-    const std::int64_t elem_offset = offsets[0];
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto inout_acc = inout.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            auto *base = reinterpret_cast<fwd_elem_t *>(detail::native_mem(ih, inout_acc));
-            void *inout_native = base + elem_offset;
-            // A null output array is passed for the in-place transform.
-            detail::execute_checked(func_name, plan, &inout_native, nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-// Takes separate real and imaginary buffers. Submits a host task that runs the forward plan on
-// the native memory of both buffers and synchronizes the HIP stream.
-// Throws oneapi::mkl::unimplemented when the input and output offsets differ.
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &inout_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &inout_im) {
-    using real_t = scalar<descriptor_type>;
-
-    const std::string func_name = "compute_forward(desc, inout_re, inout_im)";
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    const bool offsets_agree = (offsets[0] == offsets[1]);
-    if (!offsets_agree) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-    const std::int64_t elem_offset = offsets[0];
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto inout_re_acc = inout_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto inout_im_acc = inout_im.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            auto *re_base = reinterpret_cast<real_t *>(detail::native_mem(ih, inout_re_acc));
-            auto *im_base = reinterpret_cast<real_t *>(detail::native_mem(ih, inout_im_acc));
-            // Both parts use the same element offset.
-            std::array<void *, 2> inout_native{ re_base + elem_offset, im_base + elem_offset };
-            detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//Out-of-place transform
-// Checks that the descriptor is configured NOT_INPLACE and committed for rocFFT, then submits a
-// host task that runs the forward plan from the input buffer to the output buffer and
-// synchronizes the HIP stream. The input pointer is shifted by offsets[0] forward-domain
-// elements and the output pointer by offsets[1] backward-domain elements.
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer<fwd<descriptor_type>, 1> &in,
-                                   sycl::buffer<bwd<descriptor_type>, 1> &out) {
-    using fwd_elem_t = fwd<descriptor_type>;
-    using bwd_elem_t = bwd<descriptor_type>;
-
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-    const std::int64_t in_offset = offsets[0];
-    const std::int64_t out_offset = offsets[1];
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto in_acc = in.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_acc = out.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name = "compute_forward(desc, in, out)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            auto *in_base = reinterpret_cast<fwd_elem_t *>(detail::native_mem(ih, in_acc));
-            void *in_native = in_base + in_offset;
-            auto *out_base = reinterpret_cast<bwd_elem_t *>(detail::native_mem(ih, out_acc));
-            void *out_native = out_base + out_offset;
-            detail::execute_checked(func_name, plan, &in_native, &out_native, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-// Takes separate real and imaginary buffers for input and output. Submits a host task that runs
-// the forward plan on the native memory of the four buffers and synchronizes the HIP stream.
-// Both input parts are shifted by offsets[0] and both output parts by offsets[1].
-template <typename descriptor_type>
-ONEMKL_EXPORT void compute_forward(descriptor_type &desc,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &in_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &in_im,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &out_re,
-                                   sycl::buffer<scalar<descriptor_type>, 1> &out_im) {
-    using real_t = scalar<descriptor_type>;
-
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-    const std::int64_t in_offset = offsets[0];
-    const std::int64_t out_offset = offsets[1];
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto in_re_acc = in_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto in_im_acc = in_im.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_re_acc = out_re.template get_access<sycl::access::mode::read_write>(cgh);
-        auto out_im_acc = out_im.template get_access<sycl::access::mode::read_write>(cgh);
-        commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name = "compute_forward(desc, in_re, in_im, out_re, out_im)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            auto *in_re_base = reinterpret_cast<real_t *>(detail::native_mem(ih, in_re_acc));
-            auto *in_im_base = reinterpret_cast<real_t *>(detail::native_mem(ih, in_im_acc));
-            std::array<void *, 2> in_native{ in_re_base + in_offset, in_im_base + in_offset };
-
-            auto *out_re_base = reinterpret_cast<real_t *>(detail::native_mem(ih, out_re_acc));
-            auto *out_im_base = reinterpret_cast<real_t *>(detail::native_mem(ih, out_im_acc));
-            std::array<void *, 2> out_native{ out_re_base + out_offset,
-                                              out_im_base + out_offset };
-
-            detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-}
-
-//USM version
-
-//In-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *inout,
-                                          const std::vector<sycl::event> &deps) {
-    const std::string func_name = "compute_forward(desc, inout, deps)";
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    if constexpr (std::is_floating_point_v<fwd<descriptor_type>>) {
-        offsets[1] *= 2; // offset is supplied in complex but we offset scalar pointer
-    }
-    if (offsets[0] != offsets[1]) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-    inout += offsets[0];
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            void *inout_ptr = inout;
-            detail::execute_checked(func_name, plan, &inout_ptr, nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar<descriptor_type> *inout_re,
-                                          scalar<descriptor_type> *inout_im,
-                                          const std::vector<sycl::event> &deps) {
-    const std::string func_name = "compute_forward(desc, inout_re, inout_im, deps)";
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    if (offsets[0] != offsets[1]) {
-        throw oneapi::mkl::unimplemented(
-            "DFT", func_name,
-            "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!");
-    }
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-        cgh.host_task([=](sycl::interop_handle ih) {
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            std::array<void *, 2> inout_native{ inout_re + offsets[0], inout_im + offsets[0] };
-            detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//Out-of-place transform
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd<descriptor_type> *in,
-                                          bwd<descriptor_type> *out,
-                                          const std::vector<sycl::event> &deps) {
-    detail::expect_config<dft::config_param::PLACEMENT, dft::config_value::NOT_INPLACE>(
-        desc, "Unexpected value for placement");
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    in += offsets[0];
-    out += offsets[1];
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name = "compute_forward(desc, in, out, deps)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            void *in_ptr = in;
-            void *out_ptr = out;
-            detail::execute_checked(func_name, plan, &in_ptr, &out_ptr, info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-//Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format
-template <typename descriptor_type>
-ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar<descriptor_type> *in_re,
-                                          scalar<descriptor_type> *in_im,
-                                          scalar<descriptor_type> *out_re,
-                                          scalar<descriptor_type> *out_im,
-                                          const std::vector<sycl::event> &deps) {
-    auto commit = detail::checked_get_commit(desc);
-    auto queue = commit->get_queue();
-    auto plan = detail::get_fwd_plan(commit);
-    auto info = detail::get_fwd_info(commit);
-    auto offsets = detail::get_offsets_fwd(commit);
-
-    sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(deps);
-        commit->depend_on_last_usm_workspace_event_if_rqd(cgh);
-
-        cgh.host_task([=](sycl::interop_handle ih) {
-            const std::string func_name =
-                "compute_forward(desc, in_re, in_im, out_re, out_im, deps)";
-            auto stream = detail::setup_stream(func_name, ih, info);
-
-            std::array<void *, 2> in_native{ in_re + offsets[0], in_im + offsets[0] };
-            std::array<void *, 2> out_native{ out_re + offsets[1], out_im + offsets[1] };
-            detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info);
-            detail::sync_checked(func_name, stream);
-        });
-    });
-    commit->set_last_usm_workspace_event_if_rqd(sycl_event);
-    return sycl_event;
-}
-
-// Template function instantiations
-#include "dft/backends/backend_forward_instantiations.cxx"
-
-} // namespace oneapi::mkl::dft::rocfft
diff --git a/src/dft/backends/rocfft/mkl_dft_rocfft_wrappers.cpp b/src/dft/backends/rocfft/mkl_dft_rocfft_wrappers.cpp
deleted file mode 100644
index c8f0e35c7..000000000
--- a/src/dft/backends/rocfft/mkl_dft_rocfft_wrappers.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/detail/rocfft/onemkl_dft_rocfft.hpp"
-#include "dft/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-#define BACKEND         rocfft
-
-extern "C" dft_function_table_t mkl_dft_table = {
-    WRAPPER_VERSION,
-#include "dft/backends/backend_wrappers.cxx"
-};
-
-#undef WRAPPER_VERSION
-#undef BACKEND
diff --git a/src/dft/backends/rocfft/rocfft_handle.hpp b/src/dft/backends/rocfft/rocfft_handle.hpp
deleted file mode 100644
index ea4f44d68..000000000
--- a/src/dft/backends/rocfft/rocfft_handle.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DFT_SRC_ROCFFT_ROCFFT_HANDLE_HPP_
-#define _ONEMKL_DFT_SRC_ROCFFT_ROCFFT_HANDLE_HPP_
-
-#include <optional>
-
-struct rocfft_plan_t;
-struct rocfft_execution_info_t;
-
-struct rocfft_handle {
-    std::optional<rocfft_plan_t*> plan = std::nullopt;
-    std::optional<rocfft_execution_info_t*> info = std::nullopt;
-    std::optional<void*> buffer = std::nullopt;
-};
-
-#endif
diff --git a/src/dft/backends/stride_helper.hpp b/src/dft/backends/stride_helper.hpp
deleted file mode 100644
index 6c3146c99..000000000
--- a/src/dft/backends/stride_helper.hpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/*******************************************************************************
-* Copyright 2024 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _DFT_DETAIL_STRIDE_HELPER_HPP_
-#define _DFT_DETAIL_STRIDE_HELPER_HPP_
-
-namespace oneapi::mkl::dft::detail {
-
-enum class stride_api {
-    INVALID, // Cannot choose: no valid choice
-    FB_STRIDES, // Use FWD_STRIDES and BWD_STRIDES
-    IO_STRIDES // Use INPUT_STRIDES and OUTPUT_STRIDES
-};
-
-/** Throw invalid_argument for stride_api::INVALID
- *  @param function Function name to include in exception.
- *  @param stride_choice The stride_api to check if INVALID. Default is INVALID.
- * 
- *  @throws invalid_argument on stride_api::INVALID.
- */
-inline void throw_on_invalid_stride_api(const char* function,
-                                        stride_api stride_choice = stride_api::INVALID) {
-    if (stride_choice == stride_api::INVALID) {
-        throw mkl::invalid_argument(
-            "DFT", function,
-            "Invalid INPUT/OUTPUT or FWD/BACKWARD strides. API usage may have been mixed.");
-    }
-}
-
-// Helper class for mapping input / output strides for backend DFTs to config values.
-// Intended to be abused as required for each backend.
-template <typename StrideElemT>
-struct stride_vectors {
-    using stride_elem_t = StrideElemT;
-    using stride_vec_t = std::vector<StrideElemT>;
-
-    // The stride API being used.
-    const stride_api stride_choice;
-
-    // The storage for strides. vec_a is forward or input.
-    stride_vec_t vec_a, vec_b;
-
-    // Input and output strides for forward and backward DFTs.
-    stride_vec_t &fwd_in, &fwd_out, &bwd_in, &bwd_out;
-
-    // Input and output offsets for forward and backward DFTs.
-    StrideElemT offset_fwd_in, offset_fwd_out, offset_bwd_in, offset_bwd_out;
-
-    /** Initialize the forward / backwards input and output strides for this object.
-    *  @tparam ConfigT The config values type.
-    *  @param config_values The DFT config values.
-    *  @param stride_api The stride API choice. Must not be INVALID.
-    **/
-    template <typename ConfigT>
-    stride_vectors(const ConfigT& config_values, stride_api stride_choice)
-            : stride_choice(stride_choice),
-              fwd_in(vec_a),
-              fwd_out(vec_b),
-              bwd_in(stride_choice == stride_api::FB_STRIDES ? vec_b : vec_a),
-              bwd_out(stride_choice == stride_api::FB_STRIDES ? vec_a : vec_b) {
-        if (stride_choice == stride_api::INVALID) {
-            throw mkl::exception("DFT", "detail::stride_vector constructor",
-                                 "Internal error: invalid stride API");
-        }
-        auto& v1 = stride_choice == stride_api::FB_STRIDES ? config_values.fwd_strides
-                                                           : config_values.input_strides;
-        auto& v2 = stride_choice == stride_api::FB_STRIDES ? config_values.bwd_strides
-                                                           : config_values.output_strides;
-
-        vec_a.resize(v1.size());
-        vec_b.resize(v2.size());
-        for (std::size_t i{ 0 }; i < v1.size(); ++i) { // v1.size() == v2.size()
-            if constexpr (std::is_unsigned_v<StrideElemT>) {
-                if (v1[i] < 0 || v2[i] < 0) {
-                    throw mkl::unimplemented("DFT", "commit",
-                                             "Backend does not support negative strides.");
-                }
-            }
-            vec_a[i] = static_cast<StrideElemT>(v1[i]);
-            vec_b[i] = static_cast<StrideElemT>(v2[i]);
-        }
-        offset_fwd_in = fwd_in[0];
-        offset_fwd_out = fwd_out[0];
-        offset_bwd_in = bwd_in[0];
-        offset_bwd_out = bwd_out[0];
-    }
-};
-
-/** Determines whether INPUT/OUTPUT strides, or FWD/BWD strides API is used.
- *  @tparam ConfigT The config values type.
- *  @param config_values The DFT config values.
- *  @returns Stride choice. INVALID if the choice could not be determined.
- * 
- *  @note This does not attempt to determine that the set strides are valid.
- */
-template <typename ConfigT>
-inline stride_api get_stride_api(const ConfigT& config_values) {
-    auto n = config_values.dimensions.size();
-    // Test if FWD/BWD strides look like they should be used. If yes, use them.
-    if (config_values.fwd_strides.size() == n + 1 && config_values.bwd_strides.size() == n + 1) {
-        auto all_zero_fwd = true;
-        auto all_zero_bwd = true;
-        // If INPUT or OUTPUT have been set, these will be zeroed.
-        for (auto v : config_values.fwd_strides) {
-            all_zero_fwd = v == 0 && all_zero_fwd;
-        }
-        for (auto v : config_values.bwd_strides) {
-            all_zero_bwd = v == 0 && all_zero_bwd;
-        }
-        if (!all_zero_fwd && !all_zero_bwd) { // Both must be non-zero.
-            return stride_api::FB_STRIDES;
-        }
-    }
-    // FWD/BWD invalid. Test INPUT/OUTPUT for validity.
-    if (config_values.input_strides.size() == n + 1 &&
-        config_values.output_strides.size() == n + 1) {
-        auto all_zero_in = true;
-        auto all_zero_out = true;
-        // If FWD or BWD have been set, these will be zeroed.
-        for (auto v : config_values.input_strides) {
-            all_zero_in = v == 0 && all_zero_in;
-        }
-        for (auto v : config_values.output_strides) {
-            all_zero_out = v == 0 && all_zero_out;
-        }
-        if (!all_zero_in && !all_zero_out) { // Both must be non-zero.
-            return stride_api::IO_STRIDES;
-        }
-    }
-    return stride_api::INVALID;
-}
-
-} // namespace oneapi::mkl::dft::detail
-
-#endif //_DFT_DETAIL_STRIDE_HELPER_HPP_
diff --git a/src/dft/descriptor.cxx b/src/dft/descriptor.cxx
deleted file mode 100644
index a9acd3b9e..000000000
--- a/src/dft/descriptor.cxx
+++ /dev/null
@@ -1,297 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-#include <cstdarg>
-
-#include "oneapi/mkl/detail/exceptions.hpp"
-#include "oneapi/mkl/dft/descriptor.hpp"
-
-#include "dft/descriptor_config_helper.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace detail {
-
-// Compute the default strides. Modifies real_strides and complex_strides arguments.
-inline void compute_default_strides(const std::vector<std::int64_t>& dimensions,
-                                    std::vector<std::int64_t>& fwd_strides,
-                                    std::vector<std::int64_t>& bwd_strides) {
-    auto rank = dimensions.size();
-    std::vector<std::int64_t> strides(rank + 1, 1);
-    for (auto i = rank - 1; i > 0; --i) {
-        strides[i] = strides[i + 1] * dimensions[i];
-    }
-    strides[0] = 0;
-    // Fwd/Bwd strides and Input/Output strides being the same by default means
-    // that we don't have to specify if we default to using fwd/bwd strides or
-    // input/output strides.
-    bwd_strides = strides;
-    fwd_strides = std::move(strides);
-}
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::set_value(config_param param, ...) {
-    va_list vl;
-    va_start(vl, param);
-    switch (param) {
-        case config_param::FORWARD_DOMAIN:
-            throw mkl::invalid_argument("DFT", "set_value", "Read-only parameter.");
-            break;
-        case config_param::DIMENSION:
-            throw mkl::invalid_argument("DFT", "set_value", "Read-only parameter.");
-            break;
-        case config_param::LENGTHS: {
-            if (values_.dimensions.size() == 1) {
-                std::int64_t length = va_arg(vl, std::int64_t);
-                detail::set_value<config_param::LENGTHS>(values_, &length);
-            }
-            else {
-                detail::set_value<config_param::LENGTHS>(values_, va_arg(vl, std::int64_t*));
-            }
-            break;
-        }
-        case config_param::PRECISION:
-            throw mkl::invalid_argument("DFT", "set_value", "Read-only parameter.");
-            break;
-        case config_param::INPUT_STRIDES:
-            detail::set_value<config_param::INPUT_STRIDES>(values_, va_arg(vl, std::int64_t*));
-            break;
-        case config_param::OUTPUT_STRIDES:
-            detail::set_value<config_param::OUTPUT_STRIDES>(values_, va_arg(vl, std::int64_t*));
-            break;
-        case config_param::FWD_STRIDES:
-            detail::set_value<config_param::FWD_STRIDES>(values_, va_arg(vl, std::int64_t*));
-            break;
-        case config_param::BWD_STRIDES:
-            detail::set_value<config_param::BWD_STRIDES>(values_, va_arg(vl, std::int64_t*));
-            break;
-        // VA arg promotes float args to double, so the following is always double:
-        case config_param::FORWARD_SCALE:
-            detail::set_value<config_param::FORWARD_SCALE>(values_,
-                                                           static_cast<real_t>(va_arg(vl, double)));
-            break;
-        case config_param::BACKWARD_SCALE:
-            detail::set_value<config_param::BACKWARD_SCALE>(
-                values_, static_cast<real_t>(va_arg(vl, double)));
-            break;
-        case config_param::NUMBER_OF_TRANSFORMS:
-            detail::set_value<config_param::NUMBER_OF_TRANSFORMS>(values_,
-                                                                  va_arg(vl, std::int64_t));
-            break;
-        case config_param::FWD_DISTANCE:
-            detail::set_value<config_param::FWD_DISTANCE>(values_, va_arg(vl, std::int64_t));
-            break;
-        case config_param::BWD_DISTANCE:
-            detail::set_value<config_param::BWD_DISTANCE>(values_, va_arg(vl, std::int64_t));
-            break;
-        case config_param::PLACEMENT:
-            detail::set_value<config_param::PLACEMENT>(values_, va_arg(vl, config_value));
-            break;
-        case config_param::COMPLEX_STORAGE:
-            detail::set_value<config_param::COMPLEX_STORAGE>(values_, va_arg(vl, config_value));
-            break;
-        case config_param::REAL_STORAGE:
-            detail::set_value<config_param::REAL_STORAGE>(values_, va_arg(vl, config_value));
-            break;
-        case config_param::CONJUGATE_EVEN_STORAGE:
-            detail::set_value<config_param::CONJUGATE_EVEN_STORAGE>(values_,
-                                                                    va_arg(vl, config_value));
-            break;
-        case config_param::ORDERING:
-            detail::set_value<config_param::ORDERING>(values_, va_arg(vl, config_value));
-            break;
-        case config_param::TRANSPOSE:
-            detail::set_value<config_param::TRANSPOSE>(values_, va_arg(vl, int));
-            break;
-        case config_param::WORKSPACE:
-            detail::set_value<config_param::WORKSPACE>(values_, va_arg(vl, config_value));
-            break;
-        case config_param::WORKSPACE_PLACEMENT:
-            detail::set_value<config_param::WORKSPACE_PLACEMENT>(values_, va_arg(vl, config_value));
-            break;
-        case config_param::WORKSPACE_EXTERNAL_BYTES:
-            throw mkl::invalid_argument("DFT", "set_value", "Read-only parameter.");
-            break;
-        case config_param::PACKED_FORMAT:
-            detail::set_value<config_param::PACKED_FORMAT>(values_, va_arg(vl, config_value));
-            break;
-        case config_param::COMMIT_STATUS:
-            throw mkl::invalid_argument("DFT", "set_value", "Read-only parameter.");
-            break;
-        default: throw mkl::invalid_argument("DFT", "set_value", "Invalid config_param argument.");
-    }
-    va_end(vl);
-}
-
-template <precision prec, domain dom>
-descriptor<prec, dom>::descriptor(std::vector<std::int64_t> dimensions) {
-    if (dimensions.size() == 0) {
-        throw mkl::invalid_argument("DFT", "descriptor", "Cannot have 0 dimensional DFT.");
-    }
-    for (const auto& dim : dimensions) {
-        if (dim <= 0) {
-            throw mkl::invalid_argument("DFT", "descriptor",
-                                        "Invalid dimension value (negative or 0).");
-        }
-    }
-    compute_default_strides(dimensions, values_.fwd_strides, values_.bwd_strides);
-    // Assume forward transform.
-    values_.input_strides = values_.fwd_strides;
-    values_.output_strides = values_.bwd_strides;
-    values_.bwd_scale = real_t(1.0);
-    values_.fwd_scale = real_t(1.0);
-    values_.number_of_transforms = 1;
-    values_.fwd_dist = 1;
-    values_.bwd_dist = 1;
-    values_.placement = config_value::INPLACE;
-    values_.complex_storage = config_value::COMPLEX_COMPLEX;
-    values_.real_storage = config_value::REAL_REAL;
-    values_.conj_even_storage = config_value::COMPLEX_COMPLEX;
-    values_.workspace = config_value::ALLOW;
-    values_.workspace_placement = config_value::WORKSPACE_AUTOMATIC;
-    values_.ordering = config_value::ORDERED;
-    values_.transpose = false;
-    values_.packed_format = config_value::CCE_FORMAT;
-    values_.dimensions = std::move(dimensions);
-}
-
-template <precision prec, domain dom>
-descriptor<prec, dom>::descriptor(std::int64_t length)
-        : descriptor<prec, dom>(std::vector<std::int64_t>{ length }) {}
-
-template <precision prec, domain dom>
-descriptor<prec, dom>::descriptor(descriptor<prec, dom>&& other) = default;
-
-template <precision prec, domain dom>
-descriptor<prec, dom>& descriptor<prec, dom>::operator=(descriptor<prec, dom>&&) = default;
-
-template <precision prec, domain dom>
-descriptor<prec, dom>::~descriptor() = default;
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::get_value(config_param param, ...) const {
-    va_list vl;
-    va_start(vl, param);
-    if (va_arg(vl, void*) == nullptr) {
-        throw mkl::invalid_argument("DFT", "get_value", "config_param is nullptr.");
-    }
-    va_end(vl);
-    va_start(vl, param);
-    switch (param) {
-        case config_param::FORWARD_DOMAIN: *va_arg(vl, dft::domain*) = dom; break;
-        case config_param::DIMENSION:
-            *va_arg(vl, std::int64_t*) = static_cast<std::int64_t>(values_.dimensions.size());
-            break;
-        case config_param::LENGTHS:
-            std::copy(values_.dimensions.begin(), values_.dimensions.end(),
-                      va_arg(vl, std::int64_t*));
-            break;
-        case config_param::PRECISION: *va_arg(vl, dft::precision*) = prec; break;
-        case config_param::FORWARD_SCALE:
-            *va_arg(vl, real_t*) = static_cast<real_t>(values_.fwd_scale);
-            break;
-        case config_param::BACKWARD_SCALE:
-            *va_arg(vl, real_t*) = static_cast<real_t>(values_.bwd_scale);
-            break;
-        case config_param::NUMBER_OF_TRANSFORMS:
-            *va_arg(vl, std::int64_t*) = values_.number_of_transforms;
-            break;
-        case config_param::COMPLEX_STORAGE:
-            *va_arg(vl, config_value*) = values_.complex_storage;
-            break;
-        case config_param::REAL_STORAGE: *va_arg(vl, config_value*) = values_.real_storage; break;
-        case config_param::CONJUGATE_EVEN_STORAGE:
-            *va_arg(vl, config_value*) = values_.conj_even_storage;
-            break;
-        case config_param::PLACEMENT: *va_arg(vl, config_value*) = values_.placement; break;
-        case config_param::INPUT_STRIDES:
-            std::copy(values_.input_strides.begin(), values_.input_strides.end(),
-                      va_arg(vl, std::int64_t*));
-            break;
-        case config_param::OUTPUT_STRIDES:
-            std::copy(values_.output_strides.begin(), values_.output_strides.end(),
-                      va_arg(vl, std::int64_t*));
-            break;
-        case config_param::FWD_STRIDES:
-            std::copy(values_.fwd_strides.begin(), values_.fwd_strides.end(),
-                      va_arg(vl, std::int64_t*));
-            break;
-        case config_param::BWD_STRIDES:
-            std::copy(values_.bwd_strides.begin(), values_.bwd_strides.end(),
-                      va_arg(vl, std::int64_t*));
-            break;
-        case config_param::FWD_DISTANCE: *va_arg(vl, std::int64_t*) = values_.fwd_dist; break;
-        case config_param::BWD_DISTANCE: *va_arg(vl, std::int64_t*) = values_.bwd_dist; break;
-        case config_param::WORKSPACE: *va_arg(vl, config_value*) = values_.workspace; break;
-        case config_param::WORKSPACE_PLACEMENT:
-            *va_arg(vl, config_value*) = values_.workspace_placement;
-            break;
-        case config_param::WORKSPACE_EXTERNAL_BYTES:
-            if (!pimpl_) {
-                throw mkl::invalid_argument(
-                    "DFT", "get_value",
-                    "Cannot query WORKSPACE_EXTERNAL_BYTES on uncommitted descriptor.");
-            }
-            else {
-                *va_arg(vl, std::int64_t*) = pimpl_->get_workspace_external_bytes();
-            }
-            break;
-        case config_param::ORDERING: *va_arg(vl, config_value*) = values_.ordering; break;
-        case config_param::TRANSPOSE: *va_arg(vl, int*) = values_.transpose; break;
-        case config_param::PACKED_FORMAT: *va_arg(vl, config_value*) = values_.packed_format; break;
-        case config_param::COMMIT_STATUS:
-            *va_arg(vl, config_value*) =
-                pimpl_ ? config_value::COMMITTED : config_value::UNCOMMITTED;
-            break;
-        default: throw mkl::invalid_argument("DFT", "get_value", "Invalid config_param argument.");
-    }
-    va_end(vl);
-}
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::set_workspace(scalar_type* usm_workspace) {
-    if (pimpl_) {
-        return pimpl_->set_workspace(usm_workspace);
-    }
-    else {
-        throw mkl::uninitialized("DFT", "set_workspace",
-                                 "Can only set workspace on committed descriptor.");
-    }
-}
-
-template <precision prec, domain dom>
-void descriptor<prec, dom>::set_workspace(sycl::buffer<scalar_type>& buffer_workspace) {
-    if (pimpl_) {
-        return pimpl_->set_workspace(buffer_workspace);
-    }
-    else {
-        throw mkl::uninitialized("DFT", "set_workspace",
-                                 "Can only set workspace on committed descriptor.");
-    }
-}
-
-template class descriptor<precision::SINGLE, domain::COMPLEX>;
-template class descriptor<precision::SINGLE, domain::REAL>;
-template class descriptor<precision::DOUBLE, domain::COMPLEX>;
-template class descriptor<precision::DOUBLE, domain::REAL>;
-
-} //namespace detail
-} //namespace dft
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/dft/descriptor_config_helper.hpp b/src/dft/descriptor_config_helper.hpp
deleted file mode 100644
index dc8c97ac2..000000000
--- a/src/dft/descriptor_config_helper.hpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/*******************************************************************************
-* Copyright Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_DETAIL_DESCRIPTOR_CONFIG_HELPER_HPP_
-#define _ONEMKL_DETAIL_DESCRIPTOR_CONFIG_HELPER_HPP_
-
-#include <cstdint>
-#include <type_traits>
-
-#include "oneapi/mkl/dft/descriptor.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace dft {
-namespace detail {
-
-/** Helper: sets both input vectors to zeros.
- *  Used for enforcing consistency when using FWD/BWD_STRIDES and
- *  INPUT/OUTPUT_STRIDES.
- */
-static void reset_strides_to_zero(std::vector<std::int64_t>& v1, std::vector<std::int64_t>& v2) {
-    for (auto& v : v1) {
-        v = 0;
-    }
-    for (auto& v : v2) {
-        v = 0;
-    }
-}
-
-/// Helper to get real type from precision.
-template <precision Prec>
-struct real_helper;
-
-template <>
-struct real_helper<precision::SINGLE> {
-    using type = float;
-};
-
-template <>
-struct real_helper<precision::DOUBLE> {
-    using type = double;
-};
-
-template <precision Prec>
-using real_helper_t = typename real_helper<Prec>::type;
-
-/** Helper to get the argument type for a config param.
- * @tparam RealT The real type for the DFT.
- * @tparam Param The config param to get the arg for.
-**/
-template <typename RealT, config_param Param>
-struct param_type_helper;
-
-template <typename RealT, config_param Param>
-using param_type_helper_t = typename param_type_helper<RealT, Param>::type;
-
-#define PARAM_TYPE_HELPER(param, param_type) \
-    template <typename RealT>                \
-    struct param_type_helper<RealT, param> { \
-        using type = param_type;             \
-    };
-PARAM_TYPE_HELPER(config_param::FORWARD_DOMAIN, domain)
-PARAM_TYPE_HELPER(config_param::DIMENSION, std::int64_t)
-PARAM_TYPE_HELPER(config_param::LENGTHS, std::int64_t*)
-PARAM_TYPE_HELPER(config_param::PRECISION, precision)
-PARAM_TYPE_HELPER(config_param::FORWARD_SCALE, RealT)
-PARAM_TYPE_HELPER(config_param::BACKWARD_SCALE, RealT)
-PARAM_TYPE_HELPER(config_param::NUMBER_OF_TRANSFORMS, std::int64_t)
-PARAM_TYPE_HELPER(config_param::COMPLEX_STORAGE, config_value)
-PARAM_TYPE_HELPER(config_param::REAL_STORAGE, config_value)
-PARAM_TYPE_HELPER(config_param::CONJUGATE_EVEN_STORAGE, config_value)
-PARAM_TYPE_HELPER(config_param::PLACEMENT, config_value)
-PARAM_TYPE_HELPER(config_param::INPUT_STRIDES, std::int64_t*)
-PARAM_TYPE_HELPER(config_param::OUTPUT_STRIDES, std::int64_t*)
-PARAM_TYPE_HELPER(config_param::FWD_DISTANCE, std::int64_t)
-PARAM_TYPE_HELPER(config_param::BWD_DISTANCE, std::int64_t)
-PARAM_TYPE_HELPER(config_param::WORKSPACE, config_value)
-PARAM_TYPE_HELPER(config_param::WORKSPACE_PLACEMENT, config_value)
-PARAM_TYPE_HELPER(config_param::WORKSPACE_EXTERNAL_BYTES, std::int64_t)
-PARAM_TYPE_HELPER(config_param::ORDERING, config_value)
-PARAM_TYPE_HELPER(config_param::TRANSPOSE, bool)
-PARAM_TYPE_HELPER(config_param::PACKED_FORMAT, config_value)
-PARAM_TYPE_HELPER(config_param::COMMIT_STATUS, config_value)
-PARAM_TYPE_HELPER(config_param::FWD_STRIDES, std::int64_t*)
-PARAM_TYPE_HELPER(config_param::BWD_STRIDES, std::int64_t*)
-#undef PARAM_TYPE_HELPER
-
-/** Set a value in dft_values, throwing on invalid args.
- * @tparam Param The config param to set.
- * @tparam prec The precision of the DFT.
- * @tparam dom The domain of the DFT.
- * @param vals The struct to update the value in.
- * @param set_val The value to set Param to.
-**/
-template <config_param Param, precision prec, domain dom>
-void set_value(dft_values<prec, dom>& vals,
-               param_type_helper_t<real_helper_t<prec>, Param>&& set_val) {
-    if constexpr (Param == config_param::LENGTHS) {
-        if (set_val == nullptr) {
-            throw mkl::invalid_argument("DFT", "set_value", "Given nullptr.");
-        }
-        for (std::size_t i{ 0 }; i < vals.dimensions.size(); ++i) {
-            if (set_val[i] <= 0) {
-                throw mkl::invalid_argument("DFT", "set_value",
-                                            "Invalid length value (negative or 0).");
-            }
-        }
-        std::copy(set_val, set_val + vals.dimensions.size(), vals.dimensions.begin());
-    }
-    else if constexpr (Param == config_param::PRECISION) {
-        throw mkl::invalid_argument("DFT", "set_value", "Read-only parameter.");
-    }
-    else if constexpr (Param == config_param::FORWARD_SCALE) {
-        vals.fwd_scale = set_val;
-    }
-    else if constexpr (Param == config_param::BACKWARD_SCALE) {
-        vals.bwd_scale = set_val;
-    }
-    else if constexpr (Param == config_param::NUMBER_OF_TRANSFORMS) {
-        if (set_val <= 0) {
-            throw mkl::invalid_argument("DFT", "set_value",
-                                        "Number of transforms must be positive.");
-        }
-        vals.number_of_transforms = set_val;
-    }
-    else if constexpr (Param == config_param::COMPLEX_STORAGE) {
-        if (set_val == config_value::COMPLEX_COMPLEX || set_val == config_value::REAL_REAL) {
-            vals.complex_storage = set_val;
-        }
-        else {
-            throw mkl::invalid_argument("DFT", "set_value",
-                                        "Complex storage must be complex_complex or real_real.");
-        }
-    }
-    else if constexpr (Param == config_param::REAL_STORAGE) {
-        if (set_val == config_value::REAL_REAL) {
-            vals.real_storage = set_val;
-        }
-        else {
-            throw mkl::invalid_argument("DFT", "set_value", "Real storage must be real_real.");
-        }
-    }
-    else if constexpr (Param == config_param::CONJUGATE_EVEN_STORAGE) {
-        if (set_val == config_value::COMPLEX_COMPLEX) {
-            vals.conj_even_storage = set_val;
-        }
-        else {
-            throw mkl::invalid_argument("DFT", "set_value",
-                                        "Conjugate even storage must be complex_complex.");
-        }
-    }
-    else if constexpr (Param == config_param::PLACEMENT) {
-        if (set_val == config_value::INPLACE || set_val == config_value::NOT_INPLACE) {
-            vals.placement = set_val;
-        }
-        else {
-            throw mkl::invalid_argument("DFT", "set_value",
-                                        "Placement must be inplace or not inplace.");
-        }
-    }
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-    else if constexpr (Param == config_param::INPUT_STRIDES) {
-        if (set_val == nullptr) {
-            throw mkl::invalid_argument("DFT", "set_value", "Given nullptr.");
-        }
-        reset_strides_to_zero(vals.fwd_strides, vals.bwd_strides);
-        std::copy(set_val, set_val + vals.dimensions.size() + 1, vals.input_strides.begin());
-    }
-    else if constexpr (Param == config_param::OUTPUT_STRIDES) {
-        if (set_val == nullptr) {
-            throw mkl::invalid_argument("DFT", "set_value", "Given nullptr.");
-        }
-        reset_strides_to_zero(vals.fwd_strides, vals.bwd_strides);
-        std::copy(set_val, set_val + vals.dimensions.size() + 1, vals.output_strides.begin());
-    }
-#pragma clang diagnostic pop
-    else if constexpr (Param == config_param::FWD_DISTANCE) {
-        vals.fwd_dist = set_val;
-    }
-    else if constexpr (Param == config_param::BWD_DISTANCE) {
-        vals.bwd_dist = set_val;
-    }
-    else if constexpr (Param == config_param::WORKSPACE) {
-        if (set_val == config_value::ALLOW || set_val == config_value::AVOID) {
-            vals.workspace = set_val;
-        }
-        else {
-            throw mkl::invalid_argument("DFT", "set_value", "Workspace must be allow or avoid.");
-        }
-    }
-    else if constexpr (Param == config_param::WORKSPACE_PLACEMENT) {
-        if (set_val == config_value::WORKSPACE_AUTOMATIC ||
-            set_val == config_value::WORKSPACE_EXTERNAL) {
-            vals.workspace_placement = set_val;
-        }
-        else {
-            throw mkl::invalid_argument(
-                "DFT", "set_value", "Workspace must be WORKSPACE_AUTOMATIC or WORKSPACE_EXTERNAL.");
-        }
-    }
-    else if constexpr (Param == config_param::WORKSPACE_EXTERNAL_BYTES) {
-        throw mkl::invalid_argument("DFT", "set_value", "Read-only parameter.");
-    }
-    else if constexpr (Param == config_param::ORDERING) {
-        if (set_val == config_value::ORDERED || set_val == config_value::BACKWARD_SCRAMBLED) {
-            vals.ordering = set_val;
-        }
-        else {
-            throw mkl::invalid_argument("DFT", "set_value",
-                                        "Ordering must be ordered or backwards scrambled.");
-        }
-    }
-    else if constexpr (Param == config_param::TRANSPOSE) {
-        vals.transpose = set_val;
-    }
-    else if constexpr (Param == config_param::PACKED_FORMAT) {
-        if (set_val == config_value::CCE_FORMAT) {
-            vals.packed_format = set_val;
-        }
-        else {
-            throw mkl::invalid_argument("DFT", "set_value", "Packed format must be CCE.");
-        }
-    }
-    else if constexpr (Param == config_param::FWD_STRIDES) {
-        if (set_val == nullptr) {
-            throw mkl::invalid_argument("DFT", "set_value", "Given nullptr.");
-        }
-        reset_strides_to_zero(vals.input_strides, vals.output_strides);
-        std::copy(set_val, set_val + vals.dimensions.size() + 1, vals.fwd_strides.begin());
-    }
-    else if constexpr (Param == config_param::BWD_STRIDES) {
-        if (set_val == nullptr) {
-            throw mkl::invalid_argument("DFT", "set_value", "Given nullptr.");
-        }
-        reset_strides_to_zero(vals.input_strides, vals.output_strides);
-        std::copy(set_val, set_val + vals.dimensions.size() + 1, vals.bwd_strides.begin());
-    }
-}
-
-} // namespace detail
-} // namespace dft
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_ONEMKL_DETAIL_DESCRIPTOR_CONFIG_HELPER_HPP_
diff --git a/src/dft/dft_loader.cpp b/src/dft/dft_loader.cpp
deleted file mode 100644
index b0c421fb0..000000000
--- a/src/dft/dft_loader.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/dft/detail/dft_loader.hpp"
-#include "oneapi/mkl/dft/forward.hpp"
-#include "oneapi/mkl/dft/backward.hpp"
-
-#include "function_table_initializer.hpp"
-#include "dft/function_table.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-namespace oneapi::mkl::dft::detail {
-
-static oneapi::mkl::detail::table_initializer<mkl::domain::dft, dft_function_table_t>
-    function_tables;
-
-template <>
-commit_impl<precision::SINGLE, domain::COMPLEX>* create_commit<precision::SINGLE, domain::COMPLEX>(
-    const descriptor<precision::SINGLE, domain::COMPLEX>& desc, sycl::queue& sycl_queue) {
-    auto libkey = get_device_id(sycl_queue);
-    return function_tables[libkey].create_commit_sycl_fz(desc, sycl_queue);
-}
-
-template <>
-commit_impl<precision::DOUBLE, domain::COMPLEX>* create_commit<precision::DOUBLE, domain::COMPLEX>(
-    const descriptor<precision::DOUBLE, domain::COMPLEX>& desc, sycl::queue& sycl_queue) {
-    auto libkey = get_device_id(sycl_queue);
-    return function_tables[libkey].create_commit_sycl_dz(desc, sycl_queue);
-}
-
-template <>
-commit_impl<precision::SINGLE, domain::REAL>* create_commit<precision::SINGLE, domain::REAL>(
-    const descriptor<precision::SINGLE, domain::REAL>& desc, sycl::queue& sycl_queue) {
-    auto libkey = get_device_id(sycl_queue);
-    return function_tables[libkey].create_commit_sycl_fr(desc, sycl_queue);
-}
-
-template <>
-commit_impl<precision::DOUBLE, domain::REAL>* create_commit<precision::DOUBLE, domain::REAL>(
-    const descriptor<precision::DOUBLE, domain::REAL>& desc, sycl::queue& sycl_queue) {
-    auto libkey = get_device_id(sycl_queue);
-    return function_tables[libkey].create_commit_sycl_dr(desc, sycl_queue);
-}
-
-template <precision prec, domain dom>
-inline oneapi::mkl::device get_device(descriptor<prec, dom>& desc, const char* func_name) {
-    config_value is_committed{ config_value::UNCOMMITTED };
-    desc.get_value(config_param::COMMIT_STATUS, &is_committed);
-    if (is_committed != config_value::COMMITTED) {
-        throw mkl::invalid_argument("DFT", func_name, "Descriptor not committed.");
-    }
-    // Committed means that the commit pointer is not null.
-    return get_device_id(get_commit(desc)->get_queue());
-}
-
-} // namespace oneapi::mkl::dft::detail
diff --git a/src/dft/function_table.hpp b/src/dft/function_table.hpp
deleted file mode 100644
index 9146f239e..000000000
--- a/src/dft/function_table.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _DFT_FUNCTION_TABLE_HPP_
-#define _DFT_FUNCTION_TABLE_HPP_
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/dft/types.hpp"
-#include "oneapi/mkl/dft/descriptor.hpp"
-
-typedef struct {
-    int version;
-    oneapi::mkl::dft::detail::commit_impl<oneapi::mkl::dft::precision::SINGLE,
-                                          oneapi::mkl::dft::domain::COMPLEX>* (
-        *create_commit_sycl_fz)(
-        const oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE,
-                                           oneapi::mkl::dft::domain::COMPLEX>& desc,
-        sycl::queue& sycl_queue);
-    oneapi::mkl::dft::detail::commit_impl<oneapi::mkl::dft::precision::DOUBLE,
-                                          oneapi::mkl::dft::domain::COMPLEX>* (
-        *create_commit_sycl_dz)(
-        const oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE,
-                                           oneapi::mkl::dft::domain::COMPLEX>& desc,
-        sycl::queue& sycl_queue);
-    oneapi::mkl::dft::detail::commit_impl<oneapi::mkl::dft::precision::SINGLE,
-                                          oneapi::mkl::dft::domain::REAL>* (*create_commit_sycl_fr)(
-        const oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE,
-                                           oneapi::mkl::dft::domain::REAL>& desc,
-        sycl::queue& sycl_queue);
-    oneapi::mkl::dft::detail::commit_impl<oneapi::mkl::dft::precision::DOUBLE,
-                                          oneapi::mkl::dft::domain::REAL>* (*create_commit_sycl_dr)(
-        const oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE,
-                                           oneapi::mkl::dft::domain::REAL>& desc,
-        sycl::queue& sycl_queue);
-} dft_function_table_t;
-
-#endif //_DFT_FUNCTION_TABLE_HPP_
diff --git a/src/include/allocator_helper.hpp b/src/include/allocator_helper.hpp
deleted file mode 100644
index 8ea802dd1..000000000
--- a/src/include/allocator_helper.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef __ALLOCATOR_HELPER_HPP
-#define __ALLOCATOR_HELPER_HPP
-
-#ifdef _WIN64
-#include <malloc.h>
-#else
-#include <stdlib.h>
-#endif
-
-namespace oneapi {
-namespace mkl {
-
-static inline void *aligned_alloc(size_t align, size_t size) {
-#ifdef _WIN64
-    return ::_aligned_malloc(size, align);
-#else
-    return ::aligned_alloc(align, size);
-#endif
-}
-
-static inline void aligned_free(void *p) {
-#ifdef _WIN64
-    ::_aligned_free(p);
-#else
-    ::free(p);
-#endif
-}
-} // namespace mkl
-} // namespace oneapi
-
-#endif // __ALLOCATOR_HELPER_HPP
diff --git a/src/include/dtype_string.hpp b/src/include/dtype_string.hpp
deleted file mode 100644
index 6f2a87feb..000000000
--- a/src/include/dtype_string.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_ERROR_HELPER_HPP_
-#define _ONEMKL_ERROR_HELPER_HPP_
-
-#include <string>
-
-template <typename T>
-inline const std::string dtype_string();
-template <>
-inline const std::string dtype_string<float>() {
-    return "float";
-}
-template <>
-inline const std::string dtype_string<double>() {
-    return "double";
-}
-template <>
-inline const std::string dtype_string<sycl::half>() {
-    return "half";
-}
-template <>
-inline const std::string dtype_string<std::complex<float>>() {
-    return "complex<float>";
-}
-template <>
-inline const std::string dtype_string<std::complex<double>>() {
-    return "complex<double>";
-}
-template <>
-inline const std::string dtype_string<std::int32_t>() {
-    return "int32";
-}
-template <>
-inline const std::string dtype_string<std::int8_t>() {
-    return "int8";
-}
-
-#endif //_ONEMKL_ERROR_HELPER_HPP_
diff --git a/src/include/exceptions_helper.hpp b/src/include/exceptions_helper.hpp
deleted file mode 100644
index 9db0f0ddd..000000000
--- a/src/include/exceptions_helper.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef __EXCEPTIONS_HELPER_HPP
-#define __EXCEPTIONS_HELPER_HPP
-
-#include <stdexcept>
-
-namespace oneapi {
-namespace mkl {
-
-class backend_unsupported_exception : public std::runtime_error {
-public:
-    backend_unsupported_exception() : std::runtime_error("Not yet supported for this backend") {}
-};
-
-} // namespace mkl
-} // namespace oneapi
-
-#endif // __EXCEPTIONS_HELPER_HPP
diff --git a/src/include/function_table_initializer.hpp b/src/include/function_table_initializer.hpp
deleted file mode 100644
index 24b2ffb86..000000000
--- a/src/include/function_table_initializer.hpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _LOADER_HPP_
-#define _LOADER_HPP_
-
-#include <cstdint>
-#include <map>
-
-#include "oneapi/mkl/detail/backends_table.hpp"
-#include "oneapi/mkl/detail/exceptions.hpp"
-
-#define SPEC_VERSION 1
-
-#ifdef __linux__
-#include <dlfcn.h>
-#define LIB_TYPE                 void *
-#define GET_LIB_HANDLE(libname)  dlopen((libname), RTLD_LAZY | RTLD_GLOBAL)
-#define GET_FUNC(lib, fn)        dlsym(lib, (fn))
-#define FREE_LIB_HANDLE(libname) dlclose(libname)
-#define ERROR_MSG                dlerror()
-#elif defined(_WIN64)
-#include <windows.h>
-#define LIB_TYPE                 HINSTANCE
-#define GET_LIB_HANDLE(libname)  LoadLibrary(libname)
-#define GET_FUNC(lib, fn)        GetProcAddress((lib), (fn))
-#define FREE_LIB_HANDLE(libname) FreeLibrary(libname)
-#define ERROR_MSG                GetLastErrorStdStr()
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace detail {
-
-template <oneapi::mkl::domain domain_id, typename function_table_t>
-class table_initializer {
-    struct handle_deleter {
-        using pointer = LIB_TYPE;
-        void operator()(pointer p) const {
-            ::FREE_LIB_HANDLE(p);
-        }
-    };
-    using dlhandle = std::unique_ptr<LIB_TYPE, handle_deleter>;
-
-public:
-    function_table_t &operator[](oneapi::mkl::device key) {
-        auto lib = tables.find(key);
-        if (lib != tables.end())
-            return lib->second;
-        return add_table(key);
-    }
-
-private:
-#ifdef _WIN64
-    // Create a string with last error message
-    std::string GetLastErrorStdStr() {
-        DWORD error = GetLastError();
-        if (error) {
-            LPVOID lpMsgBuf;
-            DWORD bufLen = FormatMessage(
-                FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
-                    FORMAT_MESSAGE_IGNORE_INSERTS,
-                NULL, error, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&lpMsgBuf, 0, NULL);
-            if (bufLen) {
-                LPCSTR lpMsgStr = (LPCSTR)lpMsgBuf;
-                std::string result(lpMsgStr, lpMsgStr + bufLen);
-
-                LocalFree(lpMsgBuf);
-
-                return result;
-            }
-        }
-        return std::string();
-    }
-#endif
-
-    function_table_t &add_table(oneapi::mkl::device key) {
-        dlhandle handle;
-        // check all available libraries for the key(device)
-        for (const char *libname : libraries[domain_id][key]) {
-            handle = dlhandle{ ::GET_LIB_HANDLE(libname) };
-            if (handle)
-                break;
-        }
-        if (!handle) {
-            std::cerr << ERROR_MSG << '\n';
-            throw mkl::backend_not_found();
-        }
-        auto t =
-            reinterpret_cast<function_table_t *>(::GET_FUNC(handle.get(), table_names[domain_id]));
-
-        if (!t) {
-            std::cerr << ERROR_MSG << '\n';
-            throw mkl::function_not_found();
-        }
-        if (t->version != SPEC_VERSION)
-            throw mkl::specification_mismatch();
-
-        handles[key] = std::move(handle);
-        tables[key] = *t;
-        return *t;
-    }
-
-    std::map<oneapi::mkl::device, function_table_t> tables;
-    std::map<oneapi::mkl::device, dlhandle> handles;
-};
-
-} //namespace detail
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_LOADER_HPP_
diff --git a/src/include/runtime_support_helper.hpp b/src/include/runtime_support_helper.hpp
deleted file mode 100644
index 7c3514673..000000000
--- a/src/include/runtime_support_helper.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_RUNTIME_SUPPORT_HELPER_HPP_
-#define _ONEMKL_RUNTIME_SUPPORT_HELPER_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <type_traits>
-
-// Utility function to verify that a given set of types is supported by the
-// device compiler combination
-template <typename verify_type, typename T, typename... Ts>
-bool verify_support(sycl::queue q, sycl::aspect aspect) {
-    bool has_aspect = q.get_device().has(aspect);
-    if constexpr (sizeof...(Ts) > 0) {
-        if constexpr (std::is_same_v<verify_type, T>) {
-            return has_aspect && verify_support<verify_type, Ts...>(q, aspect);
-        }
-        else {
-            return true && verify_support<verify_type, Ts...>(q, aspect);
-        }
-    }
-    else {
-        if constexpr (std::is_same_v<verify_type, T>) {
-            return has_aspect;
-        }
-        else {
-            return true;
-        }
-    }
-}
-
-#endif //_ONEMKL_RUNTIME_SUPPORT_HELPER_HPP_
diff --git a/src/lapack/CMakeLists.txt b/src/lapack/CMakeLists.txt
deleted file mode 100644
index 524edde03..000000000
--- a/src/lapack/CMakeLists.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-#===============================================================================
-# Copyright 2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build backends
-add_subdirectory(backends)
-
-# Recipe for LAPACK loader object
-if(BUILD_SHARED_LIBS)
-add_library(onemkl_lapack OBJECT)
-target_sources(onemkl_lapack PRIVATE lapack_loader.cpp)
-target_include_directories(onemkl_lapack
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-          $<TARGET_FILE_DIR:onemkl>
-)
-
-target_compile_options(onemkl_lapack PRIVATE ${ONEMKL_BUILD_COPT})
-
-set_target_properties(onemkl_lapack PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(onemkl_lapack PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
diff --git a/src/lapack/backends/CMakeLists.txt b/src/lapack/backends/CMakeLists.txt
deleted file mode 100644
index 636f6728f..000000000
--- a/src/lapack/backends/CMakeLists.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-#===============================================================================
-# Copyright 2021-2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_custom_target(onemkl_backend_libs_lapack)
-add_dependencies(onemkl_backend_libs onemkl_backend_libs_lapack)
-
-if(ENABLE_MKLCPU_BACKEND)
-  add_subdirectory(mklcpu)
-endif()
-
-if(ENABLE_MKLGPU_BACKEND)
-  add_subdirectory(mklgpu)
-endif()
-
-if(ENABLE_CUSOLVER_BACKEND)
-  add_subdirectory(cusolver)
-endif()
-
-if(ENABLE_ROCSOLVER_BACKEND)
-  add_subdirectory(rocsolver)
-endif()
diff --git a/src/lapack/backends/cusolver/CMakeLists.txt b/src/lapack/backends/cusolver/CMakeLists.txt
deleted file mode 100644
index dfd1267d7..000000000
--- a/src/lapack/backends/cusolver/CMakeLists.txt
+++ /dev/null
@@ -1,68 +0,0 @@
-#==========================================================================
-#  Copyright (C) Codeplay Software Limited
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  For your convenience, a copy of the License has been included in this
-#  repository.
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-#=========================================================================
-
-set(LIB_NAME onemkl_lapack_cusolver)
-set(LIB_OBJ ${LIB_NAME}_obj)
-find_package(cuSOLVER REQUIRED)
-find_package(cuBLAS REQUIRED)
-set(SOURCES     cusolver_lapack.cpp
-                cusolver_batch.cpp
-                $<$<STREQUAL:${ONEMKL_SYCL_IMPLEMENTATION},dpc++>:cusolver_scope_handle.cpp >
-                $<$<BOOL:${BUILD_SHARED_LIBS}>: cusolver_wrappers.cpp>)
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_lapack ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/blas/backends/cublas
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-target_link_libraries(${LIB_OBJ} 
-  PUBLIC ONEMKL::SYCL::SYCL 
-         ONEMKL::cuSOLVER::cuSOLVER
-         ONEMKL::cuBLAS::cuBLAS)
-target_compile_features(${LIB_OBJ} PUBLIC cxx_std_11)
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON)
-
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
-
diff --git a/src/lapack/backends/cusolver/cusolver_batch.cpp b/src/lapack/backends/cusolver/cusolver_batch.cpp
deleted file mode 100644
index 59fa47f84..000000000
--- a/src/lapack/backends/cusolver/cusolver_batch.cpp
+++ /dev/null
@@ -1,1994 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cublas_helper.hpp"
-#include "cusolver_helper.hpp"
-#include "cusolver_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-// BATCH BUFFER API
-
-template <typename Func, typename T>
-inline void geqrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a,
-                        sycl::buffer<T> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, lda, stride_a, stride_tau, batch_size, scratchpad_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_ + stride_a * i,
-                                           lda, tau_ + stride_tau * i, scratch_, scratchpad_size,
-                                           nullptr);
-            }
-        });
-    });
-}
-
-#define GEQRF_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                    \
-    void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE> &a, \
-                     std::int64_t lda, std::int64_t stride_a, sycl::buffer<TYPE> &tau,          \
-                     std::int64_t stride_tau, std::int64_t batch_size,                          \
-                     sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {            \
-        return geqrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a,  \
-                           tau, stride_tau, batch_size, scratchpad, scratchpad_size);           \
-    }
-
-GEQRF_STRIDED_BATCH_LAUNCHER(float, cusolverDnSgeqrf)
-GEQRF_STRIDED_BATCH_LAUNCHER(double, cusolverDnDgeqrf)
-GEQRF_STRIDED_BATCH_LAUNCHER(std::complex<float>, cusolverDnCgeqrf)
-GEQRF_STRIDED_BATCH_LAUNCHER(std::complex<double>, cusolverDnZgeqrf)
-
-#undef GEQRF_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t n,
-                        sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a,
-                        sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, sycl::buffer<T> &scratchpad,
-                        std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, lda, stride_a, stride_ipiv, batch_size, scratchpad_size);
-
-    std::uint64_t ipiv32_size = n * batch_size;
-    sycl::buffer<int> ipiv32(sycl::range<1>{ ipiv32_size });
-    sycl::buffer<int> devInfo{ batch_size };
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto ipiv_acc = sycl::accessor{ ipiv, cgh, sycl::read_only };
-        auto ipiv32_acc = sycl::accessor{ ipiv32, cgh, sycl::write_only };
-        cgh.parallel_for(sycl::range<1>{ ipiv32_size }, [=](sycl::id<1> index) {
-            ipiv32_acc[index] = static_cast<int>(ipiv_acc[(index / n) * stride_ipiv + index % n]);
-        });
-    });
-
-    // getri_batched is contained within cublas, not cusolver. For this reason
-    // we need to use cublas types instead of cusolver types (as is needed for
-    // other lapack routines)
-    queue.submit([&](sycl::handler &cgh) {
-        using blas::cublas::cublas_error;
-
-        sycl::accessor a_acc{ a, cgh, sycl::read_only };
-        sycl::accessor scratch_acc{ scratchpad, cgh, sycl::write_only };
-        sycl::accessor ipiv32_acc{ ipiv32, cgh };
-        sycl::accessor devInfo_acc{ devInfo, cgh, sycl::write_only };
-
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            cublasStatus_t err;
-            CUresult cuda_result;
-            cublasHandle_t cublas_handle;
-            CUBLAS_ERROR_FUNC(cublasCreate, err, &cublas_handle);
-            CUstream cu_stream = sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
-            CUBLAS_ERROR_FUNC(cublasSetStream, err, cublas_handle, cu_stream);
-
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            auto ipiv32_ = sc.get_mem<int *>(ipiv32_acc);
-            auto info_ = sc.get_mem<int *>(devInfo_acc);
-
-            CUdeviceptr a_dev;
-            cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size);
-            auto **a_dev_ = reinterpret_cast<cuDataType **>(a_dev);
-
-            CUdeviceptr scratch_dev;
-            cuDataType **scratch_batched =
-                create_ptr_list_from_stride(scratch_, stride_a, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &scratch_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, scratch_dev, scratch_batched,
-                            sizeof(T *) * batch_size);
-            auto **scratch_dev_ = reinterpret_cast<cuDataType **>(scratch_dev);
-
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, cublas_handle, n, a_dev_, lda, ipiv32_,
-                                     scratch_dev_, lda, info_, batch_size)
-
-            free(a_batched);
-            free(scratch_batched);
-            cuMemFree(a_dev);
-            cuMemFree(scratch_dev);
-        });
-    });
-
-    // The inverted matrices stored in scratch_ need to be stored in a_
-    queue.submit([&](sycl::handler &cgh) {
-        sycl::accessor a_acc{ a, cgh, sycl::write_only };
-        sycl::accessor scratch_acc{ scratchpad, cgh, sycl::read_only };
-        cgh.parallel_for(sycl::range<1>{ static_cast<size_t>(
-                             sycl::max(stride_a * batch_size, lda * n * batch_size)) },
-                         [=](sycl::id<1> index) { a_acc[index] = scratch_acc[index]; });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        sycl::accessor ipiv32_acc{ ipiv32, cgh, sycl::read_only };
-        sycl::accessor ipiv_acc{ ipiv, cgh, sycl::write_only };
-        cgh.parallel_for(sycl::range<1>{ static_cast<size_t>(ipiv32_size) },
-                         [=](sycl::id<1> index) {
-                             ipiv_acc[(index / n) * stride_ipiv + index % n] =
-                                 static_cast<int64_t>(ipiv32_acc[index]);
-                         });
-    });
-}
-
-#define GETRI_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                      \
-    void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<TYPE> &a, std::int64_t lda, \
-                     std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,                     \
-                     std::int64_t stride_ipiv, std::int64_t batch_size,                           \
-                     sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {              \
-        return getri_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, n, a, lda, stride_a, ipiv, \
-                           stride_ipiv, batch_size, scratchpad, scratchpad_size);                 \
-    }
-
-GETRI_STRIDED_BATCH_LAUNCHER(float, cublasSgetriBatched)
-GETRI_STRIDED_BATCH_LAUNCHER(double, cublasDgetriBatched)
-GETRI_STRIDED_BATCH_LAUNCHER(std::complex<float>, cublasCgetriBatched)
-GETRI_STRIDED_BATCH_LAUNCHER(std::complex<double>, cublasZgetriBatched)
-
-#undef GETRI_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void getrs_batch(const char *func_name, Func func, sycl::queue &queue,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                        sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a,
-                        sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                        sycl::buffer<T> &b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, sycl::buffer<T> &scratchpad,
-                        std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, nrhs, lda, ldb, stride_ipiv, stride_b, batch_size, scratchpad_size);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer and convert 64-bit values.
-    std::uint64_t ipiv_size = stride_ipiv * batch_size;
-    sycl::buffer<int> ipiv32(sycl::range<1>{ ipiv_size });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::read>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv32_acc[index] = static_cast<std::int32_t>(ipiv_acc[index]);
-        });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::write>(cgh);
-
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto ipiv_ = sc.get_mem<std::int32_t *>(ipiv_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            cusolverStatus_t err;
-
-            // Does not use scratch so call cuSolver asynchronously and sync at end
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_operation(trans), n,
-                                      nrhs, a_ + stride_a * i, lda, ipiv_ + stride_ipiv * i,
-                                      b_ + stride_b * i, ldb, nullptr);
-            }
-            CUSOLVER_SYNC(err, handle)
-        });
-    });
-}
-
-#define GETRS_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                      \
-    void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,            \
-                     std::int64_t nrhs, sycl::buffer<TYPE> &a, std::int64_t lda,                  \
-                     std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,                     \
-                     std::int64_t stride_ipiv, sycl::buffer<TYPE> &b, std::int64_t ldb,           \
-                     std::int64_t stride_b, std::int64_t batch_size,                              \
-                     sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {              \
-        return getrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda,    \
-                           stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, \
-                           scratchpad_size);                                                      \
-    }
-
-GETRS_STRIDED_BATCH_LAUNCHER(float, cusolverDnSgetrs)
-GETRS_STRIDED_BATCH_LAUNCHER(double, cusolverDnDgetrs)
-GETRS_STRIDED_BATCH_LAUNCHER(std::complex<float>, cusolverDnCgetrs)
-GETRS_STRIDED_BATCH_LAUNCHER(std::complex<double>, cusolverDnZgetrs)
-
-#undef GETRS_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void getrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a,
-                        sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, sycl::buffer<T> &scratchpad,
-                        std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, lda, stride_a, stride_ipiv, batch_size, scratchpad_size);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = stride_ipiv * batch_size;
-    sycl::buffer<int> ipiv32(sycl::range<1>{ ipiv_size });
-    sycl::buffer<int> devInfo{ batch_size };
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto ipiv_ = sc.get_mem<int *>(ipiv32_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (std::int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_ + stride_a * i,
-                                           lda, scratch_, ipiv_ + stride_ipiv * i, devInfo_ + i);
-            }
-        });
-    });
-
-    // Copy from 32-bit USM to 64-bit
-    queue.submit([&](sycl::handler &cgh) {
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size },
-                         [=](sycl::id<1> index) { ipiv_acc[index] = ipiv32_acc[index]; });
-    });
-
-    lapack_info_check(queue, devInfo, __func__, func_name, batch_size);
-}
-
-#define GETRF_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                    \
-    void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE> &a, \
-                     std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, \
-                     std::int64_t stride_ipiv, std::int64_t batch_size,                         \
-                     sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {            \
-        return getrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a,  \
-                           ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size);         \
-    }
-
-GETRF_STRIDED_BATCH_LAUNCHER(float, cusolverDnSgetrf)
-GETRF_STRIDED_BATCH_LAUNCHER(double, cusolverDnDgetrf)
-GETRF_STRIDED_BATCH_LAUNCHER(std::complex<float>, cusolverDnCgetrf)
-GETRF_STRIDED_BATCH_LAUNCHER(std::complex<double>, cusolverDnZgetrf)
-
-#undef GETRF_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void orgqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda,
-                        std::int64_t stride_a, sycl::buffer<T> &tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, sycl::buffer<T> &scratchpad,
-                        std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_ + stride_a * i,
-                                           lda, tau_ + stride_tau * i, scratch_, scratchpad_size,
-                                           nullptr);
-            }
-        });
-    });
-}
-
-#define ORGQR_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                      \
-    void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,          \
-                     sycl::buffer<TYPE> &a, std::int64_t lda, std::int64_t stride_a,              \
-                     sycl::buffer<TYPE> &tau, std::int64_t stride_tau, std::int64_t batch_size,   \
-                     sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {              \
-        return orgqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \
-                           tau, stride_tau, batch_size, scratchpad, scratchpad_size);             \
-    }
-
-ORGQR_STRIDED_BATCH_LAUNCHER(float, cusolverDnSorgqr)
-ORGQR_STRIDED_BATCH_LAUNCHER(double, cusolverDnDorgqr)
-
-#undef ORGQR_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void potrf_batch(const char *func_name, Func func, sycl::queue &queue,
-                        oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<T> &a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                        sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, lda, stride_a, batch_size, scratchpad_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            CUdeviceptr a_dev;
-            CUresult cuda_result;
-            cusolverStatus_t err;
-
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-
-            // Transform ptr and stride to list of ptr's
-            cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size);
-
-            auto **a_dev_ = reinterpret_cast<cuDataType **>(a_dev);
-
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo),
-                                       (int)n, a_dev_, (int)lda, nullptr, (int)batch_size);
-
-            free(a_batched);
-            cuMemFree(a_dev);
-        });
-    });
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRF_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                      \
-    void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,                  \
-                     sycl::buffer<TYPE> &a, std::int64_t lda, std::int64_t stride_a,              \
-                     std::int64_t batch_size, sycl::buffer<TYPE> &scratchpad,                     \
-                     std::int64_t scratchpad_size) {                                              \
-        return potrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, stride_a, \
-                           batch_size, scratchpad, scratchpad_size);                              \
-    }
-
-POTRF_STRIDED_BATCH_LAUNCHER(float, cusolverDnSpotrfBatched)
-POTRF_STRIDED_BATCH_LAUNCHER(double, cusolverDnDpotrfBatched)
-POTRF_STRIDED_BATCH_LAUNCHER(std::complex<float>, cusolverDnCpotrfBatched)
-POTRF_STRIDED_BATCH_LAUNCHER(std::complex<double>, cusolverDnZpotrfBatched)
-
-#undef POTRF_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void potrs_batch(const char *func_name, Func func, sycl::queue &queue,
-                        oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                        sycl::buffer<T> &a, std::int64_t lda, std::int64_t stride_a,
-                        sycl::buffer<T> &b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, sycl::buffer<T> &scratchpad,
-                        std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, nrhs, lda, ldb, stride_a, stride_b, batch_size, scratchpad_size);
-
-    // cuSolver function only supports nrhs = 1
-    if (nrhs != 1)
-        throw unimplemented("lapack", "potrs_batch", "cusolver potrs_batch only supports nrhs = 1");
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            CUdeviceptr a_dev, b_dev;
-            cusolverStatus_t err;
-            CUresult cuda_result;
-
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-
-            // Transform ptr and stride to list of ptr's
-            cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size);
-            cuDataType **b_batched = create_ptr_list_from_stride(b_, stride_b, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &b_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, b_dev, b_batched, sizeof(T *) * batch_size);
-
-            auto **a_dev_ = reinterpret_cast<cuDataType **>(a_dev);
-            auto **b_dev_ = reinterpret_cast<cuDataType **>(b_dev);
-
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo),
-                                       (int)n, (int)nrhs, a_dev_, (int)lda, b_dev_, ldb, nullptr,
-                                       (int)batch_size);
-
-            free(a_batched);
-            free(b_batched);
-            cuMemFree(a_dev);
-            cuMemFree(b_dev);
-        });
-    });
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRS_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                     \
-    void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,                 \
-                     std::int64_t nrhs, sycl::buffer<TYPE> &a, std::int64_t lda,                 \
-                     std::int64_t stride_a, sycl::buffer<TYPE> &b, std::int64_t ldb,             \
-                     std::int64_t stride_b, std::int64_t batch_size,                             \
-                     sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {             \
-        return potrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda,    \
-                           stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); \
-    }
-
-POTRS_STRIDED_BATCH_LAUNCHER(float, cusolverDnSpotrsBatched)
-POTRS_STRIDED_BATCH_LAUNCHER(double, cusolverDnDpotrsBatched)
-POTRS_STRIDED_BATCH_LAUNCHER(std::complex<float>, cusolverDnCpotrsBatched)
-POTRS_STRIDED_BATCH_LAUNCHER(std::complex<double>, cusolverDnZpotrsBatched)
-
-#undef POTRS_STRIDED_BATCH_LAUNCHER
-
-template <typename Func, typename T>
-inline void ungqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda,
-                        std::int64_t stride_a, sycl::buffer<T> &tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, sycl::buffer<T> &scratchpad,
-                        std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_ + stride_a * i,
-                                           lda, tau_ + stride_tau * i, scratch_, scratchpad_size,
-                                           nullptr);
-            }
-        });
-    });
-}
-
-#define UNGQR_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                      \
-    void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,          \
-                     sycl::buffer<TYPE> &a, std::int64_t lda, std::int64_t stride_a,              \
-                     sycl::buffer<TYPE> &tau, std::int64_t stride_tau, std::int64_t batch_size,   \
-                     sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {              \
-        return ungqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \
-                           tau, stride_tau, batch_size, scratchpad, scratchpad_size);             \
-    }
-
-UNGQR_STRIDED_BATCH_LAUNCHER(std::complex<float>, cusolverDnCungqr)
-UNGQR_STRIDED_BATCH_LAUNCHER(std::complex<double>, cusolverDnZungqr)
-
-#undef UNGQR_STRIDED_BATCH_LAUNCHER
-
-// BATCH USM API
-
-template <typename Func, typename T>
-inline sycl::event geqrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a,
-                               T *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                               T *scratchpad, std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, lda, stride_a, stride_tau, batch_size, scratchpad_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_ + stride_a * i,
-                                           lda, tau_ + stride_tau * i, scratch_, scratchpad_size,
-                                           nullptr);
-            }
-        });
-    });
-
-    return done;
-}
-
-#define GEQRF_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                \
-    sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a,        \
-                            std::int64_t lda, std::int64_t stride_a, TYPE *tau,                 \
-                            std::int64_t stride_tau, std::int64_t batch_size, TYPE *scratchpad, \
-                            std::int64_t scratchpad_size,                                       \
-                            const std::vector<sycl::event> &dependencies) {                     \
-        return geqrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a,  \
-                           tau, stride_tau, batch_size, scratchpad, scratchpad_size,            \
-                           dependencies);                                                       \
-    }
-
-GEQRF_STRIDED_BATCH_LAUNCHER_USM(float, cusolverDnSgeqrf)
-GEQRF_STRIDED_BATCH_LAUNCHER_USM(double, cusolverDnDgeqrf)
-GEQRF_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCgeqrf)
-GEQRF_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZgeqrf)
-
-#undef GEQRF_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event geqrf_batch(const char *func_name, Func func, sycl::queue &queue,
-                               std::int64_t *m, std::int64_t *n, T **a, std::int64_t *lda, T **tau,
-                               std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(group_count, scratchpad_size);
-    for (int64_t i = 0; i < group_count; ++i)
-        overflow_check(m[i], n[i], lda[i], group_sizes[i]);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType **>(a);
-            auto tau_ = reinterpret_cast<cuDataType **>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            int64_t global_id = 0;
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                for (int64_t local_id = 0; local_id < group_sizes[group_id];
-                     ++local_id, ++global_id) {
-                    CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m[group_id],
-                                               n[group_id], a_[global_id], lda[group_id],
-                                               tau_[global_id], scratch_, scratchpad_size, nullptr);
-                }
-            }
-        });
-    });
-
-    return done;
-}
-
-#define GEQRF_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                         \
-    sycl::event geqrf_batch(                                                                     \
-        sycl::queue &queue, std::int64_t *m, std::int64_t *n, TYPE **a, std::int64_t *lda,       \
-        TYPE **tau, std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad,       \
-        std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {            \
-        return geqrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, tau,        \
-                           group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \
-    }
-
-GEQRF_BATCH_LAUNCHER_USM(float, cusolverDnSgeqrf)
-GEQRF_BATCH_LAUNCHER_USM(double, cusolverDnDgeqrf)
-GEQRF_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCgeqrf)
-GEQRF_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZgeqrf)
-
-#undef GEQRF_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a,
-                               std::int64_t *ipiv, std::int64_t stride_ipiv,
-                               std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, lda, stride_a, stride_ipiv, batch_size, scratchpad_size);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Allocate memory with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = stride_ipiv * batch_size;
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-    int *devInfo = (int *)malloc_device(sizeof(int) * batch_size, queue);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratchpad_ = reinterpret_cast<cuDataType *>(scratchpad);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_ + stride_a * i,
-                                           lda, scratchpad_, ipiv_ + stride_ipiv * i, devInfo_ + i);
-            }
-        });
-    });
-
-    // Copy from 32-bit USM to 64-bit
-    sycl::event done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size },
-                         [=](sycl::id<1> index) { ipiv[index] = ipiv32[index]; });
-    });
-
-    // Enqueue free memory, don't return event as not-neccessary for user to wait for ipiv32 being released
-    queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done_casting);
-        cgh.host_task([=](sycl::interop_handle ih) { sycl::free(ipiv32, queue); });
-    });
-
-    // lapack_info_check calls queue.wait()
-    lapack_info_check(queue, devInfo, __func__, func_name, batch_size);
-    sycl::free(devInfo, queue);
-
-    return done_casting;
-}
-
-#define GETRF_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                 \
-    sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a,         \
-                            std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,         \
-                            std::int64_t stride_ipiv, std::int64_t batch_size, TYPE *scratchpad, \
-                            std::int64_t scratchpad_size,                                        \
-                            const std::vector<sycl::event> &dependencies) {                      \
-        return getrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a,   \
-                           ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size,           \
-                           dependencies);                                                        \
-    }
-
-GETRF_STRIDED_BATCH_LAUNCHER_USM(float, cusolverDnSgetrf)
-GETRF_STRIDED_BATCH_LAUNCHER_USM(double, cusolverDnDgetrf)
-GETRF_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCgetrf)
-GETRF_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZgetrf)
-
-#undef GETRF_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &queue,
-                               std::int64_t *m, std::int64_t *n, T **a, std::int64_t *lda,
-                               std::int64_t **ipiv, std::int64_t group_count,
-                               std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    int64_t batch_size = 0;
-    overflow_check(group_count, scratchpad_size);
-    for (int64_t i = 0; i < group_count; ++i) {
-        overflow_check(m[i], n[i], lda[i], group_sizes[i]);
-        batch_size += group_sizes[i];
-    }
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Allocate memory with 32-bit ints then copy over results
-    int **ipiv32 = (int **)malloc(sizeof(int *) * batch_size);
-    int64_t global_id = 0;
-    for (int64_t group_id = 0; group_id < group_count; ++group_id)
-        for (int64_t local_id = 0; local_id < group_sizes[group_id]; ++local_id, ++global_id)
-            ipiv32[global_id] = (int *)malloc_device(sizeof(int) * n[group_id], queue);
-    int *devInfo = (int *)malloc_device(sizeof(int) * batch_size, queue);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType **>(a);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            int64_t global_id = 0;
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                for (int64_t local_id = 0; local_id < group_sizes[group_id];
-                     ++local_id, ++global_id) {
-                    CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m[group_id],
-                                               n[group_id], a_[global_id], lda[group_id], scratch_,
-                                               ipiv32[global_id], devInfo + global_id);
-                }
-            }
-        });
-    });
-
-    // Copy from 32-bit USM to 64-bit
-    std::vector<sycl::event> casting_dependencies(group_count);
-    for (int64_t group_id = 0, global_id = 0; group_id < group_count; ++group_id) {
-        uint64_t ipiv_size = n[group_id];
-        for (int64_t local_id = 0; local_id < group_sizes[group_id]; ++local_id, ++global_id) {
-            int64_t *d_ipiv = ipiv[global_id];
-            int *d_ipiv32 = ipiv32[global_id];
-
-            sycl::event e = queue.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(done);
-                cgh.parallel_for(sycl::range<1>{ ipiv_size },
-                                 [=](sycl::id<1> index) { d_ipiv[index] = d_ipiv32[index]; });
-            });
-            casting_dependencies[group_id] = e;
-        }
-    }
-
-    // Enqueue free memory
-    sycl::event done_freeing = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(casting_dependencies);
-        cgh.host_task([=](sycl::interop_handle ih) {
-            for (int64_t global_id = 0; global_id < batch_size; ++global_id)
-                sycl::free(ipiv32[global_id], queue);
-            free(ipiv32);
-        });
-    });
-
-    // lapack_info_check calls queue.wait()
-    lapack_info_check(queue, devInfo, __func__, func_name, batch_size);
-    sycl::free(devInfo, queue);
-
-    return done_freeing;
-}
-
-#define GETRF_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                         \
-    sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, TYPE **a,      \
-                            std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,    \
-                            std::int64_t *group_sizes, TYPE *scratchpad,                         \
-                            std::int64_t scratchpad_size,                                        \
-                            const std::vector<sycl::event> &dependencies) {                      \
-        return getrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, ipiv,       \
-                           group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \
-    }
-
-GETRF_BATCH_LAUNCHER_USM(float, cusolverDnSgetrf)
-GETRF_BATCH_LAUNCHER_USM(double, cusolverDnDgetrf)
-GETRF_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCgetrf)
-GETRF_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZgetrf)
-
-#undef GETRS_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-sycl::event getri_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t n, T *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, T *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, lda, stride_a, stride_ipiv, batch_size, scratchpad_size);
-
-    std::uint64_t ipiv32_size = n * batch_size;
-    int *ipiv32 = sycl::malloc_device<int>(ipiv32_size, queue);
-    int *devInfo = sycl::malloc_device<int>(batch_size, queue);
-
-    sycl::event done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(
-            sycl::range<1>{ static_cast<size_t>(ipiv32_size) }, [=](sycl::id<1> index) {
-                ipiv32[index] = static_cast<int>(ipiv[(index / n) * stride_ipiv + index % n]);
-            });
-    });
-
-    // getri_batched is contained within cublas, not cusolver. For this reason
-    // we need to use cublas types instead of cusolver types (as is needed for
-    // other lapack routines)
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        using blas::cublas::cublas_error;
-
-        cgh.depends_on(done_casting);
-        cgh.depends_on(dependencies);
-
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            cublasStatus_t err;
-            CUresult cuda_result;
-            cublasHandle_t cublas_handle;
-            CUBLAS_ERROR_FUNC(cublasCreate, err, &cublas_handle);
-            CUstream cu_stream = sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
-            CUBLAS_ERROR_FUNC(cublasSetStream, err, cublas_handle, cu_stream);
-
-            CUdeviceptr a_dev;
-            auto *a_ = reinterpret_cast<cuDataType *>(a);
-            cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size);
-            auto **a_dev_ = reinterpret_cast<cuDataType **>(a_dev);
-
-            CUdeviceptr scratch_dev;
-            auto *scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cuDataType **scratch_batched =
-                create_ptr_list_from_stride(scratch_, stride_a, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &scratch_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, scratch_dev, scratch_batched,
-                            sizeof(T *) * batch_size);
-            auto **scratch_dev_ = reinterpret_cast<cuDataType **>(scratch_dev);
-
-            CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, cublas_handle, n, a_dev_, lda, ipiv32,
-                                     scratch_dev_, lda, devInfo, batch_size)
-
-            free(a_batched);
-            free(scratch_batched);
-            cuMemFree(a_dev);
-            cuMemFree(scratch_dev);
-        });
-    });
-
-    // The inverted matrices stored in scratch_ need to be stored in a_
-    auto copy1 = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        cgh.parallel_for(
-            sycl::range<1>{ static_cast<size_t>(stride_a * (batch_size - 1) + lda * n) },
-            [=](sycl::id<1> index) { a[index] = scratchpad[index]; });
-    });
-
-    auto copy2 = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        cgh.parallel_for(
-            sycl::range<1>{ static_cast<size_t>(ipiv32_size) }, [=](sycl::id<1> index) {
-                ipiv[(index / n) * stride_ipiv + index % n] = static_cast<int64_t>(ipiv32[index]);
-            });
-    });
-    copy1.wait();
-    copy2.wait();
-    sycl::free(ipiv32, queue);
-    sycl::free(devInfo, queue);
-    return done;
-}
-
-#define GETRI_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                          \
-    sycl::event getri_batch(                                                                      \
-        sycl::queue &queue, std::int64_t n, TYPE *a, std::int64_t lda, std::int64_t stride_a,     \
-        std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, TYPE *scratchpad,  \
-        std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {             \
-        return getri_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, n, a, lda, stride_a, ipiv, \
-                           stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies);   \
-    }
-
-GETRI_BATCH_LAUNCHER_USM(float, cublasSgetriBatched)
-GETRI_BATCH_LAUNCHER_USM(double, cublasDgetriBatched)
-GETRI_BATCH_LAUNCHER_USM(std::complex<float>, cublasCgetriBatched)
-GETRI_BATCH_LAUNCHER_USM(std::complex<double>, cublasZgetriBatched)
-
-#undef GETRI_BATCH_LAUNCHER_USM
-
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-
-template <typename Func, typename T>
-inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                               T *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                               std::int64_t stride_ipiv, T *b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, nrhs, lda, ldb, stride_ipiv, stride_b, batch_size, scratchpad_size);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new memory and convert 64-bit values.
-    std::uint64_t ipiv_size = stride_ipiv * batch_size;
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-
-    auto done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv32[index] = static_cast<std::int32_t>(ipiv[index]);
-        });
-    });
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        cgh.depends_on(done_casting);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            cusolverStatus_t err;
-
-            // Does not use scratch so call cuSolver asynchronously and sync at end
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_operation(trans), n,
-                                      nrhs, a_ + stride_a * i, lda, ipiv_ + stride_ipiv * i,
-                                      b_ + stride_b * i, ldb, nullptr);
-            }
-            CUSOLVER_SYNC(err, handle)
-
-            sycl::free(ipiv32, queue);
-        });
-    });
-
-    return done;
-}
-
-#define GETRS_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                  \
-    sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,     \
-                            std::int64_t nrhs, TYPE *a, std::int64_t lda, std::int64_t stride_a,  \
-                            std::int64_t *ipiv, std::int64_t stride_ipiv, TYPE *b,                \
-                            std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,     \
-                            TYPE *scratchpad, std::int64_t scratchpad_size,                       \
-                            const std::vector<sycl::event> &dependencies) {                       \
-        return getrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda,    \
-                           stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, \
-                           scratchpad_size, dependencies);                                        \
-    }
-
-GETRS_STRIDED_BATCH_LAUNCHER_USM(float, cusolverDnSgetrs)
-GETRS_STRIDED_BATCH_LAUNCHER_USM(double, cusolverDnDgetrs)
-GETRS_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCgetrs)
-GETRS_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZgetrs)
-
-#undef GETRS_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-                               T **a, std::int64_t *lda, std::int64_t **ipiv, T **b,
-                               std::int64_t *ldb, std::int64_t group_count,
-                               std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    int64_t batch_size = 0;
-    overflow_check(group_count, scratchpad_size);
-    for (int64_t i = 0; i < group_count; ++i) {
-        overflow_check(n[i], nrhs[i], lda[i], ldb[i], group_sizes[i]);
-        batch_size += group_sizes[i];
-    }
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // ipiv is an array of pointers in host memory, pointing to
-    // an array of 64-bit ints in device memory. Each vec of ipiv
-    // values need to be converted from 64-bit to 32-bit. The list
-    // must stay on host.
-    int **ipiv32 = (int **)malloc(sizeof(int *) * batch_size);
-    std::vector<sycl::event> casting_dependencies(batch_size);
-    int64_t global_id = 0;
-    for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-        for (int64_t local_id = 0; local_id < group_sizes[group_id]; ++local_id, ++global_id) {
-            uint64_t ipiv_size = n[group_id];
-            int *d_group_ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-            ipiv32[global_id] = d_group_ipiv32;
-            int64_t *d_group_ipiv = ipiv[global_id];
-
-            auto e = queue.submit([&](sycl::handler &cgh) {
-                cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-                    d_group_ipiv32[index] = static_cast<std::int32_t>(d_group_ipiv[index]);
-                });
-            });
-            casting_dependencies[global_id] = e;
-        }
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        cgh.depends_on(casting_dependencies);
-
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType **>(a);
-            auto b_ = reinterpret_cast<cuDataType **>(b);
-            cusolverStatus_t err;
-            int64_t global_id = 0;
-
-            // Does not use scratch so call cuSolver asynchronously and sync at end
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                for (int64_t local_id = 0; local_id < group_sizes[group_id];
-                     ++local_id, ++global_id) {
-                    CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle,
-                                          get_cublas_operation(trans[group_id]), n[group_id],
-                                          nrhs[group_id], a_[global_id], lda[group_id],
-                                          ipiv32[global_id], b_[global_id], ldb[group_id], nullptr);
-                }
-            }
-            CUSOLVER_SYNC(err, handle)
-
-            for (int64_t i = 0; i < batch_size; ++i)
-                sycl::free(ipiv32[i], queue);
-            free(ipiv32);
-        });
-    });
-
-    return done;
-}
-
-#define GETRS_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                        \
-    sycl::event getrs_batch(                                                                    \
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, \
-        TYPE **a, std::int64_t *lda, std::int64_t **ipiv, TYPE **b, std::int64_t *ldb,          \
-        std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad,                  \
-        std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {           \
-        return getrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda,  \
-                           ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, \
-                           dependencies);                                                       \
-    }
-
-GETRS_BATCH_LAUNCHER_USM(float, cusolverDnSgetrs)
-GETRS_BATCH_LAUNCHER_USM(double, cusolverDnDgetrs)
-GETRS_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCgetrs)
-GETRS_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZgetrs)
-
-#undef GETRS_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event orgqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, std::int64_t k, T *a, std::int64_t lda,
-                               std::int64_t stride_a, T *tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_ + stride_a * i,
-                                           lda, tau_ + stride_tau * i, scratch_, scratchpad_size,
-                                           nullptr);
-            }
-        });
-    });
-
-    return done;
-}
-
-#define ORGQR_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                  \
-    sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,   \
-                            TYPE *a, std::int64_t lda, std::int64_t stride_a, TYPE *tau,          \
-                            std::int64_t stride_tau, std::int64_t batch_size, TYPE *scratchpad,   \
-                            std::int64_t scratchpad_size,                                         \
-                            const std::vector<sycl::event> &dependencies) {                       \
-        return orgqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \
-                           tau, stride_tau, batch_size, scratchpad, scratchpad_size,              \
-                           dependencies);                                                         \
-    }
-
-ORGQR_STRIDED_BATCH_LAUNCHER_USM(float, cusolverDnSorgqr)
-ORGQR_STRIDED_BATCH_LAUNCHER_USM(double, cusolverDnDorgqr)
-
-#undef ORGQR_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event orgqr_batch(const char *func_name, Func func, sycl::queue &queue,
-                               std::int64_t *m, std::int64_t *n, std::int64_t *k, T **a,
-                               std::int64_t *lda, T **tau, std::int64_t group_count,
-                               std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(group_count, scratchpad_size);
-    for (int64_t i = 0; i < group_count; ++i)
-        overflow_check(m[i], n[i], k[i], lda[i], group_sizes[i]);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType **>(a);
-            auto tau_ = reinterpret_cast<cuDataType **>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            int64_t global_id = 0;
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                for (int64_t local_id = 0; local_id < group_sizes[group_id];
-                     ++local_id, ++global_id) {
-                    CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m[group_id],
-                                               n[group_id], k[group_id], a_[global_id],
-                                               lda[group_id], tau_[global_id], scratch_,
-                                               scratchpad_size, nullptr);
-                }
-            }
-        });
-    });
-
-    return done;
-}
-
-#define ORGQR_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                           \
-    sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, \
-                            TYPE **a, std::int64_t *lda, TYPE **tau, std::int64_t group_count,     \
-                            std::int64_t *group_sizes, TYPE *scratchpad,                           \
-                            std::int64_t scratchpad_size,                                          \
-                            const std::vector<sycl::event> &dependencies) {                        \
-        return orgqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau,       \
-                           group_count, group_sizes, scratchpad, scratchpad_size, dependencies);   \
-    }
-
-ORGQR_BATCH_LAUNCHER_USM(float, cusolverDnSorgqr)
-ORGQR_BATCH_LAUNCHER_USM(double, cusolverDnDorgqr)
-
-#undef ORGQR_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda,
-                               std::int64_t stride_a, std::int64_t batch_size, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, lda, stride_a, batch_size, scratchpad_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            CUdeviceptr a_dev;
-            cusolverStatus_t err;
-            CUresult cuda_result;
-
-            auto *a_ = reinterpret_cast<cuDataType *>(a);
-
-            // Transform ptr and stride to list of ptr's
-            cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size);
-
-            auto **a_dev_ = reinterpret_cast<cuDataType **>(a_dev);
-
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo),
-                                       (int)n, a_dev_, (int)lda, nullptr, (int)batch_size);
-
-            free(a_batched);
-            cuMemFree(a_dev);
-        });
-    });
-    return done;
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRF_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                  \
-    sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,  \
-                            std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,     \
-                            TYPE *scratchpad, std::int64_t scratchpad_size,                       \
-                            const std::vector<sycl::event> &dependencies) {                       \
-        return potrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, stride_a, \
-                           batch_size, scratchpad, scratchpad_size, dependencies);                \
-    }
-
-POTRF_STRIDED_BATCH_LAUNCHER_USM(float, cusolverDnSpotrfBatched)
-POTRF_STRIDED_BATCH_LAUNCHER_USM(double, cusolverDnDpotrfBatched)
-POTRF_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCpotrfBatched)
-POTRF_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZpotrfBatched)
-
-#undef POTRF_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::uplo *uplo, std::int64_t *n, T **a, std::int64_t *lda,
-                               std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    int64_t batch_size = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(n[i], lda[i], group_sizes[i]);
-        batch_size += group_sizes[i];
-    }
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            CUdeviceptr a_dev;
-            CUresult cuda_result;
-            cusolverStatus_t err;
-
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a, sizeof(T *) * batch_size);
-
-            auto **a_dev_ = reinterpret_cast<cuDataType **>(a_dev);
-
-            // Does not use scratch so call cuSolver asynchronously and sync at end
-            for (int64_t i = 0; i < group_count; i++) {
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo[i]),
-                                      (int)n[i], a_dev_ + offset, (int)lda[i], nullptr,
-                                      (int)group_sizes[i]);
-                offset += group_sizes[i];
-            }
-            CUSOLVER_SYNC(err, handle)
-
-            cuMemFree(a_dev);
-        });
-    });
-    return done;
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRF_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                           \
-    sycl::event potrf_batch(                                                                       \
-        sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, TYPE **a, std::int64_t *lda, \
-        std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad,                     \
-        std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {              \
-        return potrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda,            \
-                           group_count, group_sizes, scratchpad, scratchpad_size, dependencies);   \
-    }
-
-POTRF_BATCH_LAUNCHER_USM(float, cusolverDnSpotrfBatched)
-POTRF_BATCH_LAUNCHER_USM(double, cusolverDnDpotrfBatched)
-POTRF_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCpotrfBatched)
-POTRF_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZpotrfBatched)
-
-#undef POTRF_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T *a,
-                               std::int64_t lda, std::int64_t stride_a, T *b, std::int64_t ldb,
-                               std::int64_t stride_b, std::int64_t batch_size, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(n, nrhs, lda, ldb, stride_a, stride_b, batch_size, scratchpad_size);
-
-    // cuSolver function only supports nrhs = 1
-    if (nrhs != 1)
-        throw unimplemented("lapack", "potrs_batch", "cusolver potrs_batch only supports nrhs = 1");
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            CUresult cuda_result;
-            CUdeviceptr a_dev, b_dev;
-            auto *a_ = reinterpret_cast<cuDataType *>(a);
-            auto *b_ = reinterpret_cast<cuDataType *>(b);
-            cusolverStatus_t err;
-
-            // Transform ptr and stride to list of ptr's
-            cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size);
-            cuDataType **b_batched = create_ptr_list_from_stride(b_, stride_b, batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &b_dev, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size);
-            CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, b_dev, b_batched, sizeof(T *) * batch_size);
-
-            auto **a_dev_ = reinterpret_cast<cuDataType **>(a_dev);
-            auto **b_dev_ = reinterpret_cast<cuDataType **>(b_dev);
-
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo),
-                                       (int)n, (int)nrhs, a_dev_, (int)lda, b_dev_, ldb, nullptr,
-                                       (int)batch_size);
-
-            free(a_batched);
-            free(b_batched);
-            cuMemFree(a_dev);
-        });
-    });
-    return done;
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRS_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                   \
-    sycl::event potrs_batch(                                                                       \
-        sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, TYPE *a,    \
-        std::int64_t lda, std::int64_t stride_a, TYPE *b, std::int64_t ldb, std::int64_t stride_b, \
-        std::int64_t batch_size, TYPE *scratchpad, std::int64_t scratchpad_size,                   \
-        const std::vector<sycl::event> &dependencies) {                                            \
-        return potrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda,      \
-                           stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size,    \
-                           dependencies);                                                          \
-    }
-
-POTRS_STRIDED_BATCH_LAUNCHER_USM(float, cusolverDnSpotrsBatched)
-POTRS_STRIDED_BATCH_LAUNCHER_USM(double, cusolverDnDpotrsBatched)
-POTRS_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCpotrsBatched)
-POTRS_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZpotrsBatched)
-
-#undef POTRS_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, T **a,
-                               std::int64_t *lda, T **b, std::int64_t *ldb,
-                               std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    int64_t batch_size = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(n[i], lda[i], group_sizes[i]);
-        batch_size += group_sizes[i];
-
-        // cuSolver function only supports nrhs = 1
-        if (nrhs[i] != 1)
-            throw unimplemented("lapack", "potrs_batch",
-                                "cusolver potrs_batch only supports nrhs = 1");
-    }
-
-    int *info = (int *)malloc_device(sizeof(int *) * batch_size, queue);
-    T **a_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue);
-    T **b_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue);
-    auto done_cpy_a =
-        queue.submit([&](sycl::handler &h) { h.memcpy(a_dev, a, batch_size * sizeof(T *)); });
-
-    auto done_cpy_b =
-        queue.submit([&](sycl::handler &h) { h.memcpy(b_dev, b, batch_size * sizeof(T *)); });
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        cgh.depends_on(done_cpy_a);
-        cgh.depends_on(done_cpy_b);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            cusolverStatus_t err;
-
-            // Does not use scratch so call cuSolver asynchronously and sync at end
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<cuDataType **>(a_dev);
-                auto **b_ = reinterpret_cast<cuDataType **>(b_dev);
-                auto info_ = reinterpret_cast<int *>(info);
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo[i]),
-                                      (int)n[i], (int)nrhs[i], a_ + offset, (int)lda[i],
-                                      b_ + offset, (int)ldb[i], info_, (int)group_sizes[i]);
-                offset += group_sizes[i];
-            }
-            CUSOLVER_SYNC(err, handle)
-        });
-    });
-    return done;
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRS_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                         \
-    sycl::event potrs_batch(                                                                     \
-        sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,        \
-        TYPE **a, std::int64_t *lda, TYPE **b, std::int64_t *ldb, std::int64_t group_count,      \
-        std::int64_t *group_sizes, TYPE *scratchpad, std::int64_t scratchpad_size,               \
-        const std::vector<sycl::event> &dependencies) {                                          \
-        return potrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, \
-                           ldb, group_count, group_sizes, scratchpad, scratchpad_size,           \
-                           dependencies);                                                        \
-    }
-
-POTRS_BATCH_LAUNCHER_USM(float, cusolverDnSpotrsBatched)
-POTRS_BATCH_LAUNCHER_USM(double, cusolverDnDpotrsBatched)
-POTRS_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCpotrsBatched)
-POTRS_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZpotrsBatched)
-
-#undef POTRS_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ungqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                               std::int64_t n, std::int64_t k, T *a, std::int64_t lda,
-                               std::int64_t stride_a, T *tau, std::int64_t stride_tau,
-                               std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t i = 0; i < batch_size; ++i) {
-                CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_ + stride_a * i,
-                                           lda, tau_ + stride_tau * i, scratch_, scratchpad_size,
-                                           nullptr);
-            }
-        });
-    });
-
-    return done;
-}
-
-#define UNGQR_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                  \
-    sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,   \
-                            TYPE *a, std::int64_t lda, std::int64_t stride_a, TYPE *tau,          \
-                            std::int64_t stride_tau, std::int64_t batch_size, TYPE *scratchpad,   \
-                            std::int64_t scratchpad_size,                                         \
-                            const std::vector<sycl::event> &dependencies) {                       \
-        return ungqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \
-                           tau, stride_tau, batch_size, scratchpad, scratchpad_size,              \
-                           dependencies);                                                         \
-    }
-
-UNGQR_STRIDED_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCungqr)
-UNGQR_STRIDED_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZungqr)
-
-#undef UNGQR_STRIDED_BATCH_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ungqr_batch(const char *func_name, Func func, sycl::queue &queue,
-                               std::int64_t *m, std::int64_t *n, std::int64_t *k, T **a,
-                               std::int64_t *lda, T **tau, std::int64_t group_count,
-                               std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-
-    overflow_check(group_count, scratchpad_size);
-    for (int64_t i = 0; i < group_count; ++i)
-        overflow_check(m[i], n[i], k[i], lda[i], group_sizes[i]);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependencies);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType **>(a);
-            auto tau_ = reinterpret_cast<cuDataType **>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            int64_t global_id = 0;
-            cusolverStatus_t err;
-
-            // Uses scratch so sync between each cuSolver call
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                for (int64_t local_id = 0; local_id < group_sizes[group_id];
-                     ++local_id, ++global_id) {
-                    CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m[group_id],
-                                               n[group_id], k[group_id], a_[global_id],
-                                               lda[group_id], tau_[global_id], scratch_,
-                                               scratchpad_size, nullptr);
-                }
-            }
-        });
-    });
-
-    return done;
-}
-
-#define UNGQR_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                           \
-    sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, \
-                            TYPE **a, std::int64_t *lda, TYPE **tau, std::int64_t group_count,     \
-                            std::int64_t *group_sizes, TYPE *scratchpad,                           \
-                            std::int64_t scratchpad_size,                                          \
-                            const std::vector<sycl::event> &dependencies) {                        \
-        return ungqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau,       \
-                           group_count, group_sizes, scratchpad, scratchpad_size, dependencies);   \
-    }
-
-UNGQR_BATCH_LAUNCHER_USM(std::complex<float>, cusolverDnCungqr)
-UNGQR_BATCH_LAUNCHER_USM(std::complex<double>, cusolverDnZungqr)
-
-#undef UNGQR_BATCH_LAUNCHER_USM
-
-// BATCH SCRATCHPAD API
-
-template <typename Func>
-inline void getrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t m, std::int64_t n, std::int64_t lda,
-                                        std::int64_t stride_a, std::int64_t stride_ipiv,
-                                        std::int64_t batch_size, int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, nullptr, lda, scratch_size);
-        });
-    });
-    e.wait();
-}
-
-#define GETRF_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                       \
-    template <>                                                                            \
-    std::int64_t getrf_batch_scratchpad_size<TYPE>(                                        \
-        sycl::queue & queue, std::int64_t m, std::int64_t n, std::int64_t lda,             \
-        std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) {        \
-        int scratch_size;                                                                  \
-        getrf_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda, \
-                                    stride_a, stride_ipiv, batch_size, &scratch_size);     \
-        return scratch_size;                                                               \
-    }
-
-GETRF_STRIDED_BATCH_LAUNCHER_SCRATCH(float, cusolverDnSgetrf_bufferSize)
-GETRF_STRIDED_BATCH_LAUNCHER_SCRATCH(double, cusolverDnDgetrf_bufferSize)
-GETRF_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgetrf_bufferSize)
-GETRF_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgetrf_bufferSize)
-
-#undef GETRF_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-// Scratch memory needs to be the same size as a
-#define GETRI_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE)                                    \
-    template <>                                                                       \
-    std::int64_t getri_batch_scratchpad_size<TYPE>(                                   \
-        sycl::queue & queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, \
-        std::int64_t stride_ipiv, std::int64_t batch_size) {                          \
-        assert(stride_a >= lda * n && "A matrices must not overlap");                 \
-        return stride_a * (batch_size - 1) + lda * n;                                 \
-    }
-
-GETRI_STRIDED_BATCH_LAUNCHER_SCRATCH(float)
-GETRI_STRIDED_BATCH_LAUNCHER_SCRATCH(double)
-GETRI_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<float>)
-GETRI_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRI_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-// cusolverDnXgetrs does not use scratchpad memory
-#define GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE)                                            \
-    template <>                                                                               \
-    std::int64_t getrs_batch_scratchpad_size<TYPE>(                                           \
-        sycl::queue & queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, \
-        std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,  \
-        std::int64_t stride_b, std::int64_t batch_size) {                                     \
-        return 0;                                                                             \
-    }
-
-GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH(float)
-GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH(double)
-GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<float>)
-GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void geqrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t m, std::int64_t n, std::int64_t lda,
-                                        std::int64_t stride_a, std::int64_t stride_tau,
-                                        std::int64_t batch_size, int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, nullptr, lda, scratch_size);
-        });
-    });
-    e.wait();
-}
-
-#define GEQRF_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                       \
-    template <>                                                                            \
-    std::int64_t geqrf_batch_scratchpad_size<TYPE>(                                        \
-        sycl::queue & queue, std::int64_t m, std::int64_t n, std::int64_t lda,             \
-        std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {         \
-        int scratch_size;                                                                  \
-        geqrf_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda, \
-                                    stride_a, stride_tau, batch_size, &scratch_size);      \
-        return scratch_size;                                                               \
-    }
-
-GEQRF_STRIDED_BATCH_LAUNCHER_SCRATCH(float, cusolverDnSgeqrf_bufferSize)
-GEQRF_STRIDED_BATCH_LAUNCHER_SCRATCH(double, cusolverDnDgeqrf_bufferSize)
-GEQRF_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgeqrf_bufferSize)
-GEQRF_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgeqrf_bufferSize)
-
-#undef GEQRF_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-// cusolverDnXpotrfBatched does not use scratchpad memory
-#define POTRF_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE)                                     \
-    template <>                                                                        \
-    std::int64_t potrf_batch_scratchpad_size<TYPE>(                                    \
-        sycl::queue & queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, \
-        std::int64_t stride_a, std::int64_t batch_size) {                              \
-        return 0;                                                                      \
-    }
-
-POTRF_STRIDED_BATCH_LAUNCHER_SCRATCH(float)
-POTRF_STRIDED_BATCH_LAUNCHER_SCRATCH(double)
-POTRF_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<float>)
-POTRF_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRF_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-// cusolverDnXpotrsBatched does not use scratchpad memory
-#define POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE)                                        \
-    template <>                                                                           \
-    std::int64_t potrs_batch_scratchpad_size<TYPE>(                                       \
-        sycl::queue & queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,   \
-        std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, \
-        std::int64_t batch_size) {                                                        \
-        return 0;                                                                         \
-    }
-
-POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH(float)
-POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH(double)
-POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<float>)
-POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void orgqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t m, std::int64_t n, std::int64_t k,
-                                        std::int64_t lda, std::int64_t stride_a,
-                                        std::int64_t stride_tau, std::int64_t batch_size,
-                                        int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, k, nullptr, lda, nullptr,
-                                  scratch_size);
-        });
-    });
-    e.wait();
-}
-
-#define ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                           \
-    template <>                                                                                \
-    std::int64_t orgqr_batch_scratchpad_size<TYPE>(                                            \
-        sycl::queue & queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, \
-        std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {             \
-        int scratch_size;                                                                      \
-        orgqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda,  \
-                                    stride_a, stride_tau, batch_size, &scratch_size);          \
-        return scratch_size;                                                                   \
-    }
-
-ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(float, cusolverDnSorgqr_bufferSize)
-ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(double, cusolverDnDorgqr_bufferSize)
-
-#undef ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void ungqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t m, std::int64_t n, std::int64_t k,
-                                        std::int64_t lda, std::int64_t stride_a,
-                                        std::int64_t stride_tau, std::int64_t batch_size,
-                                        int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, k, nullptr, lda, nullptr,
-                                  scratch_size);
-        });
-    });
-    e.wait();
-}
-
-#define ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                           \
-    template <>                                                                                \
-    std::int64_t ungqr_batch_scratchpad_size<TYPE>(                                            \
-        sycl::queue & queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, \
-        std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {             \
-        int scratch_size;                                                                      \
-        ungqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda,  \
-                                    stride_a, stride_tau, batch_size, &scratch_size);          \
-        return scratch_size;                                                                   \
-    }
-
-ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCungqr_bufferSize)
-ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZungqr_bufferSize)
-
-#undef ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void getrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                        std::int64_t group_count, std::int64_t *group_sizes,
-                                        int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int group_scratch_size = 0;
-            *scratch_size = 0;
-            cusolverStatus_t err;
-
-            // Get the maximum scratch_size across the groups
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m[group_id], n[group_id],
-                                      nullptr, lda[group_id], &group_scratch_size);
-                *scratch_size =
-                    group_scratch_size > *scratch_size ? group_scratch_size : *scratch_size;
-            }
-        });
-    });
-    e.wait();
-}
-
-#define GETRF_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                               \
-    template <>                                                                            \
-    std::int64_t getrf_batch_scratchpad_size<TYPE>(                                        \
-        sycl::queue & queue, std::int64_t * m, std::int64_t * n, std::int64_t * lda,       \
-        std::int64_t group_count, std::int64_t * group_sizes) {                            \
-        int scratch_size;                                                                  \
-        getrf_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda, \
-                                    group_count, group_sizes, &scratch_size);              \
-        return scratch_size;                                                               \
-    }
-
-GETRF_GROUP_LAUNCHER_SCRATCH(float, cusolverDnSgetrf_bufferSize)
-GETRF_GROUP_LAUNCHER_SCRATCH(double, cusolverDnDgetrf_bufferSize)
-GETRF_GROUP_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgetrf_bufferSize)
-GETRF_GROUP_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgetrf_bufferSize)
-
-#undef GETRF_GROUP_LAUNCHER_SCRATCH
-
-#define GETRI_GROUP_LAUNCHER_SCRATCH(TYPE)                                                       \
-    template <>                                                                                  \
-    std::int64_t getri_batch_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t * n,        \
-                                                   std::int64_t * lda, std::int64_t group_count, \
-                                                   std::int64_t * group_sizes) {                 \
-        std::int64_t max_scratch_sz = 0;                                                         \
-        for (auto group_id = 0; group_id < group_count; ++group_id) {                            \
-            auto scratch_sz = lda[group_id] * n[group_id];                                       \
-            if (scratch_sz > max_scratch_sz)                                                     \
-                max_scratch_sz = scratch_sz;                                                     \
-        }                                                                                        \
-        return max_scratch_sz;                                                                   \
-    }
-
-GETRI_GROUP_LAUNCHER_SCRATCH(float)
-GETRI_GROUP_LAUNCHER_SCRATCH(double)
-GETRI_GROUP_LAUNCHER_SCRATCH(std::complex<float>)
-GETRI_GROUP_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRI_GROUP_LAUNCHER_SCRATCH
-
-#define GETRS_GROUP_LAUNCHER_SCRATCH(TYPE)                                                     \
-    template <>                                                                                \
-    std::int64_t getrs_batch_scratchpad_size<TYPE>(                                            \
-        sycl::queue & queue, oneapi::mkl::transpose * trans, std::int64_t * n,                 \
-        std::int64_t * nrhs, std::int64_t * lda, std::int64_t * ldb, std::int64_t group_count, \
-        std::int64_t * group_sizes) {                                                          \
-        return 0;                                                                              \
-    }
-
-GETRS_GROUP_LAUNCHER_SCRATCH(float)
-GETRS_GROUP_LAUNCHER_SCRATCH(double)
-GETRS_GROUP_LAUNCHER_SCRATCH(std::complex<float>)
-GETRS_GROUP_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRS_GROUP_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void geqrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                        std::int64_t group_count, std::int64_t *group_sizes,
-                                        int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int group_scratch_size = 0;
-            *scratch_size = 0;
-            cusolverStatus_t err;
-
-            // Get the maximum scratch_size across the groups
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m[group_id], n[group_id],
-                                      nullptr, lda[group_id], &group_scratch_size);
-                *scratch_size =
-                    group_scratch_size > *scratch_size ? group_scratch_size : *scratch_size;
-            }
-        });
-    });
-    e.wait();
-}
-
-#define GEQRF_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                               \
-    template <>                                                                            \
-    std::int64_t geqrf_batch_scratchpad_size<TYPE>(                                        \
-        sycl::queue & queue, std::int64_t * m, std::int64_t * n, std::int64_t * lda,       \
-        std::int64_t group_count, std::int64_t * group_sizes) {                            \
-        int scratch_size;                                                                  \
-        geqrf_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda, \
-                                    group_count, group_sizes, &scratch_size);              \
-        return scratch_size;                                                               \
-    }
-
-GEQRF_GROUP_LAUNCHER_SCRATCH(float, cusolverDnSgeqrf_bufferSize)
-GEQRF_GROUP_LAUNCHER_SCRATCH(double, cusolverDnDgeqrf_bufferSize)
-GEQRF_GROUP_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgeqrf_bufferSize)
-GEQRF_GROUP_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgeqrf_bufferSize)
-
-#undef GEQRF_GROUP_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void orgqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                        std::int64_t *lda, std::int64_t group_count,
-                                        std::int64_t *group_sizes, int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int group_scratch_size = 0;
-            *scratch_size = 0;
-            cusolverStatus_t err;
-
-            // Get the maximum scratch_size across the groups
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m[group_id], n[group_id],
-                                      k[group_id], nullptr, lda[group_id], nullptr,
-                                      &group_scratch_size);
-                *scratch_size =
-                    group_scratch_size > *scratch_size ? group_scratch_size : *scratch_size;
-            }
-        });
-    });
-    e.wait();
-}
-
-#define ORGQR_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                  \
-    template <>                                                                               \
-    std::int64_t orgqr_batch_scratchpad_size<TYPE>(                                           \
-        sycl::queue & queue, std::int64_t * m, std::int64_t * n, std::int64_t * k,            \
-        std::int64_t * lda, std::int64_t group_count, std::int64_t * group_sizes) {           \
-        int scratch_size;                                                                     \
-        orgqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda, \
-                                    group_count, group_sizes, &scratch_size);                 \
-        return scratch_size;                                                                  \
-    }
-
-ORGQR_GROUP_LAUNCHER_SCRATCH(float, cusolverDnSorgqr_bufferSize)
-ORGQR_GROUP_LAUNCHER_SCRATCH(double, cusolverDnDorgqr_bufferSize)
-
-#undef ORGQR_GROUP_LAUNCHER_SCRATCH
-
-// cusolverDnXpotrfBatched does not use scratchpad memory
-#define POTRF_GROUP_LAUNCHER_SCRATCH(TYPE)                                                   \
-    template <>                                                                              \
-    std::int64_t potrf_batch_scratchpad_size<TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * lda, \
-        std::int64_t group_count, std::int64_t * group_sizes) {                              \
-        return 0;                                                                            \
-    }
-
-POTRF_GROUP_LAUNCHER_SCRATCH(float)
-POTRF_GROUP_LAUNCHER_SCRATCH(double)
-POTRF_GROUP_LAUNCHER_SCRATCH(std::complex<float>)
-POTRF_GROUP_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRF_GROUP_LAUNCHER_SCRATCH
-
-// cusolverDnXpotrsBatched does not use scratchpad memory
-#define POTRS_GROUP_LAUNCHER_SCRATCH(TYPE)                                                    \
-    template <>                                                                               \
-    std::int64_t potrs_batch_scratchpad_size<TYPE>(                                           \
-        sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * nrhs, \
-        std::int64_t * lda, std::int64_t * ldb, std::int64_t group_count,                     \
-        std::int64_t * group_sizes) {                                                         \
-        return 0;                                                                             \
-    }
-
-POTRS_GROUP_LAUNCHER_SCRATCH(float)
-POTRS_GROUP_LAUNCHER_SCRATCH(double)
-POTRS_GROUP_LAUNCHER_SCRATCH(std::complex<float>)
-POTRS_GROUP_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRS_GROUP_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void ungqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                        std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                        std::int64_t *lda, std::int64_t group_count,
-                                        std::int64_t *group_sizes, int *scratch_size) {
-    auto e = queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int group_scratch_size = 0;
-            *scratch_size = 0;
-            cusolverStatus_t err;
-
-            // Get the maximum scratch_size across the groups
-            for (int64_t group_id = 0; group_id < group_count; ++group_id) {
-                CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m[group_id], n[group_id],
-                                      k[group_id], nullptr, lda[group_id], nullptr,
-                                      &group_scratch_size);
-                *scratch_size =
-                    group_scratch_size > *scratch_size ? group_scratch_size : *scratch_size;
-            }
-        });
-    });
-    e.wait();
-}
-
-#define UNGQR_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                  \
-    template <>                                                                               \
-    std::int64_t ungqr_batch_scratchpad_size<TYPE>(                                           \
-        sycl::queue & queue, std::int64_t * m, std::int64_t * n, std::int64_t * k,            \
-        std::int64_t * lda, std::int64_t group_count, std::int64_t * group_sizes) {           \
-        int scratch_size;                                                                     \
-        ungqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda, \
-                                    group_count, group_sizes, &scratch_size);                 \
-        return scratch_size;                                                                  \
-    }
-
-UNGQR_GROUP_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCungqr_bufferSize)
-UNGQR_GROUP_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZungqr_bufferSize)
-
-#undef UNGQR_GROUP_LAUNCHER_SCRATCH
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/cusolver/cusolver_handle.hpp b/src/lapack/backends/cusolver/cusolver_handle.hpp
deleted file mode 100644
index f3b587039..000000000
--- a/src/lapack/backends/cusolver/cusolver_handle.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef CUSOLVER_HANDLE_HPP
-#define CUSOLVER_HANDLE_HPP
-#include <atomic>
-#include <unordered_map>
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-template <typename T>
-struct cusolver_handle {
-    using handle_container_t = std::unordered_map<T, std::atomic<cusolverDnHandle_t> *>;
-    handle_container_t cusolver_handle_mapper_{};
-    ~cusolver_handle() noexcept(false) {
-        for (auto &handle_pair : cusolver_handle_mapper_) {
-            cusolverStatus_t err;
-            if (handle_pair.second != nullptr) {
-                auto handle = handle_pair.second->exchange(nullptr);
-                if (handle != nullptr) {
-                    CUSOLVER_ERROR_FUNC(cusolverDnDestroy, err, handle);
-                    handle = nullptr;
-                }
-                else {
-                    // if the handle is nullptr it means the handle was already
-                    // destroyed by the ContextCallback and we're free to delete the
-                    // atomic object.
-                    delete handle_pair.second;
-                }
-
-                handle_pair.second = nullptr;
-            }
-        }
-        cusolver_handle_mapper_.clear();
-    }
-};
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-
-#endif // CUSOLVER_HANDLE_HPP
diff --git a/src/lapack/backends/cusolver/cusolver_helper.hpp b/src/lapack/backends/cusolver/cusolver_helper.hpp
deleted file mode 100644
index e10f56b36..000000000
--- a/src/lapack/backends/cusolver/cusolver_helper.hpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-/**
- * @file cusolver_*.cpp : contain the implementation of all the routines
- * for CUDA backend
- */
-#ifndef _CUSOLVER_HELPER_HPP_
-#define _CUSOLVER_HELPER_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <cublas_v2.h>
-#include <cusolverDn.h>
-#include <cuda.h>
-#include <complex>
-
-#include "oneapi/mkl/types.hpp"
-#include "runtime_support_helper.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/lapack/exceptions.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-// The static assert to make sure that all index types used in
-// oneMKL/include/oneapi/mkl/lapack.hpp interface are int64_t
-template <typename... Next>
-struct is_int64 : std::false_type {};
-
-template <typename First>
-struct is_int64<First> : std::is_same<std::int64_t, First> {};
-
-template <typename First, typename... Next>
-struct is_int64<First, Next...>
-        : std::integral_constant<bool, std::is_same<std::int64_t, First>::value &&
-                                           is_int64<Next...>::value> {};
-
-template <typename... T>
-struct Overflow {
-    static void inline check(T...) {}
-};
-
-template <typename Index, typename... T>
-struct Overflow<Index, T...> {
-    static void inline check(Index index, T... next) {
-        if (std::abs(index) >= (1LL << 31)) {
-            throw std::runtime_error(
-                "Cusolver index overflow. Cusolver legacy API does not support 64 bit "
-                "integer as data size. Thus, the data size should not be greater that "
-                "maximum supported size by 32 bit integer.");
-        }
-        Overflow<T...>::check(next...);
-    }
-};
-
-template <typename Index, typename... Next>
-void overflow_check(Index index, Next... indices) {
-    static_assert(is_int64<Index, Next...>::value, "oneMKL index type must be 64 bit integer.");
-    Overflow<Index, Next...>::check(index, indices...);
-}
-
-class cusolver_error : virtual public std::runtime_error {
-protected:
-    inline const char *cusolver_error_map(cusolverStatus_t error) {
-        switch (error) {
-            case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS";
-
-            case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED";
-
-            case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE";
-
-            case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH";
-
-            case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED";
-
-            case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR";
-
-            case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED";
-
-            case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-                return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
-
-            default: return "<unknown>";
-        }
-    }
-
-    int error_number; ///< Error number
-public:
-    /** Constructor (C++ STL string, cusolverStatus_t).
-   *  @param msg The error message
-   *  @param err_num error number
-   */
-    explicit cusolver_error(std::string message, cusolverStatus_t result)
-            : std::runtime_error((message + std::string(cusolver_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~cusolver_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-class cuda_error : virtual public std::runtime_error {
-protected:
-    inline const char *cuda_error_map(CUresult result) {
-        switch (result) {
-            case CUDA_SUCCESS: return "CUDA_SUCCESS";
-            case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED";
-            case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
-            case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
-            case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
-            case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
-            case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-            default: return "<unknown>";
-        }
-    }
-    int error_number; ///< error number
-public:
-    /** Constructor (C++ STL string, CUresult).
-   *  @param msg The error message
-   *  @param err_num Error number
-   */
-    explicit cuda_error(std::string message, CUresult result)
-            : std::runtime_error((message + std::string(cuda_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~cuda_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-#define CUDA_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                            \
-    if (err != CUDA_SUCCESS) {                                          \
-        throw cuda_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define CUSOLVER_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                                \
-    if (err != CUSOLVER_STATUS_SUCCESS) {                                   \
-        throw cusolver_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define CUSOLVER_ERROR_FUNC_T(name, func, err, ...)                        \
-    err = func(__VA_ARGS__);                                               \
-    if (err != CUSOLVER_STATUS_SUCCESS) {                                  \
-        throw cusolver_error(std::string(name) + std::string(" : "), err); \
-    }
-
-#define CUSOLVER_SYNC(err, handle)                                           \
-    cudaStream_t currentStreamId;                                            \
-    CUSOLVER_ERROR_FUNC(cusolverDnGetStream, err, handle, &currentStreamId); \
-    {                                                                        \
-        CUresult __cuda_err;                                                 \
-        CUDA_ERROR_FUNC(cuStreamSynchronize, __cuda_err, currentStreamId);   \
-    }
-
-#define CUSOLVER_ERROR_FUNC_T_SYNC(name, func, err, handle, ...)           \
-    err = func(handle, __VA_ARGS__);                                       \
-    if (err != CUSOLVER_STATUS_SUCCESS) {                                  \
-        throw cusolver_error(std::string(name) + std::string(" : "), err); \
-    }                                                                      \
-    CUSOLVER_SYNC(err, handle)
-
-inline cusolverEigType_t get_cusolver_itype(std::int64_t itype) {
-    switch (itype) {
-        case 1: return CUSOLVER_EIG_TYPE_1;
-        case 2: return CUSOLVER_EIG_TYPE_2;
-        case 3: return CUSOLVER_EIG_TYPE_3;
-        default: throw "Wrong itype.";
-    }
-}
-
-inline cusolverEigMode_t get_cusolver_job(oneapi::mkl::job jobz) {
-    switch (jobz) {
-        case oneapi::mkl::job::N: return CUSOLVER_EIG_MODE_NOVECTOR;
-        case oneapi::mkl::job::V: return CUSOLVER_EIG_MODE_VECTOR;
-        default: throw "Wrong jobz.";
-    }
-}
-
-inline signed char get_cusolver_jobsvd(oneapi::mkl::jobsvd job) {
-    switch (job) {
-        case oneapi::mkl::jobsvd::N: return 'N';
-        case oneapi::mkl::jobsvd::A: return 'A';
-        case oneapi::mkl::jobsvd::O: return 'O';
-        case oneapi::mkl::jobsvd::S: return 'S';
-    }
-}
-
-inline cublasOperation_t get_cublas_operation(oneapi::mkl::transpose trn) {
-    switch (trn) {
-        case oneapi::mkl::transpose::nontrans: return CUBLAS_OP_N;
-        case oneapi::mkl::transpose::trans: return CUBLAS_OP_T;
-        case oneapi::mkl::transpose::conjtrans: return CUBLAS_OP_C;
-        default: throw "Wrong transpose Operation.";
-    }
-}
-
-inline cublasFillMode_t get_cublas_fill_mode(oneapi::mkl::uplo ul) {
-    switch (ul) {
-        case oneapi::mkl::uplo::upper: return CUBLAS_FILL_MODE_UPPER;
-        case oneapi::mkl::uplo::lower: return CUBLAS_FILL_MODE_LOWER;
-        default: throw "Wrong fill mode.";
-    }
-}
-
-inline cublasSideMode_t get_cublas_side_mode(oneapi::mkl::side lr) {
-    switch (lr) {
-        case oneapi::mkl::side::left: return CUBLAS_SIDE_LEFT;
-        case oneapi::mkl::side::right: return CUBLAS_SIDE_RIGHT;
-        default: throw "Wrong side mode.";
-    }
-}
-
-inline cublasSideMode_t get_cublas_generate(oneapi::mkl::generate qp) {
-    switch (qp) {
-        case oneapi::mkl::generate::Q: return CUBLAS_SIDE_LEFT;
-        case oneapi::mkl::generate::P: return CUBLAS_SIDE_RIGHT;
-        default: throw "Wrong generate.";
-    }
-}
-
-/*converting std::complex<T> to cu<T>Complex*/
-/*converting sycl::half to __half*/
-template <typename T>
-struct CudaEquivalentType {
-    using Type = T;
-};
-template <>
-struct CudaEquivalentType<sycl::half> {
-    using Type = __half;
-};
-template <>
-struct CudaEquivalentType<std::complex<float>> {
-    using Type = cuComplex;
-};
-template <>
-struct CudaEquivalentType<std::complex<double>> {
-    using Type = cuDoubleComplex;
-};
-
-/* devinfo */
-
-inline void get_cusolver_devinfo(sycl::queue &queue, sycl::buffer<int> &devInfo,
-                                 std::vector<int> &dev_info_) {
-    sycl::host_accessor<int, 1, sycl::access::mode::read> dev_info_acc{ devInfo };
-    for (unsigned int i = 0; i < dev_info_.size(); ++i)
-        dev_info_[i] = dev_info_acc[i];
-}
-
-inline void get_cusolver_devinfo(sycl::queue &queue, const int *devInfo,
-                                 std::vector<int> &dev_info_) {
-    queue.wait();
-    queue.memcpy(dev_info_.data(), devInfo, sizeof(int));
-}
-
-template <typename DEVINFO_T>
-inline void lapack_info_check(sycl::queue &queue, DEVINFO_T devinfo, const char *func_name,
-                              const char *cufunc_name, int dev_info_size = 1) {
-    std::vector<int> dev_info_(dev_info_size);
-    get_cusolver_devinfo(queue, devinfo, dev_info_);
-    for (const auto &val : dev_info_) {
-        if (val > 0)
-            throw oneapi::mkl::lapack::computation_error(
-                func_name, std::string(cufunc_name) + " failed with info = " + std::to_string(val),
-                val);
-    }
-}
-
-/* batched helpers */
-
-// Creates list of matrix/vector pointers from initial ptr and stride
-// Note: user is responsible for deallocating memory
-template <typename T>
-T **create_ptr_list_from_stride(T *ptr, int64_t ptr_stride, int64_t batch_size) {
-    T **ptr_list = (T **)malloc(sizeof(T *) * batch_size);
-    for (int64_t i = 0; i < batch_size; i++)
-        ptr_list[i] = ptr + i * ptr_stride;
-
-    return ptr_list;
-}
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-#endif // _CUSOLVER_HELPER_HPP_
diff --git a/src/lapack/backends/cusolver/cusolver_lapack.cpp b/src/lapack/backends/cusolver/cusolver_lapack.cpp
deleted file mode 100644
index 0c7aaefc8..000000000
--- a/src/lapack/backends/cusolver/cusolver_lapack.cpp
+++ /dev/null
@@ -1,3321 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cusolver_helper.hpp"
-#include "cusolver_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-// BUFFER APIs
-
-template <typename Func, typename T_A, typename T_B>
-inline void gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T_A> &a, std::int64_t lda, sycl::buffer<T_B> &d,
-                  sycl::buffer<T_B> &e, sycl::buffer<T_A> &tauq, sycl::buffer<T_A> &taup,
-                  sycl::buffer<T_A> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    if (m < n)
-        throw unimplemented("lapack", "gebrd", "cusolver gebrd does not support m < n");
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d_acc = d.template get_access<sycl::access::mode::write>(cgh);
-        auto e_acc = e.template get_access<sycl::access::mode::write>(cgh);
-        auto tauq_acc = tauq.template get_access<sycl::access::mode::write>(cgh);
-        auto taup_acc = taup.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType_A *>(a_acc);
-            auto d_ = sc.get_mem<cuDataType_B *>(d_acc);
-            auto e_ = sc.get_mem<cuDataType_B *>(e_acc);
-            auto tauq_ = sc.get_mem<cuDataType_A *>(tauq_acc);
-            auto taup_ = sc.get_mem<cuDataType_A *>(taup_acc);
-            auto scratch_ = sc.get_mem<cuDataType_A *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_,
-                                       taup_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define GEBRD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                    \
-    void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE_A> &a, \
-               std::int64_t lda, sycl::buffer<TYPE_B> &d, sycl::buffer<TYPE_B> &e,          \
-               sycl::buffer<TYPE_A> &tauq, sycl::buffer<TYPE_A> &taup,                      \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {            \
-        gebrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup,   \
-              scratchpad, scratchpad_size);                                                 \
-    }
-
-GEBRD_LAUNCHER(float, float, cusolverDnSgebrd)
-GEBRD_LAUNCHER(double, double, cusolverDnDgebrd)
-GEBRD_LAUNCHER(std::complex<float>, float, cusolverDnCgebrd)
-GEBRD_LAUNCHER(std::complex<double>, double, cusolverDnZgebrd)
-
-#undef GEBRD_LAUNCHER
-
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-
-template <typename Func, typename T>
-inline void geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_, scratch_,
-                                       scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define GEQRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                            \
-    void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad, \
-               std::int64_t scratchpad_size) {                                            \
-        geqrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad,  \
-              scratchpad_size);                                                           \
-    }
-
-GEQRF_LAUNCHER(float, cusolverDnSgeqrf)
-GEQRF_LAUNCHER(double, cusolverDnDgeqrf)
-GEQRF_LAUNCHER(std::complex<float>, cusolverDnCgeqrf)
-GEQRF_LAUNCHER(std::complex<double>, cusolverDnZgeqrf)
-
-#undef GEQRF_LAUNCHER
-
-template <typename Func, typename T>
-void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = std::min(n, m);
-    sycl::buffer<int, 1> ipiv32(sycl::range<1>{ ipiv_size });
-    sycl::buffer<int> devInfo{ 1 };
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto ipiv32_ = sc.get_mem<int *>(ipiv32_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, scratch_,
-                                       ipiv32_, devInfo_);
-        });
-    });
-
-    // Copy from 32-bit buffer to 64-bit
-    queue.submit([&](sycl::handler &cgh) {
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv_acc[index] = static_cast<std::int64_t>(ipiv32_acc[index]);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define GETRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                     \
-    void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE> &a,          \
-               std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<TYPE> &scratchpad, \
-               std::int64_t scratchpad_size) {                                                     \
-        getrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad,          \
-              scratchpad_size);                                                                    \
-    }
-
-GETRF_LAUNCHER(float, cusolverDnSgetrf)
-GETRF_LAUNCHER(double, cusolverDnDgetrf)
-GETRF_LAUNCHER(std::complex<float>, cusolverDnCgetrf)
-GETRF_LAUNCHER(std::complex<double>, cusolverDnZgetrf)
-
-#undef GETRF_LAUNCHER
-
-#define GETRI_LAUNCHER(TYPE)                                                                    \
-    void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<TYPE> &a, std::int64_t lda,     \
-               sycl::buffer<std::int64_t> &ipiv, sycl::buffer<TYPE> &scratchpad,                \
-               std::int64_t scratchpad_size) {                                                  \
-        return getri_batch(queue, n, a, lda, lda * n, ipiv, n, 1, scratchpad, scratchpad_size); \
-    }
-
-GETRI_LAUNCHER(float)
-GETRI_LAUNCHER(double)
-GETRI_LAUNCHER(std::complex<float>)
-GETRI_LAUNCHER(std::complex<double>)
-
-#undef GETRI_LAUNCHER
-
-// cusolverDnXgetrs does not use scratchpad memory
-template <typename Func, typename T>
-inline void getrs(const char *func_name, Func func, sycl::queue &queue,
-                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                  sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                  sycl::buffer<T> &b, std::int64_t ldb, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer and convert 64-bit values.
-    std::uint64_t ipiv_size = ipiv.size();
-    sycl::buffer<int, 1> ipiv32(sycl::range<1>{ ipiv_size });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::read>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv32_acc[index] = static_cast<std::int32_t>(ipiv_acc[index]);
-        });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto ipiv_ = sc.get_mem<std::int32_t *>(ipiv_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), n,
-                                       nrhs, a_, lda, ipiv_, b_, ldb, nullptr);
-        });
-    });
-}
-
-#define GETRS_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                  \
-    void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,                \
-               std::int64_t nrhs, sycl::buffer<TYPE> &a, std::int64_t lda,                      \
-               sycl::buffer<std::int64_t> &ipiv, sycl::buffer<TYPE> &b, std::int64_t ldb,       \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                  \
-        getrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, ipiv, b, ldb, \
-              scratchpad, scratchpad_size);                                                     \
-    }
-
-GETRS_LAUNCHER(float, cusolverDnSgetrs)
-GETRS_LAUNCHER(double, cusolverDnDgetrs)
-GETRS_LAUNCHER(std::complex<float>, cusolverDnCgetrs)
-GETRS_LAUNCHER(std::complex<double>, cusolverDnZgetrs)
-
-#undef GETRS_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void gesvd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer<T_A> &a,
-                  std::int64_t lda, sycl::buffer<T_B> &s, sycl::buffer<T_A> &u, std::int64_t ldu,
-                  sycl::buffer<T_A> &vt, std::int64_t ldvt, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(n, m, lda, ldu, ldvt, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto s_acc = s.template get_access<sycl::access::mode::write>(cgh);
-        auto u_acc = u.template get_access<sycl::access::mode::write>(cgh);
-        auto vt_acc = vt.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType_A *>(a_acc);
-            auto s_ = sc.get_mem<cuDataType_B *>(s_acc);
-            auto u_ = sc.get_mem<cuDataType_A *>(u_acc);
-            auto vt_ = sc.get_mem<cuDataType_A *>(vt_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType_A *>(scratch_acc);
-            cusolverStatus_t err;
-            // rwork is set to nullptr. If set it is filled with information from the superdiagonal.
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_jobsvd(jobu),
-                                       get_cusolver_jobsvd(jobvt), m, n, a_, lda, s_, u_, ldu, vt_,
-                                       ldvt, scratch_, scratchpad_size, nullptr, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define GESVD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                        \
-    void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,         \
-               std::int64_t m, std::int64_t n, sycl::buffer<TYPE_A> &a, std::int64_t lda,       \
-               sycl::buffer<TYPE_B> &s, sycl::buffer<TYPE_A> &u, std::int64_t ldu,              \
-               sycl::buffer<TYPE_A> &vt, std::int64_t ldvt, sycl::buffer<TYPE_A> &scratchpad,   \
-               std::int64_t scratchpad_size) {                                                  \
-        gesvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, u, ldu, \
-              vt, ldvt, scratchpad, scratchpad_size);                                           \
-    }
-
-GESVD_LAUNCHER(float, float, cusolverDnSgesvd)
-GESVD_LAUNCHER(double, double, cusolverDnDgesvd)
-GESVD_LAUNCHER(std::complex<float>, float, cusolverDnCgesvd)
-GESVD_LAUNCHER(std::complex<double>, double, cusolverDnZgesvd)
-
-#undef GESVD_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void heevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<T_A> &a, std::int64_t lda,
-                  sycl::buffer<T_B> &w, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType_A *>(a_acc);
-            auto w_ = sc.get_mem<cuDataType_B *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType_A *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz),
-                                       get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                       scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define HEEVD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                          \
-    void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \
-               sycl::buffer<TYPE_A> &a, std::int64_t lda, sycl::buffer<TYPE_B> &w,                \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {                  \
-        heevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad,   \
-              scratchpad_size);                                                                   \
-    }
-
-HEEVD_LAUNCHER(std::complex<float>, float, cusolverDnCheevd)
-HEEVD_LAUNCHER(std::complex<double>, double, cusolverDnZheevd)
-
-#undef HEEVD_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  sycl::buffer<T_A> &a, std::int64_t lda, sycl::buffer<T_A> &b, std::int64_t ldb,
-                  sycl::buffer<T_B> &w, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType_A *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType_A *>(b_acc);
-            auto w_ = sc.get_mem<cuDataType_B *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType_A *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype),
-                                       get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_,
-                                       lda, b_, ldb, w_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define HEGVD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                           \
-    void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,                      \
-               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE_A> &a, std::int64_t lda,  \
-               sycl::buffer<TYPE_A> &b, std::int64_t ldb, sycl::buffer<TYPE_B> &w,                 \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {                   \
-        hegvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, ldb, w, \
-              scratchpad, scratchpad_size);                                                        \
-    }
-
-HEGVD_LAUNCHER(std::complex<float>, float, cusolverDnChegvd)
-HEGVD_LAUNCHER(std::complex<double>, double, cusolverDnZhegvd)
-
-#undef HEGVD_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void hetrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T_A> &a, std::int64_t lda, sycl::buffer<T_B> &d,
-                  sycl::buffer<T_B> &e, sycl::buffer<T_A> &tau, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d_acc = d.template get_access<sycl::access::mode::write>(cgh);
-        auto e_acc = e.template get_access<sycl::access::mode::write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType_A *>(a_acc);
-            auto d_ = sc.get_mem<cuDataType_B *>(d_acc);
-            auto e_ = sc.get_mem<cuDataType_B *>(e_acc);
-            auto tau_ = sc.get_mem<cuDataType_A *>(tau_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType_A *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define HETRD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                          \
-    void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,                        \
-               sycl::buffer<TYPE_A> &a, std::int64_t lda, sycl::buffer<TYPE_B> &d,                \
-               sycl::buffer<TYPE_B> &e, sycl::buffer<TYPE_A> &tau,                                \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {                  \
-        hetrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, scratchpad, \
-              scratchpad_size);                                                                   \
-    }
-
-HETRD_LAUNCHER(std::complex<float>, float, cusolverDnChetrd)
-HETRD_LAUNCHER(std::complex<double>, double, cusolverDnZhetrd)
-
-#undef HETRD_LAUNCHER
-
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "hetrf");
-}
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "hetrf");
-}
-
-template <typename Func, typename T>
-inline void orgbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T> &a,
-                  std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n,
-                                       k, a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define ORGBR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                   \
-    void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,    \
-               std::int64_t k, sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                   \
-        orgbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                                  \
-    }
-
-ORGBR_LAUNCHER(float, cusolverDnSorgbr)
-ORGBR_LAUNCHER(double, cusolverDnDorgbr)
-
-#undef ORGBR_LAUNCHER
-
-template <typename Func, typename T>
-inline void orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define ORGQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                              \
-    void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,          \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,            \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {              \
-        orgqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                             \
-    }
-
-ORGQR_LAUNCHER(float, cusolverDnSorgqr)
-ORGQR_LAUNCHER(double, cusolverDnDorgqr)
-
-#undef ORGQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void orgtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define ORGTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                    \
-    void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad,         \
-               std::int64_t scratchpad_size) {                                                    \
-        orgtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad,       \
-              scratchpad_size);                                                                   \
-    }
-
-ORGTR_LAUNCHER(float, cusolverDnSorgtr)
-ORGTR_LAUNCHER(double, cusolverDnDorgtr)
-
-#undef ORGTR_LAUNCHER
-
-template <typename Func, typename T>
-inline void ormtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &c, std::int64_t ldc, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read_write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_fill_mode(uplo), get_cublas_operation(trans), m,
-                                       n, a_, lda, tau_, c_, ldc, scratch_, scratchpad_size,
-                                       nullptr);
-        });
-    });
-}
-
-#define ORMTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                     \
-    void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,                 \
-               oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,                       \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,                   \
-               sycl::buffer<TYPE> &c, std::int64_t ldc, sycl::buffer<TYPE> &scratchpad,            \
-               std::int64_t scratchpad_size) {                                                     \
-        ormtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, c, \
-              ldc, scratchpad, scratchpad_size);                                                   \
-    }
-
-ORMTR_LAUNCHER(float, cusolverDnSormtr)
-ORMTR_LAUNCHER(double, cusolverDnDormtr)
-
-#undef ORMTR_LAUNCHER
-
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "ormrq");
-}
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "ormrq");
-}
-
-template <typename Func, typename T>
-inline void ormqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &c,
-                  std::int64_t ldc, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldc, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define ORMQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                     \
-    void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,           \
-               std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<TYPE> &a,              \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &c, std::int64_t ldc, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                     \
-        ormqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c,    \
-              ldc, scratchpad, scratchpad_size);                                                   \
-    }
-
-ORMQR_LAUNCHER(float, cusolverDnSormqr)
-ORMQR_LAUNCHER(double, cusolverDnDormqr)
-
-#undef ORMQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void potrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define POTRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                    \
-    void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {  \
-        potrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad,            \
-              scratchpad_size);                                                                   \
-    }
-
-POTRF_LAUNCHER(float, cusolverDnSpotrf)
-POTRF_LAUNCHER(double, cusolverDnDpotrf)
-POTRF_LAUNCHER(std::complex<float>, cusolverDnCpotrf)
-POTRF_LAUNCHER(std::complex<double>, cusolverDnZpotrf)
-
-#undef POTRF_LAUNCHER
-
-template <typename Func, typename T>
-inline void potri(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define POTRI_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                    \
-    void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {  \
-        potri(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad,            \
-              scratchpad_size);                                                                   \
-    }
-
-POTRI_LAUNCHER(float, cusolverDnSpotri)
-POTRI_LAUNCHER(double, cusolverDnDpotri)
-POTRI_LAUNCHER(std::complex<float>, cusolverDnCpotri)
-POTRI_LAUNCHER(std::complex<double>, cusolverDnZpotri)
-
-#undef POTRI_LAUNCHER
-
-// cusolverDnXpotrs does not use scratchpad memory
-template <typename Func, typename T>
-inline void potrs(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t nrhs, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &b, std::int64_t ldb, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       nrhs, a_, lda, b_, ldb, nullptr);
-        });
-    });
-}
-
-#define POTRS_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                   \
-    void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,    \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &b, std::int64_t ldb, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                   \
-        potrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb,         \
-              scratchpad, scratchpad_size);                                                      \
-    }
-
-POTRS_LAUNCHER(float, cusolverDnSpotrs)
-POTRS_LAUNCHER(double, cusolverDnDpotrs)
-POTRS_LAUNCHER(std::complex<float>, cusolverDnCpotrs)
-POTRS_LAUNCHER(std::complex<double>, cusolverDnZpotrs)
-
-#undef POTRS_LAUNCHER
-
-template <typename Func, typename T>
-inline void syevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &w, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto w_ = sc.get_mem<cuDataType *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz),
-                                       get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                       scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define SYEVD_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                    \
-    void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &w,                    \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                    \
-        syevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad,   \
-              scratchpad_size);                                                                   \
-    }
-
-SYEVD_LAUNCHER(float, cusolverDnSsyevd)
-SYEVD_LAUNCHER(double, cusolverDnDsyevd)
-
-#undef SYEVD_LAUNCHER
-
-template <typename Func, typename T>
-inline void sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<T> &a,
-                  std::int64_t lda, sycl::buffer<T> &b, std::int64_t ldb, sycl::buffer<T> &w,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto b_ = sc.get_mem<cuDataType *>(b_acc);
-            auto w_ = sc.get_mem<cuDataType *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype),
-                                       get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_,
-                                       lda, b_, ldb, w_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define SYGVD_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                     \
-    void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,                      \
-               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, std::int64_t lda,    \
-               sycl::buffer<TYPE> &b, std::int64_t ldb, sycl::buffer<TYPE> &w,                     \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                     \
-        sygvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, ldb, w, \
-              scratchpad, scratchpad_size);                                                        \
-    }
-
-SYGVD_LAUNCHER(float, cusolverDnSsygvd)
-SYGVD_LAUNCHER(double, cusolverDnDsygvd)
-
-#undef SYGVD_LAUNCH
-
-template <typename Func, typename T>
-inline void sytrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &d,
-                  sycl::buffer<T> &e, sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d_acc = d.template get_access<sycl::access::mode::write>(cgh);
-        auto e_acc = e.template get_access<sycl::access::mode::write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto d_ = sc.get_mem<cuDataType *>(d_acc);
-            auto e_ = sc.get_mem<cuDataType *>(e_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define SYTRD_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                    \
-    void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &d, sycl::buffer<TYPE> &e,                    \
-               sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad,                           \
-               std::int64_t scratchpad_size) {                                                    \
-        sytrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, scratchpad, \
-              scratchpad_size);                                                                   \
-    }
-
-SYTRD_LAUNCHER(float, cusolverDnSsytrd)
-SYTRD_LAUNCHER(double, cusolverDnDsytrd)
-
-#undef SYTRD_LAUNCHER
-
-template <typename Func, typename T>
-inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<std::int64_t> &ipiv, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = n;
-    sycl::buffer<int, 1> ipiv32(sycl::range<1>{ ipiv_size });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto ipiv32_ = sc.get_mem<int *>(ipiv32_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, ipiv32_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-
-    // Copy from 32-bit buffer to 64-bit
-    queue.submit([&](sycl::handler &cgh) {
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv_acc[index] = static_cast<std::int64_t>(ipiv32_acc[index]);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define SYTRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                     \
-    void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a,  \
-               std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<TYPE> &scratchpad, \
-               std::int64_t scratchpad_size) {                                                     \
-        sytrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv, scratchpad,       \
-              scratchpad_size);                                                                    \
-    }
-
-SYTRF_LAUNCHER(float, cusolverDnSsytrf)
-SYTRF_LAUNCHER(double, cusolverDnDsytrf)
-SYTRF_LAUNCHER(std::complex<float>, cusolverDnCsytrf)
-SYTRF_LAUNCHER(std::complex<double>, cusolverDnZsytrf)
-
-#undef SYTRF_LAUNCHER
-
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-
-template <typename Func, typename T>
-inline void ungbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T> &a,
-                  std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n,
-                                       k, a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define UNGBR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                   \
-    void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,    \
-               std::int64_t k, sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                   \
-        ungbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                                  \
-    }
-
-UNGBR_LAUNCHER(std::complex<float>, cusolverDnCungbr)
-UNGBR_LAUNCHER(std::complex<double>, cusolverDnZungbr)
-
-#undef UNGBR_LAUNCHER
-
-template <typename Func, typename T>
-inline void ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define UNGQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                              \
-    void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,          \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,            \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {              \
-        ungqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                             \
-    }
-
-UNGQR_LAUNCHER(std::complex<float>, cusolverDnCungqr)
-UNGQR_LAUNCHER(std::complex<double>, cusolverDnZungqr)
-
-#undef UNGQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void ungtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define UNGTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                    \
-    void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad,         \
-               std::int64_t scratchpad_size) {                                                    \
-        ungtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad,       \
-              scratchpad_size);                                                                   \
-    }
-
-UNGTR_LAUNCHER(std::complex<float>, cusolverDnCungtr)
-UNGTR_LAUNCHER(std::complex<double>, cusolverDnZungtr)
-
-#undef UNGTR_LAUNCHER
-
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "unmrq");
-}
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "unmrq");
-}
-
-template <typename Func, typename T>
-inline void unmqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &c,
-                  std::int64_t ldc, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-}
-
-#define UNMQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                     \
-    void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,           \
-               std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<TYPE> &a,              \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &c, std::int64_t ldc, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                     \
-        unmqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c,    \
-              ldc, scratchpad, scratchpad_size);                                                   \
-    }
-
-UNMQR_LAUNCHER(std::complex<float>, cusolverDnCunmqr)
-UNMQR_LAUNCHER(std::complex<double>, cusolverDnZunmqr)
-
-#undef UNMQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void unmtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &c, std::int64_t ldc, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<cuDataType *>(a_acc);
-            auto tau_ = sc.get_mem<cuDataType *>(tau_acc);
-            auto c_ = sc.get_mem<cuDataType *>(c_acc);
-            auto scratch_ = sc.get_mem<cuDataType *>(scratch_acc);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_fill_mode(uplo), get_cublas_operation(trans), m,
-                                       n, a_, lda, tau_, c_, ldc, scratch_, scratchpad_size,
-                                       nullptr);
-        });
-    });
-}
-
-#define UNMTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE)                                                     \
-    void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,                 \
-               oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,                       \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,                   \
-               sycl::buffer<TYPE> &c, std::int64_t ldc, sycl::buffer<TYPE> &scratchpad,            \
-               std::int64_t scratchpad_size) {                                                     \
-        unmtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, c, \
-              ldc, scratchpad, scratchpad_size);                                                   \
-    }
-
-UNMTR_LAUNCHER(std::complex<float>, cusolverDnCunmtr)
-UNMTR_LAUNCHER(std::complex<double>, cusolverDnZunmtr)
-
-#undef UNMTR_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, T_A *a, std::int64_t lda, T_B *d, T_B *e, T_A *tauq,
-                         T_A *taup, T_A *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    if (m < n)
-        throw unimplemented("lapack", "gebrd", "cusolver gebrd does not support m < n");
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType_A *>(a);
-            auto d_ = reinterpret_cast<cuDataType_B *>(d);
-            auto e_ = reinterpret_cast<cuDataType_B *>(e);
-            auto tauq_ = reinterpret_cast<cuDataType_A *>(tauq);
-            auto taup_ = reinterpret_cast<cuDataType_A *>(taup);
-            auto scratch_ = reinterpret_cast<cuDataType_A *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_,
-                                       taup_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define GEBRD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                     \
-    sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE_A *a,             \
-                      std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tauq, TYPE_A *taup,        \
-                      TYPE_A *scratchpad, std::int64_t scratchpad_size,                          \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return gebrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup, \
-                     scratchpad, scratchpad_size, dependencies);                                 \
-    }
-
-GEBRD_LAUNCHER_USM(float, float, cusolverDnSgebrd)
-GEBRD_LAUNCHER_USM(double, double, cusolverDnDgebrd)
-GEBRD_LAUNCHER_USM(std::complex<float>, float, cusolverDnCgebrd)
-GEBRD_LAUNCHER_USM(std::complex<double>, double, cusolverDnZgebrd)
-
-#undef GEBRD_LAUNCHER_USM
-
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-
-template <typename Func, typename T>
-inline sycl::event geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_, scratch_,
-                                       scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define GEQRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                 \
-    sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a,                 \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return geqrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad,    \
-                     scratchpad_size, dependencies);                                               \
-    }
-
-GEQRF_LAUNCHER_USM(float, cusolverDnSgeqrf)
-GEQRF_LAUNCHER_USM(double, cusolverDnDgeqrf)
-GEQRF_LAUNCHER_USM(std::complex<float>, cusolverDnCgeqrf)
-GEQRF_LAUNCHER_USM(std::complex<double>, cusolverDnZgeqrf)
-
-#undef GEQRF_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Allocate memory with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = std::min(n, m);
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, scratch_, ipiv_,
-                                       devInfo_);
-        });
-    });
-
-    // Copy from 32-bit USM to 64-bit
-    auto done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv[index] = static_cast<std::int64_t>(ipiv32[index]);
-        });
-    });
-
-    queue.wait();
-
-    free(ipiv32, queue);
-
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done_casting;
-}
-
-#define GETRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                               \
-    sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a,               \
-                      std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad,                    \
-                      std::int64_t scratchpad_size,                                              \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return getrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad, \
-                     scratchpad_size, dependencies);                                             \
-    }
-
-GETRF_LAUNCHER_USM(float, cusolverDnSgetrf)
-GETRF_LAUNCHER_USM(double, cusolverDnDgetrf)
-GETRF_LAUNCHER_USM(std::complex<float>, cusolverDnCgetrf)
-GETRF_LAUNCHER_USM(std::complex<double>, cusolverDnZgetrf)
-
-#undef GETRF_LAUNCHER_USM
-
-#define GETRI_LAUNCHER_USM(TYPE)                                                               \
-    sycl::event getri(sycl::queue &queue, std::int64_t n, TYPE *a, std::int64_t lda,           \
-                      std::int64_t *ipiv, TYPE *scratchpad, std::int64_t scratchpad_size,      \
-                      const std::vector<sycl::event> &dependencies) {                          \
-        return getri_batch(queue, n, a, lda, lda * n, ipiv, n, 1, scratchpad, scratchpad_size, \
-                           dependencies);                                                      \
-    }
-
-GETRI_LAUNCHER_USM(float)
-GETRI_LAUNCHER_USM(double)
-GETRI_LAUNCHER_USM(std::complex<float>)
-GETRI_LAUNCHER_USM(std::complex<double>)
-
-#undef GETRI_LAUNCHER_USM
-
-// cusolverDnXgetrs does not use scratchpad memory
-template <typename Func, typename T>
-inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T *a,
-                         std::int64_t lda, std::int64_t *ipiv, T *b, std::int64_t ldb,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb, scratchpad_size);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer and convert 64-bit values.
-    std::uint64_t ipiv_size = n;
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-
-    auto done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv32[index] = static_cast<std::int32_t>(ipiv[index]);
-        });
-    });
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        cgh.depends_on(done_casting);
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), n,
-                                       nrhs, a_, lda, ipiv_, b_, ldb, nullptr);
-        });
-    });
-
-    queue.wait();
-
-    free(ipiv32, queue);
-
-    return done;
-}
-
-#define GETRS_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                \
-    sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,           \
-                      std::int64_t nrhs, TYPE *a, std::int64_t lda, std::int64_t *ipiv, TYPE *b,  \
-                      std::int64_t ldb, TYPE *scratchpad, std::int64_t scratchpad_size,           \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return getrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, ipiv, b, \
-                     ldb, scratchpad, scratchpad_size, dependencies);                             \
-    }
-
-GETRS_LAUNCHER_USM(float, cusolverDnSgetrs)
-GETRS_LAUNCHER_USM(double, cusolverDnDgetrs)
-GETRS_LAUNCHER_USM(std::complex<float>, cusolverDnCgetrs)
-GETRS_LAUNCHER_USM(std::complex<double>, cusolverDnZgetrs)
-
-#undef GETRS_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event gesvd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                         std::int64_t n, T_A *a, std::int64_t lda, T_B *s, T_A *u, std::int64_t ldu,
-                         T_A *vt, std::int64_t ldvt, T_A *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(m, n, lda, ldu, ldvt, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType_A *>(a);
-            auto s_ = reinterpret_cast<cuDataType_B *>(s);
-            auto u_ = reinterpret_cast<cuDataType_A *>(u);
-            auto vt_ = reinterpret_cast<cuDataType_A *>(vt);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType_A *>(scratchpad);
-            cusolverStatus_t err;
-            // rwork is set to nullptr. If set it is filled with information from the superdiagonal.
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_jobsvd(jobu),
-                                       get_cusolver_jobsvd(jobvt), m, n, a_, lda, s_, u_, ldu, vt_,
-                                       ldvt, scratch_, scratchpad_size, nullptr, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define GESVD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                      \
-    sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,    \
-                      std::int64_t m, std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *s,     \
-                      TYPE_A *u, std::int64_t ldu, TYPE_A *vt, std::int64_t ldvt,                 \
-                      TYPE_A *scratchpad, std::int64_t scratchpad_size,                           \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return gesvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, u, \
-                     ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies);                   \
-    }
-
-GESVD_LAUNCHER_USM(float, float, cusolverDnSgesvd)
-GESVD_LAUNCHER_USM(double, double, cusolverDnDgesvd)
-GESVD_LAUNCHER_USM(std::complex<float>, float, cusolverDnCgesvd)
-GESVD_LAUNCHER_USM(std::complex<double>, double, cusolverDnZgesvd)
-
-#undef GESVD_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event heevd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a,
-                         std::int64_t lda, T_B *&w, T_A *&scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType_A *>(a);
-            auto w_ = reinterpret_cast<cuDataType_B *>(w);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType_A *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz),
-                                       get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                       scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define HEEVD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                      \
-    sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,          \
-                      std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *w, TYPE_A *scratchpad, \
-                      std::int64_t scratchpad_size,                                               \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return heevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w,        \
-                     scratchpad, scratchpad_size, dependencies);                                  \
-    }
-
-HEEVD_LAUNCHER_USM(std::complex<float>, float, cusolverDnCheevd)
-HEEVD_LAUNCHER_USM(std::complex<double>, double, cusolverDnZheevd)
-
-#undef HEEVD_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a,
-                         std::int64_t lda, T_A *&b, std::int64_t ldb, T_B *&w, T_A *&scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType_A *>(a);
-            auto b_ = reinterpret_cast<cuDataType_A *>(b);
-            auto w_ = reinterpret_cast<cuDataType_B *>(w);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType_A *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype),
-                                       get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_,
-                                       lda, b_, ldb, w_, scratch_, scratchpad_size, devInfo);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define HEGVD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                      \
-    sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,              \
-                      oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A *a, std::int64_t lda,        \
-                      TYPE_A *b, std::int64_t ldb, TYPE_B *w, TYPE_A *scratchpad,                 \
-                      std::int64_t scratchpad_size,                                               \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return hegvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, \
-                     ldb, w, scratchpad, scratchpad_size, dependencies);                          \
-    }
-
-HEGVD_LAUNCHER_USM(std::complex<float>, float, cusolverDnChegvd)
-HEGVD_LAUNCHER_USM(std::complex<double>, double, cusolverDnZhegvd)
-
-#undef HEGVD_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event hetrd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T_A *a, std::int64_t lda, T_B *d,
-                         T_B *e, T_A *tau, T_A *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType_A = typename CudaEquivalentType<T_A>::Type;
-    using cuDataType_B = typename CudaEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType_A *>(a);
-            auto d_ = reinterpret_cast<cuDataType_B *>(d);
-            auto e_ = reinterpret_cast<cuDataType_B *>(e);
-            auto tau_ = reinterpret_cast<cuDataType_A *>(tau);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType_A *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define HETRD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE)                                   \
-    sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A *a,   \
-                      std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tau, TYPE_A *scratchpad, \
-                      std::int64_t scratchpad_size,                                            \
-                      const std::vector<sycl::event> &dependencies) {                          \
-        return hetrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau,   \
-                     scratchpad, scratchpad_size, dependencies);                               \
-    }
-
-HETRD_LAUNCHER_USM(std::complex<float>, float, cusolverDnChetrd)
-HETRD_LAUNCHER_USM(std::complex<double>, double, cusolverDnZhetrd)
-
-#undef HETRD_LAUNCHER_USM
-
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "hetrf");
-}
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "hetrf");
-}
-
-template <typename Func, typename T>
-inline sycl::event orgbr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         T *a, std::int64_t lda, T *tau, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n,
-                                       k, a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define ORGBR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                          \
-    sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,        \
-                      std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau, \
-                      TYPE *scratchpad, std::int64_t scratchpad_size,                       \
-                      const std::vector<sycl::event> &dependencies) {                       \
-        return orgbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \
-                     scratchpad, scratchpad_size, dependencies);                            \
-    }
-
-ORGBR_LAUNCHER_USM(float, cusolverDnSorgbr)
-ORGBR_LAUNCHER_USM(double, cusolverDnDorgbr)
-
-#undef ORGBR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define ORGQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                 \
-    sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return orgqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \
-                     scratchpad_size, dependencies);                                               \
-    }
-
-ORGQR_LAUNCHER_USM(float, cusolverDnSorgqr)
-ORGQR_LAUNCHER_USM(double, cusolverDnDorgqr)
-
-#undef ORGQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event orgtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define ORGTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                 \
-    sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,         \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return orgtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \
-                     scratchpad_size, dependencies);                                               \
-    }
-
-ORGTR_LAUNCHER_USM(float, cusolverDnSorgtr)
-ORGTR_LAUNCHER_USM(double, cusolverDnDorgtr)
-
-#undef ORGTR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ormtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a,
-                         std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_fill_mode(uplo), get_cublas_operation(trans), m,
-                                       n, a_, lda, tau_, c_, ldc, scratch_, scratchpad_size,
-                                       nullptr);
-        });
-    });
-    return done;
-}
-
-#define ORMTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                \
-    sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,         \
-                      oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a,      \
-                      std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad,   \
-                      std::int64_t scratchpad_size,                                               \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return ormtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, \
-                     tau, c, ldc, scratchpad, scratchpad_size, dependencies);                     \
-    }
-
-ORMTR_LAUNCHER_USM(float, cusolverDnSormtr)
-ORMTR_LAUNCHER_USM(double, cusolverDnDormtr)
-
-#undef ORMTR_LAUNCHER_USM
-
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                  float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ormrq");
-}
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                  double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ormrq");
-}
-
-template <typename Func, typename T>
-inline sycl::event ormqr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c,
-                         std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldc, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define ORMQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                               \
-    sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,  \
-                      std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \
-                      TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad,                    \
-                      std::int64_t scratchpad_size,                                              \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return ormqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda,   \
-                     tau, c, ldc, scratchpad, scratchpad_size, dependencies);                    \
-    }
-
-ORMQR_LAUNCHER_USM(float, cusolverDnSormqr)
-ORMQR_LAUNCHER_USM(double, cusolverDnDormqr)
-
-#undef ORMQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potrf(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define POTRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                            \
-    sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,    \
-                      std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size,       \
-                      const std::vector<sycl::event> &dependencies) {                         \
-        return potrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \
-                     scratchpad_size, dependencies);                                          \
-    }
-
-POTRF_LAUNCHER_USM(float, cusolverDnSpotrf)
-POTRF_LAUNCHER_USM(double, cusolverDnDpotrf)
-POTRF_LAUNCHER_USM(std::complex<float>, cusolverDnCpotrf)
-POTRF_LAUNCHER_USM(std::complex<double>, cusolverDnZpotrf)
-
-#undef POTRF_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potri(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define POTRI_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                            \
-    sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,    \
-                      std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size,       \
-                      const std::vector<sycl::event> &dependencies) {                         \
-        return potri(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \
-                     scratchpad_size, dependencies);                                          \
-    }
-
-POTRI_LAUNCHER_USM(float, cusolverDnSpotri)
-POTRI_LAUNCHER_USM(double, cusolverDnDpotri)
-POTRI_LAUNCHER_USM(std::complex<float>, cusolverDnCpotri)
-POTRI_LAUNCHER_USM(std::complex<double>, cusolverDnZpotri)
-
-#undef POTRI_LAUNCHER_USM
-
-// cusolverDnXpotrs does not use scratchpad memory
-template <typename Func, typename T>
-inline sycl::event potrs(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T *a,
-                         std::int64_t lda, T *b, std::int64_t ldb, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       nrhs, a_, lda, b_, ldb, nullptr);
-        });
-    });
-    return done;
-}
-
-#define POTRS_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                              \
-    sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,               \
-                      std::int64_t nrhs, TYPE *a, std::int64_t lda, TYPE *b, std::int64_t ldb,  \
-                      TYPE *scratchpad, std::int64_t scratchpad_size,                           \
-                      const std::vector<sycl::event> &dependencies) {                           \
-        return potrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb, \
-                     scratchpad, scratchpad_size, dependencies);                                \
-    }
-
-POTRS_LAUNCHER_USM(float, cusolverDnSpotrs)
-POTRS_LAUNCHER_USM(double, cusolverDnDpotrs)
-POTRS_LAUNCHER_USM(std::complex<float>, cusolverDnCpotrs)
-POTRS_LAUNCHER_USM(std::complex<double>, cusolverDnZpotrs)
-
-#undef POTRS_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syevd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a,
-                         std::int64_t lda, T *w, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto w_ = reinterpret_cast<cuDataType *>(w);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz),
-                                       get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                       scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define SYEVD_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                          \
-    sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,    \
-                      std::int64_t n, TYPE *a, std::int64_t lda, TYPE *w, TYPE *scratchpad, \
-                      std::int64_t scratchpad_size,                                         \
-                      const std::vector<sycl::event> &dependencies) {                       \
-        return syevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w,  \
-                     scratchpad, scratchpad_size, dependencies);                            \
-    }
-
-SYEVD_LAUNCHER_USM(float, cusolverDnSsyevd)
-SYEVD_LAUNCHER_USM(double, cusolverDnDsyevd)
-
-#undef SYEVD_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a,
-                         std::int64_t lda, T *b, std::int64_t ldb, T *w, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto b_ = reinterpret_cast<cuDataType *>(b);
-            auto w_ = reinterpret_cast<cuDataType *>(w);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype),
-                                       get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_,
-                                       lda, b_, ldb, w_, scratch_, scratchpad_size, devInfo);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define SYGVD_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                \
-    sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,              \
-                      oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, std::int64_t lda, TYPE *b, \
-                      std::int64_t ldb, TYPE *w, TYPE *scratchpad, std::int64_t scratchpad_size,  \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return sygvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, \
-                     ldb, w, scratchpad, scratchpad_size, dependencies);                          \
-    }
-
-SYGVD_LAUNCHER_USM(float, cusolverDnSsygvd)
-SYGVD_LAUNCHER_USM(double, cusolverDnDsygvd)
-
-#undef SYGVD_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sytrd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *d, T *e,
-                         T *tau, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto d_ = reinterpret_cast<cuDataType *>(d);
-            auto e_ = reinterpret_cast<cuDataType *>(e);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define SYTRD_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                           \
-    sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,   \
-                      std::int64_t lda, TYPE *d, TYPE *e, TYPE *tau, TYPE *scratchpad,       \
-                      std::int64_t scratchpad_size,                                          \
-                      const std::vector<sycl::event> &dependencies) {                        \
-        return sytrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \
-                     scratchpad, scratchpad_size, dependencies);                             \
-    }
-
-SYTRD_LAUNCHER_USM(float, cusolverDnSsytrd)
-SYTRD_LAUNCHER_USM(double, cusolverDnDsytrd)
-
-#undef SYTRD_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sytrf(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda,
-                         std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-
-    // cuSolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Allocate memory with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = n;
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, ipiv_, scratch_, scratchpad_size, devInfo_);
-        });
-    });
-
-    // Copy from 32-bit USM to 64-bit
-    auto done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv[index] = static_cast<std::int64_t>(ipiv32[index]);
-        });
-    });
-
-    queue.wait();
-
-    free(ipiv32, queue);
-
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done_casting;
-}
-
-#define SYTRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                         \
-    sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \
-                      std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad,              \
-                      std::int64_t scratchpad_size,                                        \
-                      const std::vector<sycl::event> &dependencies) {                      \
-        return sytrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv,    \
-                     scratchpad, scratchpad_size, dependencies);                           \
-    }
-
-SYTRF_LAUNCHER_USM(float, cusolverDnSsytrf)
-SYTRF_LAUNCHER_USM(double, cusolverDnDsytrf)
-SYTRF_LAUNCHER_USM(std::complex<float>, cusolverDnCsytrf)
-SYTRF_LAUNCHER_USM(std::complex<double>, cusolverDnZsytrf)
-
-#undef SYTRF_LAUNCHER_USM
-
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a,
-                  std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                  std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-
-template <typename Func, typename T>
-inline sycl::event ungbr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         T *a, std::int64_t lda, T *tau, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n,
-                                       k, a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define UNGBR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                          \
-    sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,        \
-                      std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau, \
-                      TYPE *scratchpad, std::int64_t scratchpad_size,                       \
-                      const std::vector<sycl::event> &dependencies) {                       \
-        return ungbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \
-                     scratchpad, scratchpad_size, dependencies);                            \
-    }
-
-UNGBR_LAUNCHER_USM(std::complex<float>, cusolverDnCungbr)
-UNGBR_LAUNCHER_USM(std::complex<double>, cusolverDnZungbr)
-
-#undef UNGBR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define UNGQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                 \
-    sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return ungqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \
-                     scratchpad_size, dependencies);                                               \
-    }
-
-UNGQR_LAUNCHER_USM(std::complex<float>, cusolverDnCungqr)
-UNGQR_LAUNCHER_USM(std::complex<double>, cusolverDnZungqr)
-
-#undef UNGQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ungtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                       a_, lda, tau_, scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define UNGTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                 \
-    sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,         \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return ungtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \
-                     scratchpad_size, dependencies);                                               \
-    }
-
-UNGTR_LAUNCHER_USM(std::complex<float>, cusolverDnCungtr)
-UNGTR_LAUNCHER_USM(std::complex<double>, cusolverDnZungtr)
-
-#undef UNGTR_LAUNCHER_USM
-
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *c,
-                  std::int64_t ldc, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "unmrq");
-}
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *c,
-                  std::int64_t ldc, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "unmrq");
-}
-
-template <typename Func, typename T>
-inline sycl::event unmqr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c,
-                         std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc,
-                                       scratch_, scratchpad_size, nullptr);
-        });
-    });
-    return done;
-}
-
-#define UNMQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                               \
-    sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,  \
-                      std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \
-                      TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad,                    \
-                      std::int64_t scratchpad_size,                                              \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return unmqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda,   \
-                     tau, c, ldc, scratchpad, scratchpad_size, dependencies);                    \
-    }
-
-UNMQR_LAUNCHER_USM(std::complex<float>, cusolverDnCunmqr)
-UNMQR_LAUNCHER_USM(std::complex<double>, cusolverDnZunmqr)
-
-#undef UNMQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event unmtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a,
-                         std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using cuDataType = typename CudaEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<cuDataType *>(a);
-            auto tau_ = reinterpret_cast<cuDataType *>(tau);
-            auto c_ = reinterpret_cast<cuDataType *>(c);
-            auto scratch_ = reinterpret_cast<cuDataType *>(scratchpad);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side),
-                                       get_cublas_fill_mode(uplo), get_cublas_operation(trans), m,
-                                       n, a_, lda, tau_, c_, ldc, scratch_, scratchpad_size,
-                                       nullptr);
-        });
-    });
-    return done;
-}
-
-#define UNMTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE)                                                \
-    sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,         \
-                      oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a,      \
-                      std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad,   \
-                      std::int64_t scratchpad_size,                                               \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return unmtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, \
-                     tau, c, ldc, scratchpad, scratchpad_size, dependencies);                     \
-    }
-
-UNMTR_LAUNCHER_USM(std::complex<float>, cusolverDnCunmtr)
-UNMTR_LAUNCHER_USM(std::complex<double>, cusolverDnZunmtr)
-
-#undef UNMTR_LAUNCHER_USM
-
-// SCRATCHPAD APIs
-
-template <typename Func>
-inline void gebrd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  std::int64_t m, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define GEBRD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                            \
-    template <>                                                                                   \
-    std::int64_t gebrd_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t lda) {                                  \
-        int scratch_size;                                                                         \
-        gebrd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda,              \
-                              &scratch_size);                                                     \
-        return scratch_size;                                                                      \
-    }
-
-GEBRD_LAUNCHER_SCRATCH(float, cusolverDnSgebrd_bufferSize)
-GEBRD_LAUNCHER_SCRATCH(double, cusolverDnDgebrd_bufferSize)
-GEBRD_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgebrd_bufferSize)
-GEBRD_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgebrd_bufferSize)
-
-#undef GEBRD_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t gerqf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                          std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-template <>
-std::int64_t gerqf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-
-template <typename Func>
-inline void geqrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  std::int64_t m, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, nullptr, lda, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define GEQRF_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                            \
-    template <>                                                                                   \
-    std::int64_t geqrf_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t lda) {                                  \
-        int scratch_size;                                                                         \
-        geqrf_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda,              \
-                              &scratch_size);                                                     \
-        return scratch_size;                                                                      \
-    }
-
-GEQRF_LAUNCHER_SCRATCH(float, cusolverDnSgeqrf_bufferSize)
-GEQRF_LAUNCHER_SCRATCH(double, cusolverDnDgeqrf_bufferSize)
-GEQRF_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgeqrf_bufferSize)
-GEQRF_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgeqrf_bufferSize)
-
-#undef GEQRF_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void gesvd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                  std::int64_t m, std::int64_t n, std::int64_t lda,
-                                  std::int64_t ldu, std::int64_t ldvt, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define GESVD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                            \
-    template <>                                                                                   \
-    std::int64_t gesvd_scratchpad_size<TYPE>(                                                     \
-        sycl::queue & queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, \
-        std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) {                  \
-        int scratch_size;                                                                         \
-        gesvd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobu, jobvt, m, n, lda, \
-                              ldu, ldvt, &scratch_size);                                          \
-        return scratch_size;                                                                      \
-    }
-
-GESVD_LAUNCHER_SCRATCH(float, cusolverDnSgesvd_bufferSize)
-GESVD_LAUNCHER_SCRATCH(double, cusolverDnDgesvd_bufferSize)
-GESVD_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgesvd_bufferSize)
-GESVD_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgesvd_bufferSize)
-
-#undef GESVD_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void getrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  std::int64_t m, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, nullptr, lda, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define GETRF_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                            \
-    template <>                                                                                   \
-    std::int64_t getrf_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t lda) {                                  \
-        int scratch_size;                                                                         \
-        getrf_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda,              \
-                              &scratch_size);                                                     \
-        return scratch_size;                                                                      \
-    }
-
-GETRF_LAUNCHER_SCRATCH(float, cusolverDnSgetrf_bufferSize)
-GETRF_LAUNCHER_SCRATCH(double, cusolverDnDgetrf_bufferSize)
-GETRF_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCgetrf_bufferSize)
-GETRF_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZgetrf_bufferSize)
-
-#undef GETRF_LAUNCHER_SCRATCH
-
-#define GETRI_LAUNCHER_SCRATCH(TYPE)                                              \
-    template <>                                                                   \
-    std::int64_t getri_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t n, \
-                                             std::int64_t lda) {                  \
-        return lda * n;                                                           \
-    }
-
-GETRI_LAUNCHER_SCRATCH(float)
-GETRI_LAUNCHER_SCRATCH(double)
-GETRI_LAUNCHER_SCRATCH(std::complex<float>)
-GETRI_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRI_LAUNCHER_SCRATCH
-
-// cusolverDnXgetrs does not use scratchpad memory
-#define GETRS_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t getrs_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::transpose trans,   \
-                                             std::int64_t n, std::int64_t nrhs, std::int64_t lda, \
-                                             std::int64_t ldb) {                                  \
-        return 0;                                                                                 \
-    }
-
-GETRS_LAUNCHER_SCRATCH(float)
-GETRS_LAUNCHER_SCRATCH(double)
-GETRS_LAUNCHER_SCRATCH(std::complex<float>)
-GETRS_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRS_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void heevd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                  std::int64_t lda, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_job(jobz),
-                                  get_cublas_fill_mode(uplo), n, nullptr, lda, nullptr,
-                                  scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define HEEVD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                        \
-    template <>                                                                               \
-    std::int64_t heevd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::job jobz,      \
-                                             oneapi::mkl::uplo uplo, std::int64_t n,          \
-                                             std::int64_t lda) {                              \
-        int scratch_size;                                                                     \
-        heevd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, lda, \
-                              &scratch_size);                                                 \
-        return scratch_size;                                                                  \
-    }
-
-HEEVD_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCheevd_bufferSize)
-HEEVD_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZheevd_bufferSize)
-
-#undef HEEVD_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void hegvd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                  std::int64_t n, std::int64_t lda, std::int64_t ldb,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_itype(itype),
-                                  get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, nullptr,
-                                  lda, nullptr, ldb, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define HEGVD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                             \
-    template <>                                                                                    \
-    std::int64_t hegvd_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t itype,              \
-                                             oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,        \
-                                             std::int64_t n, std::int64_t lda, std::int64_t ldb) { \
-        int scratch_size;                                                                          \
-        hegvd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n,    \
-                              lda, ldb, &scratch_size);                                            \
-        return scratch_size;                                                                       \
-    }
-
-HEGVD_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnChegvd_bufferSize)
-HEGVD_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZhegvd_bufferSize)
-
-#undef HEGVD_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void hetrd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                  nullptr, lda, nullptr, nullptr, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define HETRD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                    \
-    template <>                                                                           \
-    std::int64_t hetrd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        int scratch_size;                                                                 \
-        hetrd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, lda,   \
-                              &scratch_size);                                             \
-        return scratch_size;                                                              \
-    }
-
-HETRD_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnChetrd_bufferSize)
-HETRD_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZhetrd_bufferSize)
-
-#undef HETRD_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "hetrf_scratchpad_size");
-}
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "hetrf_scratchpad_size");
-}
-
-template <typename Func>
-inline void orgbr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                  std::int64_t k, std::int64_t lda, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_generate(vec), m, n, k,
-                                  nullptr, lda, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define ORGBR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                       \
-    template <>                                                                              \
-    std::int64_t orgbr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::generate vec, \
-                                             std::int64_t m, std::int64_t n, std::int64_t k, \
-                                             std::int64_t lda) {                             \
-        int scratch_size;                                                                    \
-        orgbr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, lda, \
-                              &scratch_size);                                                \
-        return scratch_size;                                                                 \
-    }
-
-ORGBR_LAUNCHER_SCRATCH(float, cusolverDnSorgbr_bufferSize)
-ORGBR_LAUNCHER_SCRATCH(double, cusolverDnDorgbr_bufferSize)
-
-#undef ORGBR_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void orgtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                  nullptr, lda, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define ORGTR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                    \
-    template <>                                                                           \
-    std::int64_t orgtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        int scratch_size;                                                                 \
-        orgtr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, lda,   \
-                              &scratch_size);                                             \
-        return scratch_size;                                                              \
-    }
-
-ORGTR_LAUNCHER_SCRATCH(float, cusolverDnSorgtr_bufferSize)
-ORGTR_LAUNCHER_SCRATCH(double, cusolverDnDorgtr_bufferSize)
-
-#undef ORGTR_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void orgqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, k, nullptr, lda, nullptr,
-                                  scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define ORGQR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                            \
-    template <>                                                                                   \
-    std::int64_t orgqr_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t k, std::int64_t lda) {                  \
-        int scratch_size;                                                                         \
-        orgqr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda,           \
-                              &scratch_size);                                                     \
-        return scratch_size;                                                                      \
-    }
-
-ORGQR_LAUNCHER_SCRATCH(float, cusolverDnSorgqr_bufferSize)
-ORGQR_LAUNCHER_SCRATCH(double, cusolverDnDorgqr_bufferSize)
-
-#undef ORGQR_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t ormrq_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                          oneapi::mkl::transpose trans, std::int64_t m,
-                                          std::int64_t n, std::int64_t k, std::int64_t lda,
-                                          std::int64_t ldc) {
-    throw unimplemented("lapack", "ormrq_scratchpad_size");
-}
-template <>
-std::int64_t ormrq_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                           oneapi::mkl::transpose trans, std::int64_t m,
-                                           std::int64_t n, std::int64_t k, std::int64_t lda,
-                                           std::int64_t ldc) {
-    throw unimplemented("lapack", "ormrq_scratchpad_size");
-}
-
-template <typename Func>
-inline void ormqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                  std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                  std::int64_t ldc, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side),
-                                  get_cublas_operation(trans), m, n, k, nullptr, lda, nullptr,
-                                  nullptr, ldc, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define ORMQRF_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                            \
-    template <>                                                                                    \
-    std::int64_t ormqr_scratchpad_size<TYPE>(                                                      \
-        sycl::queue & queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, \
-        std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) {                      \
-        int scratch_size;                                                                          \
-        ormqr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k,    \
-                              lda, ldc, &scratch_size);                                            \
-        return scratch_size;                                                                       \
-    }
-
-ORMQRF_LAUNCHER_SCRATCH(float, cusolverDnSormqr_bufferSize)
-ORMQRF_LAUNCHER_SCRATCH(double, cusolverDnDormqr_bufferSize)
-
-#undef ORMQRF_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void ormtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                  std::int64_t lda, std::int64_t ldc, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side),
-                                  get_cublas_fill_mode(uplo), get_cublas_operation(trans), m, n,
-                                  nullptr, lda, nullptr, nullptr, ldc, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define ORMTR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                             \
-    template <>                                                                                    \
-    std::int64_t ormtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::side side,          \
-                                             oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, \
-                                             std::int64_t m, std::int64_t n, std::int64_t lda,     \
-                                             std::int64_t ldc) {                                   \
-        int scratch_size;                                                                          \
-        ormtr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, \
-                              lda, ldc, &scratch_size);                                            \
-        return scratch_size;                                                                       \
-    }
-
-ORMTR_LAUNCHER_SCRATCH(float, cusolverDnSormtr_bufferSize)
-ORMTR_LAUNCHER_SCRATCH(double, cusolverDnDormtr_bufferSize)
-
-#undef ORMTR_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void potrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                  nullptr, lda, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define POTRF_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                    \
-    template <>                                                                           \
-    std::int64_t potrf_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        int scratch_size;                                                                 \
-        potrf_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, lda,   \
-                              &scratch_size);                                             \
-        return scratch_size;                                                              \
-    }
-
-POTRF_LAUNCHER_SCRATCH(float, cusolverDnSpotrf_bufferSize)
-POTRF_LAUNCHER_SCRATCH(double, cusolverDnDpotrf_bufferSize)
-POTRF_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCpotrf_bufferSize)
-POTRF_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZpotrf_bufferSize)
-
-#undef POTRF_LAUNCHER_SCRATCH
-
-// cusolverDnXpotrs does not use scratchpad memory
-#define POTRS_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t potrs_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo,         \
-                                             std::int64_t n, std::int64_t nrhs, std::int64_t lda, \
-                                             std::int64_t ldb) {                                  \
-        return 0;                                                                                 \
-    }
-
-POTRS_LAUNCHER_SCRATCH(float)
-POTRS_LAUNCHER_SCRATCH(double)
-POTRS_LAUNCHER_SCRATCH(std::complex<float>)
-POTRS_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRS_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void potri_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                  nullptr, lda, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define POTRI_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                    \
-    template <>                                                                           \
-    std::int64_t potri_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        int scratch_size;                                                                 \
-        potri_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, lda,   \
-                              &scratch_size);                                             \
-        return scratch_size;                                                              \
-    }
-
-POTRI_LAUNCHER_SCRATCH(float, cusolverDnSpotri_bufferSize)
-POTRI_LAUNCHER_SCRATCH(double, cusolverDnDpotri_bufferSize)
-POTRI_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCpotri_bufferSize)
-POTRI_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZpotri_bufferSize)
-
-#undef POTRI_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void sytrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, n, nullptr, lda, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define SYTRF_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                    \
-    template <>                                                                           \
-    std::int64_t sytrf_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        int scratch_size;                                                                 \
-        sytrf_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, lda,   \
-                              &scratch_size);                                             \
-        return scratch_size;                                                              \
-    }
-
-SYTRF_LAUNCHER_SCRATCH(float, cusolverDnSsytrf_bufferSize)
-SYTRF_LAUNCHER_SCRATCH(double, cusolverDnDsytrf_bufferSize)
-SYTRF_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCsytrf_bufferSize)
-SYTRF_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZsytrf_bufferSize)
-
-#undef SYTRF_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void syevd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                                  std::int64_t lda, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_job(jobz),
-                                  get_cublas_fill_mode(uplo), n, nullptr, lda, nullptr,
-                                  scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define SYEVD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                        \
-    template <>                                                                               \
-    std::int64_t syevd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::job jobz,      \
-                                             oneapi::mkl::uplo uplo, std::int64_t n,          \
-                                             std::int64_t lda) {                              \
-        int scratch_size;                                                                     \
-        syevd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, lda, \
-                              &scratch_size);                                                 \
-        return scratch_size;                                                                  \
-    }
-
-SYEVD_LAUNCHER_SCRATCH(float, cusolverDnSsyevd_bufferSize)
-SYEVD_LAUNCHER_SCRATCH(double, cusolverDnDsyevd_bufferSize)
-
-#undef SYEVD_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void sygvd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                  std::int64_t n, std::int64_t lda, std::int64_t ldb,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_itype(itype),
-                                  get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, nullptr,
-                                  lda, nullptr, ldb, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define SYGVD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                             \
-    template <>                                                                                    \
-    std::int64_t sygvd_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t itype,              \
-                                             oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,        \
-                                             std::int64_t n, std::int64_t lda, std::int64_t ldb) { \
-        int scratch_size;                                                                          \
-        sygvd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n,    \
-                              lda, ldb, &scratch_size);                                            \
-        return scratch_size;                                                                       \
-    }
-
-SYGVD_LAUNCHER_SCRATCH(float, cusolverDnSsygvd_bufferSize)
-SYGVD_LAUNCHER_SCRATCH(double, cusolverDnDsygvd_bufferSize)
-
-#undef SYGVD_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void sytrd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                  nullptr, lda, nullptr, nullptr, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define SYTRD_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                    \
-    template <>                                                                           \
-    std::int64_t sytrd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        int scratch_size;                                                                 \
-        sytrd_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, lda,   \
-                              &scratch_size);                                             \
-        return scratch_size;                                                              \
-    }
-
-SYTRD_LAUNCHER_SCRATCH(float, cusolverDnSsytrd_bufferSize)
-SYTRD_LAUNCHER_SCRATCH(double, cusolverDnDsytrd_bufferSize)
-
-#undef SYTRD_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t trtrs_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                          std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                          std::int64_t ldb) {
-    throw unimplemented("lapack", "trtrs_scratchpad_size");
-}
-template <>
-std::int64_t trtrs_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                           std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                           std::int64_t ldb) {
-    throw unimplemented("lapack", "trtrs_scratchpad_size");
-}
-template <>
-std::int64_t trtrs_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        oneapi::mkl::diag diag, std::int64_t n,
-                                                        std::int64_t nrhs, std::int64_t lda,
-                                                        std::int64_t ldb) {
-    throw unimplemented("lapack", "trtrs_scratchpad_size");
-}
-template <>
-std::int64_t trtrs_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         oneapi::mkl::diag diag, std::int64_t n,
-                                                         std::int64_t nrhs, std::int64_t lda,
-                                                         std::int64_t ldb) {
-    throw unimplemented("lapack", "trtrs_scratchpad_size");
-}
-
-template <typename Func>
-inline void ungbr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                                  std::int64_t k, std::int64_t lda, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_generate(vec), m, n, k,
-                                  nullptr, lda, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define UNGBR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                       \
-    template <>                                                                              \
-    std::int64_t ungbr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::generate vec, \
-                                             std::int64_t m, std::int64_t n, std::int64_t k, \
-                                             std::int64_t lda) {                             \
-        int scratch_size;                                                                    \
-        ungbr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, lda, \
-                              &scratch_size);                                                \
-        return scratch_size;                                                                 \
-    }
-
-UNGBR_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCungbr_bufferSize)
-UNGBR_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZungbr_bufferSize)
-
-#undef UNGBR_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void ungqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, k, nullptr, lda, nullptr,
-                                  scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define UNGQR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                            \
-    template <>                                                                                   \
-    std::int64_t ungqr_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t k, std::int64_t lda) {                  \
-        int scratch_size;                                                                         \
-        ungqr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda,           \
-                              &scratch_size);                                                     \
-        return scratch_size;                                                                      \
-    }
-
-UNGQR_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCungqr_bufferSize)
-UNGQR_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZungqr_bufferSize)
-
-#undef UNGQR_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void ungtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                  int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n,
-                                  nullptr, lda, nullptr, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define UNGTR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                    \
-    template <>                                                                           \
-    std::int64_t ungtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        int scratch_size;                                                                 \
-        ungtr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, lda,   \
-                              &scratch_size);                                             \
-        return scratch_size;                                                              \
-    }
-
-UNGTR_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCungtr_bufferSize)
-UNGTR_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZungtr_bufferSize)
-
-#undef UNGTR_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc) {
-    throw unimplemented("lapack", "unmrq_scratchpad_size");
-}
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc) {
-    throw unimplemented("lapack", "unmrq_scratchpad_size");
-}
-
-template <typename Func>
-inline void unmqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                  std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-                                  std::int64_t ldc, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side),
-                                  get_cublas_operation(trans), m, n, k, nullptr, lda, nullptr,
-                                  nullptr, ldc, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define UNMQR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                             \
-    template <>                                                                                    \
-    std::int64_t unmqr_scratchpad_size<TYPE>(                                                      \
-        sycl::queue & queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, \
-        std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) {                      \
-        int scratch_size;                                                                          \
-        unmqr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k,    \
-                              lda, ldc, &scratch_size);                                            \
-        return scratch_size;                                                                       \
-    }
-
-UNMQR_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCunmqr_bufferSize)
-UNMQR_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZunmqr_bufferSize)
-
-#undef UNMQR_LAUNCHER_SCRATCH
-
-template <typename Func>
-inline void unmtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue,
-                                  oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                  std::int64_t lda, std::int64_t ldc, int *scratch_size) {
-    queue.submit([&](sycl::handler &cgh) {
-        onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            cusolverStatus_t err;
-            CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side),
-                                  get_cublas_fill_mode(uplo), get_cublas_operation(trans), m, n,
-                                  nullptr, lda, nullptr, nullptr, ldc, scratch_size);
-        });
-    });
-    queue.wait();
-}
-
-#define UNMTR_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE)                                             \
-    template <>                                                                                    \
-    std::int64_t unmtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::side side,          \
-                                             oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, \
-                                             std::int64_t m, std::int64_t n, std::int64_t lda,     \
-                                             std::int64_t ldc) {                                   \
-        int scratch_size;                                                                          \
-        unmtr_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, \
-                              lda, ldc, &scratch_size);                                            \
-        return scratch_size;                                                                       \
-    }
-
-UNMTR_LAUNCHER_SCRATCH(std::complex<float>, cusolverDnCunmtr_bufferSize)
-UNMTR_LAUNCHER_SCRATCH(std::complex<double>, cusolverDnZunmtr_bufferSize)
-
-#undef UNMTR_LAUNCHER_SCRATCH
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/cusolver/cusolver_scope_handle.cpp b/src/lapack/backends/cusolver/cusolver_scope_handle.cpp
deleted file mode 100644
index 0bc3ebdb0..000000000
--- a/src/lapack/backends/cusolver/cusolver_scope_handle.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "cusolver_scope_handle.hpp"
-#if __has_include(<sycl/detail/common.hpp>)
-#include <sycl/detail/common.hpp>
-#else
-#include <CL/sycl/detail/common.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-/**
- * Inserts a new element in the map if its key is unique. This new element
- * is constructed in place using args as the arguments for the construction
- * of a value_type (which is an object of a pair type). The insertion only
- * takes place if no other element in the container has a key equivalent to
- * the one being emplaced (keys in a map container are unique).
- */
-thread_local cusolver_handle<pi_context> CusolverScopedContextHandler::handle_helper =
-    cusolver_handle<pi_context>{};
-
-CusolverScopedContextHandler::CusolverScopedContextHandler(sycl::queue queue,
-                                                           sycl::interop_handle &ih)
-        : ih(ih),
-          needToRecover_(false) {
-    placedContext_ = new sycl::context(queue.get_context());
-    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
-    CUresult err;
-    CUcontext desired;
-    CUDA_ERROR_FUNC(cuCtxGetCurrent, err, &original_);
-    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, err, &desired, cudaDevice);
-    if (original_ != desired) {
-        // Sets the desired context as the active one for the thread
-        CUDA_ERROR_FUNC(cuCtxSetCurrent, err, desired);
-        // No context is installed and the suggested context is primary
-        // This is the most common case. We can activate the context in the
-        // thread and leave it there until all the PI context referring to the
-        // same underlying CUDA primary context are destroyed. This emulates
-        // the behaviour of the CUDA runtime api, and avoids costly context
-        // switches. No action is required on this side of the if.
-        needToRecover_ = !(original_ == nullptr);
-    }
-}
-
-CusolverScopedContextHandler::~CusolverScopedContextHandler() noexcept(false) {
-    if (needToRecover_) {
-        CUresult err;
-        CUDA_ERROR_FUNC(cuCtxSetCurrent, err, original_);
-    }
-    delete placedContext_;
-}
-
-void ContextCallback(void *userData) {
-    auto *ptr = static_cast<std::atomic<cusolverDnHandle_t> *>(userData);
-    if (!ptr) {
-        return;
-    }
-    auto handle = ptr->exchange(nullptr);
-    if (handle != nullptr) {
-        cusolverStatus_t err1;
-        CUSOLVER_ERROR_FUNC(cusolverDnDestroy, err1, handle);
-        handle = nullptr;
-    }
-    else {
-        // if the handle is nullptr it means the handle was already destroyed by
-        // the cusolver_handle destructor and we're free to delete the atomic
-        // object.
-        delete ptr;
-    }
-}
-
-cusolverDnHandle_t CusolverScopedContextHandler::get_handle(const sycl::queue &queue) {
-    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
-    CUresult cuErr;
-    CUcontext desired;
-    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, cuErr, &desired, cudaDevice);
-    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
-    CUstream streamId = get_stream(queue);
-    cusolverStatus_t err;
-    auto it = handle_helper.cusolver_handle_mapper_.find(piPlacedContext_);
-    if (it != handle_helper.cusolver_handle_mapper_.end()) {
-        if (it->second == nullptr) {
-            handle_helper.cusolver_handle_mapper_.erase(it);
-        }
-        else {
-            auto handle = it->second->load();
-            if (handle != nullptr) {
-                cudaStream_t currentStreamId;
-                CUSOLVER_ERROR_FUNC(cusolverDnGetStream, err, handle, &currentStreamId);
-                if (currentStreamId != streamId) {
-                    CUSOLVER_ERROR_FUNC(cusolverDnSetStream, err, handle, streamId);
-                }
-                return handle;
-            }
-            else {
-                handle_helper.cusolver_handle_mapper_.erase(it);
-            }
-        }
-    }
-
-    cusolverDnHandle_t handle;
-
-    CUSOLVER_ERROR_FUNC(cusolverDnCreate, err, &handle);
-    CUSOLVER_ERROR_FUNC(cusolverDnSetStream, err, handle, streamId);
-
-    auto insert_iter = handle_helper.cusolver_handle_mapper_.insert(
-        std::make_pair(piPlacedContext_, new std::atomic<cusolverDnHandle_t>(handle)));
-
-    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
-                                                insert_iter.first->second);
-
-    return handle;
-}
-
-CUstream CusolverScopedContextHandler::get_stream(const sycl::queue &queue) {
-    return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
-}
-sycl::context CusolverScopedContextHandler::get_context(const sycl::queue &queue) {
-    return queue.get_context();
-}
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/cusolver/cusolver_scope_handle.hpp b/src/lapack/backends/cusolver/cusolver_scope_handle.hpp
deleted file mode 100644
index 585b4995a..000000000
--- a/src/lapack/backends/cusolver/cusolver_scope_handle.hpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _CUSOLVER_SCOPED_HANDLE_HPP_
-#define _CUSOLVER_SCOPED_HANDLE_HPP_
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#if __has_include(<sycl/context.hpp>)
-#if __SYCL_COMPILER_VERSION <= 20220930
-#include <sycl/backend/cuda.hpp>
-#endif
-#include <sycl/context.hpp>
-#include <sycl/detail/pi.hpp>
-#else
-#include <CL/sycl/backend/cuda.hpp>
-#include <CL/sycl/context.hpp>
-#include <CL/sycl/detail/pi.hpp>
-#endif
-#include <atomic>
-#include <memory>
-#include <thread>
-#include <unordered_map>
-#include "cusolver_helper.hpp"
-#include "cusolver_handle.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-/**
-* @brief NVIDIA advise for handle creation for cublas:
-https://devtalk.nvidia.com/default/topic/838794/gpu-accelerated libraries/using-cublas-in-different-cuda-streams/
-According to NVIDIA: 
-1)	The cusolver handles behaviour with different devices is unclear. However, cusolver is based on the cublas API 
-    which required that different handles to be used for different devices. So it is assumed that cusolver also 
-    requires different handles on different devices: 
-https://docs.nvidia.com/cuda/cusolver/index.html#introduction
-http://docs.nvidia.com/cuda/cublas/index.html#cublas-context	
-2) 	The library is thread safe and can be called form different host threads: 
-https://docs.nvidia.com/cuda/cusolver/index.html#thread-safety
-3)	It is neither required nor recommended that different handles be used for different streams on the same device,
- using the same host thread.
-
-The advice above is for using cublas with the cuda runtime API. Given that cusolver is based on cublas the advice is 
-transferable. The cusolver_scope_handle is based on the oneMKL cublas_scope_handle. The NVIDIA runtime API creates a 
-default context for users. The cusolverDnCreate function in uses the context located on top of the stack for each thread. 
-Then, the cuSolver routine uses this context for resource allocation/access. Calling a cuSolver function with a handle 
-created for context A and memories/queue created for context B results in a segmentation fault. Thus we need to create 
-one handle per context and per thread. A context can have multiple streams, so the important thing here is to have one 
-cusolverDnHandle_t per driver context and that cuSolver handle can switch between multiple streams created for that context. 
-Here, we are dealing with CUDA driver API, therefore, the SYCL-CUDA backend controls the context. If a queue(equivalent of 
-CUDA stream) is associated with a context different from the one on top of the thread stack(can be any context which 
-associated at any time by either the runtime or user for any specific reason), the context associated with the queue must 
-be moved on top of the stack temporarily for the requested routine operations. However, after the cuSolver routine 
-execution, the original context must be restored to prevent intervening with the original user/runtime execution set up. 
-Here, the RAII type context switch is used to guarantee to recover the original CUDA context. The cuSolver handle allocates 
-internal resources, therefore, the handle must be destroyed when the context goes out of scope. This will bind the life of 
-cuSolver handle to the SYCL context.
-**/
-
-class CusolverScopedContextHandler {
-    CUcontext original_;
-    sycl::context *placedContext_;
-    bool needToRecover_;
-    sycl::interop_handle &ih;
-    static thread_local cusolver_handle<pi_context> handle_helper;
-    CUstream get_stream(const sycl::queue &queue);
-    sycl::context get_context(const sycl::queue &queue);
-
-public:
-    CusolverScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
-
-    ~CusolverScopedContextHandler() noexcept(false);
-    /**
-   * @brief get_handle: creates the handle by implicitly impose the advice
-   * given by nvidia for creating a cusolver_handle. (e.g. one cuStream per device
-   * per thread).
-   * @param queue sycl queue.
-   * @return cusolverDnHandle_t a handle to construct cusolver routines
-   */
-    cusolverDnHandle_t get_handle(const sycl::queue &queue);
-    // This is a work-around function for reinterpret_casting the memory. This
-    // will be fixed when SYCL-2020 has been implemented for Pi backend.
-    template <typename T, typename U>
-    inline T get_mem(U acc) {
-        CUdeviceptr cudaPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
-        return reinterpret_cast<T>(cudaPtr);
-    }
-
-    void wait_stream(const sycl::queue &queue) {
-        cuStreamSynchronize(get_stream(queue));
-    }
-};
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-#endif //_CUSOLVER_SCOPED_HANDLE_HPP_
diff --git a/src/lapack/backends/cusolver/cusolver_task.hpp b/src/lapack/backends/cusolver/cusolver_task.hpp
deleted file mode 100644
index 9d319be64..000000000
--- a/src/lapack/backends/cusolver/cusolver_task.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _MKL_LAPACK_CUSOLVER_TASK_HPP_
-#define _MKL_LAPACK_CUSOLVER_TASK_HPP_
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cusolverDn.h>
-#include <complex>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl/types.hpp"
-#include "cusolver_scope_handle.hpp"
-#if __has_include(<sycl/detail/pi.hpp>)
-#include <sycl/detail/pi.hpp>
-#else
-#include <CL/sycl/detail/pi.hpp>
-#endif
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace cusolver {
-
-template <typename H, typename F>
-static inline void host_task_internal(H &cgh, sycl::queue queue, F f) {
-    cgh.host_task([f, queue](sycl::interop_handle ih) {
-        auto sc = CusolverScopedContextHandler(queue, ih);
-        f(sc);
-        sc.wait_stream(queue);
-    });
-}
-
-template <typename H, typename F>
-static inline void onemkl_cusolver_host_task(H &cgh, sycl::queue queue, F f) {
-    (void)host_task_internal(cgh, queue, f);
-}
-
-} // namespace cusolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-#endif // _MKL_LAPACK_CUSOLVER_TASK_HPP_
diff --git a/src/lapack/backends/cusolver/cusolver_wrappers.cpp b/src/lapack/backends/cusolver/cusolver_wrappers.cpp
deleted file mode 100644
index 4b5ab8e2c..000000000
--- a/src/lapack/backends/cusolver/cusolver_wrappers.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "lapack/function_table.hpp"
-#include "oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT lapack_function_table_t mkl_lapack_table = {
-    WRAPPER_VERSION,
-#define LAPACK_BACKEND cusolver
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::heevd,
-    oneapi::mkl::lapack::cusolver::heevd,
-    oneapi::mkl::lapack::cusolver::hegvd,
-    oneapi::mkl::lapack::cusolver::hegvd,
-    oneapi::mkl::lapack::cusolver::hetrd,
-    oneapi::mkl::lapack::cusolver::hetrd,
-    oneapi::mkl::lapack::cusolver::hetrf,
-    oneapi::mkl::lapack::cusolver::hetrf,
-    oneapi::mkl::lapack::cusolver::orgbr,
-    oneapi::mkl::lapack::cusolver::orgbr,
-    oneapi::mkl::lapack::cusolver::orgqr,
-    oneapi::mkl::lapack::cusolver::orgqr,
-    oneapi::mkl::lapack::cusolver::orgtr,
-    oneapi::mkl::lapack::cusolver::orgtr,
-    oneapi::mkl::lapack::cusolver::ormtr,
-    oneapi::mkl::lapack::cusolver::ormtr,
-    oneapi::mkl::lapack::cusolver::ormrq,
-    oneapi::mkl::lapack::cusolver::ormrq,
-    oneapi::mkl::lapack::cusolver::ormqr,
-    oneapi::mkl::lapack::cusolver::ormqr,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::syevd,
-    oneapi::mkl::lapack::cusolver::syevd,
-    oneapi::mkl::lapack::cusolver::sygvd,
-    oneapi::mkl::lapack::cusolver::sygvd,
-    oneapi::mkl::lapack::cusolver::sytrd,
-    oneapi::mkl::lapack::cusolver::sytrd,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::ungbr,
-    oneapi::mkl::lapack::cusolver::ungbr,
-    oneapi::mkl::lapack::cusolver::ungqr,
-    oneapi::mkl::lapack::cusolver::ungqr,
-    oneapi::mkl::lapack::cusolver::ungtr,
-    oneapi::mkl::lapack::cusolver::ungtr,
-    oneapi::mkl::lapack::cusolver::unmrq,
-    oneapi::mkl::lapack::cusolver::unmrq,
-    oneapi::mkl::lapack::cusolver::unmqr,
-    oneapi::mkl::lapack::cusolver::unmqr,
-    oneapi::mkl::lapack::cusolver::unmtr,
-    oneapi::mkl::lapack::cusolver::unmtr,
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gebrd,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::gerqf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::geqrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getrf,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getri,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::getrs,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::gesvd,
-    oneapi::mkl::lapack::cusolver::heevd,
-    oneapi::mkl::lapack::cusolver::heevd,
-    oneapi::mkl::lapack::cusolver::hegvd,
-    oneapi::mkl::lapack::cusolver::hegvd,
-    oneapi::mkl::lapack::cusolver::hetrd,
-    oneapi::mkl::lapack::cusolver::hetrd,
-    oneapi::mkl::lapack::cusolver::hetrf,
-    oneapi::mkl::lapack::cusolver::hetrf,
-    oneapi::mkl::lapack::cusolver::orgbr,
-    oneapi::mkl::lapack::cusolver::orgbr,
-    oneapi::mkl::lapack::cusolver::orgqr,
-    oneapi::mkl::lapack::cusolver::orgqr,
-    oneapi::mkl::lapack::cusolver::orgtr,
-    oneapi::mkl::lapack::cusolver::orgtr,
-    oneapi::mkl::lapack::cusolver::ormtr,
-    oneapi::mkl::lapack::cusolver::ormtr,
-    oneapi::mkl::lapack::cusolver::ormrq,
-    oneapi::mkl::lapack::cusolver::ormrq,
-    oneapi::mkl::lapack::cusolver::ormqr,
-    oneapi::mkl::lapack::cusolver::ormqr,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potrf,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potri,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::potrs,
-    oneapi::mkl::lapack::cusolver::syevd,
-    oneapi::mkl::lapack::cusolver::syevd,
-    oneapi::mkl::lapack::cusolver::sygvd,
-    oneapi::mkl::lapack::cusolver::sygvd,
-    oneapi::mkl::lapack::cusolver::sytrd,
-    oneapi::mkl::lapack::cusolver::sytrd,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::sytrf,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::trtrs,
-    oneapi::mkl::lapack::cusolver::ungbr,
-    oneapi::mkl::lapack::cusolver::ungbr,
-    oneapi::mkl::lapack::cusolver::ungqr,
-    oneapi::mkl::lapack::cusolver::ungqr,
-    oneapi::mkl::lapack::cusolver::ungtr,
-    oneapi::mkl::lapack::cusolver::ungtr,
-    oneapi::mkl::lapack::cusolver::unmrq,
-    oneapi::mkl::lapack::cusolver::unmrq,
-    oneapi::mkl::lapack::cusolver::unmqr,
-    oneapi::mkl::lapack::cusolver::unmqr,
-    oneapi::mkl::lapack::cusolver::unmtr,
-    oneapi::mkl::lapack::cusolver::unmtr,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::orgqr_batch,
-    oneapi::mkl::lapack::cusolver::orgqr_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::ungqr_batch,
-    oneapi::mkl::lapack::cusolver::ungqr_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::orgqr_batch,
-    oneapi::mkl::lapack::cusolver::orgqr_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::ungqr_batch,
-    oneapi::mkl::lapack::cusolver::ungqr_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::geqrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getrf_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getri_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::getrs_batch,
-    oneapi::mkl::lapack::cusolver::orgqr_batch,
-    oneapi::mkl::lapack::cusolver::orgqr_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrf_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::potrs_batch,
-    oneapi::mkl::lapack::cusolver::ungqr_batch,
-    oneapi::mkl::lapack::cusolver::ungqr_batch,
-    oneapi::mkl::lapack::cusolver::gebrd_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::gebrd_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::gebrd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::gebrd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::gerqf_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::gerqf_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::gerqf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::gerqf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::geqrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::geqrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::geqrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::geqrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::gesvd_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::gesvd_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::gesvd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::gesvd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getri_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getri_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getri_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getri_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::heevd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::heevd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::hegvd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::hegvd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::hetrd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::hetrd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::hetrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::hetrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::orgbr_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::orgbr_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::orgtr_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::orgtr_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::orgqr_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::orgqr_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::ormrq_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::ormrq_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::ormqr_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::ormqr_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::ormtr_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::ormtr_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::potrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::potrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::potrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::potrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::potrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::potri_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::potri_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potri_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::potri_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::sytrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::sytrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::sytrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::sytrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::syevd_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::syevd_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::sygvd_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::sygvd_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::sytrd_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::sytrd_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::trtrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::trtrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::trtrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::trtrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::ungbr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::ungbr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::ungqr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::ungqr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::ungtr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::ungtr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::unmrq_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::unmrq_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::unmqr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::unmqr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::unmtr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::unmtr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::orgqr_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::orgqr_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::ungqr_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::ungqr_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::orgqr_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::orgqr_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::cusolver::ungqr_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::cusolver::ungqr_batch_scratchpad_size<std::complex<double>>
-#undef LAPACK_BACKEND
-};
diff --git a/src/lapack/backends/mkl_common/lapack_wrappers.cxx b/src/lapack/backends/mkl_common/lapack_wrappers.cxx
deleted file mode 100644
index a80f807b6..000000000
--- a/src/lapack/backends/mkl_common/lapack_wrappers.cxx
+++ /dev/null
@@ -1,331 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-oneapi::mkl::lapack::LAPACK_BACKEND::gebrd, oneapi::mkl::lapack::LAPACK_BACKEND::gebrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd, oneapi::mkl::lapack::LAPACK_BACKEND::gebrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf, oneapi::mkl::lapack::LAPACK_BACKEND::gerqf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf, oneapi::mkl::lapack::LAPACK_BACKEND::gerqf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf, oneapi::mkl::lapack::LAPACK_BACKEND::geqrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf, oneapi::mkl::lapack::LAPACK_BACKEND::geqrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf, oneapi::mkl::lapack::LAPACK_BACKEND::getrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf, oneapi::mkl::lapack::LAPACK_BACKEND::getrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri, oneapi::mkl::lapack::LAPACK_BACKEND::getri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri, oneapi::mkl::lapack::LAPACK_BACKEND::getri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs, oneapi::mkl::lapack::LAPACK_BACKEND::getrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs, oneapi::mkl::lapack::LAPACK_BACKEND::getrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd, oneapi::mkl::lapack::LAPACK_BACKEND::gesvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd, oneapi::mkl::lapack::LAPACK_BACKEND::gesvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::heevd, oneapi::mkl::lapack::LAPACK_BACKEND::heevd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hegvd, oneapi::mkl::lapack::LAPACK_BACKEND::hegvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrd, oneapi::mkl::lapack::LAPACK_BACKEND::hetrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrf, oneapi::mkl::lapack::LAPACK_BACKEND::hetrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgbr, oneapi::mkl::lapack::LAPACK_BACKEND::orgbr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr, oneapi::mkl::lapack::LAPACK_BACKEND::orgqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgtr, oneapi::mkl::lapack::LAPACK_BACKEND::orgtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormtr, oneapi::mkl::lapack::LAPACK_BACKEND::ormtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormrq, oneapi::mkl::lapack::LAPACK_BACKEND::ormrq,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormqr, oneapi::mkl::lapack::LAPACK_BACKEND::ormqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf, oneapi::mkl::lapack::LAPACK_BACKEND::potrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf, oneapi::mkl::lapack::LAPACK_BACKEND::potrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri, oneapi::mkl::lapack::LAPACK_BACKEND::potri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri, oneapi::mkl::lapack::LAPACK_BACKEND::potri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs, oneapi::mkl::lapack::LAPACK_BACKEND::potrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs, oneapi::mkl::lapack::LAPACK_BACKEND::potrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::syevd, oneapi::mkl::lapack::LAPACK_BACKEND::syevd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sygvd, oneapi::mkl::lapack::LAPACK_BACKEND::sygvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrd, oneapi::mkl::lapack::LAPACK_BACKEND::sytrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf, oneapi::mkl::lapack::LAPACK_BACKEND::sytrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf, oneapi::mkl::lapack::LAPACK_BACKEND::sytrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs, oneapi::mkl::lapack::LAPACK_BACKEND::trtrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs, oneapi::mkl::lapack::LAPACK_BACKEND::trtrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungbr, oneapi::mkl::lapack::LAPACK_BACKEND::ungbr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr, oneapi::mkl::lapack::LAPACK_BACKEND::ungqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungtr, oneapi::mkl::lapack::LAPACK_BACKEND::ungtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmrq, oneapi::mkl::lapack::LAPACK_BACKEND::unmrq,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmqr, oneapi::mkl::lapack::LAPACK_BACKEND::unmqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmtr, oneapi::mkl::lapack::LAPACK_BACKEND::unmtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd, oneapi::mkl::lapack::LAPACK_BACKEND::gebrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd, oneapi::mkl::lapack::LAPACK_BACKEND::gebrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf, oneapi::mkl::lapack::LAPACK_BACKEND::gerqf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf, oneapi::mkl::lapack::LAPACK_BACKEND::gerqf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf, oneapi::mkl::lapack::LAPACK_BACKEND::geqrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf, oneapi::mkl::lapack::LAPACK_BACKEND::geqrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf, oneapi::mkl::lapack::LAPACK_BACKEND::getrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf, oneapi::mkl::lapack::LAPACK_BACKEND::getrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri, oneapi::mkl::lapack::LAPACK_BACKEND::getri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri, oneapi::mkl::lapack::LAPACK_BACKEND::getri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs, oneapi::mkl::lapack::LAPACK_BACKEND::getrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs, oneapi::mkl::lapack::LAPACK_BACKEND::getrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd, oneapi::mkl::lapack::LAPACK_BACKEND::gesvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd, oneapi::mkl::lapack::LAPACK_BACKEND::gesvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::heevd, oneapi::mkl::lapack::LAPACK_BACKEND::heevd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hegvd, oneapi::mkl::lapack::LAPACK_BACKEND::hegvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrd, oneapi::mkl::lapack::LAPACK_BACKEND::hetrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrf, oneapi::mkl::lapack::LAPACK_BACKEND::hetrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgbr, oneapi::mkl::lapack::LAPACK_BACKEND::orgbr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr, oneapi::mkl::lapack::LAPACK_BACKEND::orgqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgtr, oneapi::mkl::lapack::LAPACK_BACKEND::orgtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormtr, oneapi::mkl::lapack::LAPACK_BACKEND::ormtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormrq, oneapi::mkl::lapack::LAPACK_BACKEND::ormrq,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormqr, oneapi::mkl::lapack::LAPACK_BACKEND::ormqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf, oneapi::mkl::lapack::LAPACK_BACKEND::potrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf, oneapi::mkl::lapack::LAPACK_BACKEND::potrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri, oneapi::mkl::lapack::LAPACK_BACKEND::potri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri, oneapi::mkl::lapack::LAPACK_BACKEND::potri,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs, oneapi::mkl::lapack::LAPACK_BACKEND::potrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs, oneapi::mkl::lapack::LAPACK_BACKEND::potrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::syevd, oneapi::mkl::lapack::LAPACK_BACKEND::syevd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sygvd, oneapi::mkl::lapack::LAPACK_BACKEND::sygvd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrd, oneapi::mkl::lapack::LAPACK_BACKEND::sytrd,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf, oneapi::mkl::lapack::LAPACK_BACKEND::sytrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf, oneapi::mkl::lapack::LAPACK_BACKEND::sytrf,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs, oneapi::mkl::lapack::LAPACK_BACKEND::trtrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs, oneapi::mkl::lapack::LAPACK_BACKEND::trtrs,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungbr, oneapi::mkl::lapack::LAPACK_BACKEND::ungbr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr, oneapi::mkl::lapack::LAPACK_BACKEND::ungqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungtr, oneapi::mkl::lapack::LAPACK_BACKEND::ungtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmrq, oneapi::mkl::lapack::LAPACK_BACKEND::unmrq,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmqr, oneapi::mkl::lapack::LAPACK_BACKEND::unmqr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmtr, oneapi::mkl::lapack::LAPACK_BACKEND::unmtr,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gebrd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gerqf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::gesvd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::heevd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::heevd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hegvd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hegvd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::hetrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgbr_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgbr_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgtr_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgtr_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormrq_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormrq_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormqr_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormqr_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormtr_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ormtr_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potri_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::syevd_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::syevd_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sygvd_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sygvd_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrd_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::sytrd_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::trtrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungbr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungbr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungtr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungtr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmrq_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmrq_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmqr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmqr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmtr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::unmtr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch_scratchpad_size<std::complex<double>>
diff --git a/src/lapack/backends/mkl_common/mkl_lapack.cxx b/src/lapack/backends/mkl_common/mkl_lapack.cxx
deleted file mode 100644
index 8573bffd9..000000000
--- a/src/lapack/backends/mkl_common/mkl_lapack.cxx
+++ /dev/null
@@ -1,2793 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<std::complex<float>> &tauq, sycl::buffer<std::complex<float>> &taup,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                 scratchpad_size);
-}
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &d, sycl::buffer<double> &e,
-           sycl::buffer<double> &tauq, sycl::buffer<double> &taup, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                 scratchpad_size);
-}
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<float> &tauq, sycl::buffer<float> &taup, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                 scratchpad_size);
-}
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<std::complex<double>> &tauq,
-           sycl::buffer<std::complex<double>> &taup, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                 scratchpad_size);
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<double> &b, std::int64_t ldb, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-           std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &s,
-           sycl::buffer<double> &u, std::int64_t ldu, sycl::buffer<double> &vt, std::int64_t ldvt,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad,
-                                 scratchpad_size);
-}
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &s,
-           sycl::buffer<float> &u, std::int64_t ldu, sycl::buffer<float> &vt, std::int64_t ldvt,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad,
-                                 scratchpad_size);
-}
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<float> &s, sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-           sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad,
-                                 scratchpad_size);
-}
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<double> &s, sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-           sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad,
-                                 scratchpad_size);
-}
-void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<float> &w,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size);
-}
-void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &w,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size);
-}
-void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb, sycl::buffer<float> &w,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                 scratchpad_size);
-}
-void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb, sycl::buffer<double> &w,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                 scratchpad_size);
-}
-void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<float> &d,
-           sycl::buffer<float> &e, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size);
-}
-void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<std::complex<double>> &tau,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size);
-}
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::hetrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::hetrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size);
-}
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size);
-}
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size);
-}
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size);
-}
-void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &w,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size);
-}
-void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &w,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size);
-}
-void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-           std::int64_t ldb, sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                 scratchpad_size);
-}
-void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-           std::int64_t ldb, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                 scratchpad_size);
-}
-void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &d, sycl::buffer<double> &e,
-           sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size);
-}
-void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size);
-}
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad,
-                                 scratchpad_size);
-}
-void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad,
-                                 scratchpad_size);
-}
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, float *d, float *e, std::complex<float> *tauq,
-                  std::complex<float> *taup, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *d, double *e, double *tauq, double *taup, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *d, float *e, float *tauq, float *taup, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, double *d, double *e, std::complex<double> *tauq,
-                  std::complex<double> *taup, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::int64_t *ipiv, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::int64_t *ipiv, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                  std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                  std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *b, std::int64_t ldb, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, double *b,
-                  std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, float *b,
-                  std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *b, std::int64_t ldb, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, double *a, std::int64_t lda, double *s, double *u,
-                  std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, float *a, std::int64_t lda, float *s, float *u,
-                  std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  float *s, std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                  std::int64_t ldvt, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  double *s, std::complex<double> *u, std::int64_t ldu, std::complex<double> *vt,
-                  std::int64_t ldvt, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, float *w,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, double *w,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *b, std::int64_t ldb, float *w,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *b, std::int64_t ldb, double *w,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                  std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                  std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::hetrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::hetrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, float *a, std::int64_t lda, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, double *a, std::int64_t lda, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                  std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                  std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float *a,
-                  std::int64_t lda, float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double *a,
-                  std::int64_t lda, double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                  float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                  double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                  double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                  float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  float *a, std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  double *a, std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                  std::int64_t ldb, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  double *a, std::int64_t lda, double *w, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  float *a, std::int64_t lda, float *w, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, double *b,
-                  std::int64_t ldb, double *w, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, float *b,
-                  std::int64_t ldb, float *w, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *d, double *e, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *d, float *e, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a,
-                  std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                  std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size, dependencies);
-}
-sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size,
-                                        dependencies);
-}
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *c,
-                  std::int64_t ldc, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *c,
-                  std::int64_t ldc, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *c,
-                  std::int64_t ldc, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *c,
-                  std::int64_t ldc, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *c, std::int64_t ldc, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *c, std::int64_t ldc, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size, dependencies);
-}
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv, sycl::buffer<float> &b,
-                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv,
-                                       b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv,
-                                       b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<std::complex<float>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv,
-                                       b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<std::complex<double>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv,
-                                       b, ldb, stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size,
-                                       scratchpad, scratchpad_size);
-}
-void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad,
-                                       scratchpad_size);
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad,
-                                       scratchpad_size);
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad,
-                                       scratchpad_size);
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad,
-                                       scratchpad_size);
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                       batch_size, scratchpad, scratchpad_size);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, float *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, double *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::complex<float> *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::complex<double> *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a,
-                        std::int64_t *lda, float **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a,
-                        std::int64_t *lda, double **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::complex<float> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::complex<double> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, float **a, std::int64_t *lda, std::int64_t **ipiv,
-                        float **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                              group_count, group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, double **a, std::int64_t *lda, std::int64_t **ipiv,
-                        double **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                              group_count, group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<float> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::complex<float> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                              group_count, group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::complex<double> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                              group_count, group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float *a, std::int64_t lda, std::int64_t stride_a, float *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        double *a, std::int64_t lda, std::int64_t stride_a, double *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        float **a, std::int64_t *lda, float **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        double **a, std::int64_t *lda, double **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, float **a,
-                        std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, double **a,
-                        std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a,
-                        float *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a,
-                        double *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, float **a, std::int64_t *lda, float **b,
-                        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count,
-                                              group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, double **a, std::int64_t *lda, double **b,
-                        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count,
-                                              group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<float> **a, std::int64_t *lda,
-                        std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count,
-                                              group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda,
-                        std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count,
-                                              group_sizes, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::complex<float> *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::complex<double> *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size,
-                                              dependencies);
-}
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        std::complex<float> **a, std::int64_t *lda, std::complex<float> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        std::complex<double> **a, std::int64_t *lda, std::complex<double> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes,
-                                              scratchpad, scratchpad_size, dependencies);
-}
-
-template <>
-std::int64_t gebrd_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                          std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gebrd_scratchpad_size<float>(queue, m, n, lda);
-}
-template <>
-std::int64_t gebrd_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gebrd_scratchpad_size<double>(queue, m, n, lda);
-}
-template <>
-std::int64_t gebrd_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gebrd_scratchpad_size<std::complex<float>>(queue, m, n, lda);
-}
-template <>
-std::int64_t gebrd_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gebrd_scratchpad_size<std::complex<double>>(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                          std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gerqf_scratchpad_size<float>(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gerqf_scratchpad_size<double>(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gerqf_scratchpad_size<std::complex<float>>(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::gerqf_scratchpad_size<std::complex<double>>(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                          std::int64_t lda) {
-    return ::oneapi::mkl::lapack::geqrf_scratchpad_size<float>(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           std::int64_t lda) {
-    return ::oneapi::mkl::lapack::geqrf_scratchpad_size<double>(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::geqrf_scratchpad_size<std::complex<float>>(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::geqrf_scratchpad_size<std::complex<double>>(queue, m, n, lda);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                          oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                          std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) {
-    return ::oneapi::mkl::lapack::gesvd_scratchpad_size<float>(queue, jobu, jobvt, m, n, lda, ldu,
-                                                               ldvt);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                           oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                           std::int64_t n, std::int64_t lda, std::int64_t ldu,
-                                           std::int64_t ldvt) {
-    return ::oneapi::mkl::lapack::gesvd_scratchpad_size<double>(queue, jobu, jobvt, m, n, lda, ldu,
-                                                                ldvt);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                        oneapi::mkl::jobsvd jobu,
-                                                        oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda,
-                                                        std::int64_t ldu, std::int64_t ldvt) {
-    return ::oneapi::mkl::lapack::gesvd_scratchpad_size<std::complex<float>>(queue, jobu, jobvt, m,
-                                                                             n, lda, ldu, ldvt);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                         oneapi::mkl::jobsvd jobu,
-                                                         oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda,
-                                                         std::int64_t ldu, std::int64_t ldvt) {
-    return ::oneapi::mkl::lapack::gesvd_scratchpad_size<std::complex<double>>(queue, jobu, jobvt, m,
-                                                                              n, lda, ldu, ldvt);
-}
-template <>
-std::int64_t getrf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                          std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getrf_scratchpad_size<float>(queue, m, n, lda);
-}
-template <>
-std::int64_t getrf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getrf_scratchpad_size<double>(queue, m, n, lda);
-}
-template <>
-std::int64_t getrf_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getrf_scratchpad_size<std::complex<float>>(queue, m, n, lda);
-}
-template <>
-std::int64_t getrf_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getrf_scratchpad_size<std::complex<double>>(queue, m, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<float>(sycl::queue &queue, std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getri_scratchpad_size<float>(queue, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<double>(sycl::queue &queue, std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getri_scratchpad_size<double>(queue, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t n,
-                                                        std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getri_scratchpad_size<std::complex<float>>(queue, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t n,
-                                                         std::int64_t lda) {
-    return ::oneapi::mkl::lapack::getri_scratchpad_size<std::complex<double>>(queue, n, lda);
-}
-template <>
-std::int64_t getrs_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                          std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                          std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::getrs_scratchpad_size<float>(queue, trans, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t getrs_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                           std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                           std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::getrs_scratchpad_size<double>(queue, trans, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t getrs_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::getrs_scratchpad_size<std::complex<float>>(queue, trans, n, nrhs,
-                                                                             lda, ldb);
-}
-template <>
-std::int64_t getrs_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::getrs_scratchpad_size<std::complex<double>>(queue, trans, n, nrhs,
-                                                                              lda, ldb);
-}
-template <>
-std::int64_t heevd_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda) {
-    return ::oneapi::mkl::lapack::heevd_scratchpad_size<std::complex<float>>(queue, jobz, uplo, n,
-                                                                             lda);
-}
-template <>
-std::int64_t heevd_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda) {
-    return ::oneapi::mkl::lapack::heevd_scratchpad_size<std::complex<double>>(queue, jobz, uplo, n,
-                                                                              lda);
-}
-template <>
-std::int64_t hegvd_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t itype,
-                                                        oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::hegvd_scratchpad_size<std::complex<float>>(queue, itype, jobz,
-                                                                             uplo, n, lda, ldb);
-}
-template <>
-std::int64_t hegvd_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t itype,
-                                                         oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::hegvd_scratchpad_size<std::complex<double>>(queue, itype, jobz,
-                                                                              uplo, n, lda, ldb);
-}
-template <>
-std::int64_t hetrd_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::hetrd_scratchpad_size<std::complex<float>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t hetrd_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::hetrd_scratchpad_size<std::complex<double>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::hetrf_scratchpad_size<std::complex<float>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::hetrf_scratchpad_size<std::complex<double>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t orgbr_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::generate vect,
-                                          std::int64_t m, std::int64_t n, std::int64_t k,
-                                          std::int64_t lda) {
-    return ::oneapi::mkl::lapack::orgbr_scratchpad_size<float>(queue, vect, m, n, k, lda);
-}
-template <>
-std::int64_t orgbr_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::generate vect,
-                                           std::int64_t m, std::int64_t n, std::int64_t k,
-                                           std::int64_t lda) {
-    return ::oneapi::mkl::lapack::orgbr_scratchpad_size<double>(queue, vect, m, n, k, lda);
-}
-template <>
-std::int64_t orgtr_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::orgtr_scratchpad_size<float>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t orgtr_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::orgtr_scratchpad_size<double>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t orgqr_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                          std::int64_t k, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::orgqr_scratchpad_size<float>(queue, m, n, k, lda);
-}
-template <>
-std::int64_t orgqr_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           std::int64_t k, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::orgqr_scratchpad_size<double>(queue, m, n, k, lda);
-}
-template <>
-std::int64_t ormrq_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                          oneapi::mkl::transpose trans, std::int64_t m,
-                                          std::int64_t n, std::int64_t k, std::int64_t lda,
-                                          std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::ormrq_scratchpad_size<float>(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormrq_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                           oneapi::mkl::transpose trans, std::int64_t m,
-                                           std::int64_t n, std::int64_t k, std::int64_t lda,
-                                           std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::ormrq_scratchpad_size<double>(queue, side, trans, m, n, k, lda,
-                                                                ldc);
-}
-template <>
-std::int64_t ormqr_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                          oneapi::mkl::transpose trans, std::int64_t m,
-                                          std::int64_t n, std::int64_t k, std::int64_t lda,
-                                          std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::ormqr_scratchpad_size<float>(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormqr_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                           oneapi::mkl::transpose trans, std::int64_t m,
-                                           std::int64_t n, std::int64_t k, std::int64_t lda,
-                                           std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::ormqr_scratchpad_size<double>(queue, side, trans, m, n, k, lda,
-                                                                ldc);
-}
-template <>
-std::int64_t ormtr_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                          oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                          std::int64_t m, std::int64_t n, std::int64_t lda,
-                                          std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::ormtr_scratchpad_size<float>(queue, side, uplo, trans, m, n, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormtr_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                           oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                           std::int64_t m, std::int64_t n, std::int64_t lda,
-                                           std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::ormtr_scratchpad_size<double>(queue, side, uplo, trans, m, n, lda,
-                                                                ldc);
-}
-template <>
-std::int64_t potrf_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potrf_scratchpad_size<float>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrf_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potrf_scratchpad_size<double>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrf_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potrf_scratchpad_size<std::complex<float>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrf_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potrf_scratchpad_size<std::complex<double>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrs_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                          std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::potrs_scratchpad_size<float>(queue, uplo, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t potrs_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                           std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::potrs_scratchpad_size<double>(queue, uplo, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t potrs_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::potrs_scratchpad_size<std::complex<float>>(queue, uplo, n, nrhs,
-                                                                             lda, ldb);
-}
-template <>
-std::int64_t potrs_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::potrs_scratchpad_size<std::complex<double>>(queue, uplo, n, nrhs,
-                                                                              lda, ldb);
-}
-template <>
-std::int64_t potri_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potri_scratchpad_size<float>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potri_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potri_scratchpad_size<double>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potri_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potri_scratchpad_size<std::complex<float>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potri_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::potri_scratchpad_size<std::complex<double>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::sytrf_scratchpad_size<float>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::sytrf_scratchpad_size<double>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::sytrf_scratchpad_size<std::complex<float>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::sytrf_scratchpad_size<std::complex<double>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t syevd_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::job jobz,
-                                          oneapi::mkl::uplo uplo, std::int64_t n,
-                                          std::int64_t lda) {
-    return ::oneapi::mkl::lapack::syevd_scratchpad_size<float>(queue, jobz, uplo, n, lda);
-}
-template <>
-std::int64_t syevd_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::job jobz,
-                                           oneapi::mkl::uplo uplo, std::int64_t n,
-                                           std::int64_t lda) {
-    return ::oneapi::mkl::lapack::syevd_scratchpad_size<double>(queue, jobz, uplo, n, lda);
-}
-template <>
-std::int64_t sygvd_scratchpad_size<float>(sycl::queue &queue, std::int64_t itype,
-                                          oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::sygvd_scratchpad_size<float>(queue, itype, jobz, uplo, n, lda,
-                                                               ldb);
-}
-template <>
-std::int64_t sygvd_scratchpad_size<double>(sycl::queue &queue, std::int64_t itype,
-                                           oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t lda, std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::sygvd_scratchpad_size<double>(queue, itype, jobz, uplo, n, lda,
-                                                                ldb);
-}
-template <>
-std::int64_t sytrd_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::sytrd_scratchpad_size<float>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrd_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::sytrd_scratchpad_size<double>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                          oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                          std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                          std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::trtrs_scratchpad_size<float>(queue, uplo, trans, diag, n, nrhs,
-                                                               lda, ldb);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                           oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                           std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                           std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::trtrs_scratchpad_size<double>(queue, uplo, trans, diag, n, nrhs,
-                                                                lda, ldb);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        oneapi::mkl::diag diag, std::int64_t n,
-                                                        std::int64_t nrhs, std::int64_t lda,
-                                                        std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::trtrs_scratchpad_size<std::complex<float>>(
-        queue, uplo, trans, diag, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         oneapi::mkl::diag diag, std::int64_t n,
-                                                         std::int64_t nrhs, std::int64_t lda,
-                                                         std::int64_t ldb) {
-    return ::oneapi::mkl::lapack::trtrs_scratchpad_size<std::complex<double>>(
-        queue, uplo, trans, diag, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t ungbr_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                        oneapi::mkl::generate vect, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda) {
-    return ::oneapi::mkl::lapack::ungbr_scratchpad_size<std::complex<float>>(queue, vect, m, n, k,
-                                                                             lda);
-}
-template <>
-std::int64_t ungbr_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                         oneapi::mkl::generate vect, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda) {
-    return ::oneapi::mkl::lapack::ungbr_scratchpad_size<std::complex<double>>(queue, vect, m, n, k,
-                                                                              lda);
-}
-template <>
-std::int64_t ungqr_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda) {
-    return ::oneapi::mkl::lapack::ungqr_scratchpad_size<std::complex<float>>(queue, m, n, k, lda);
-}
-template <>
-std::int64_t ungqr_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda) {
-    return ::oneapi::mkl::lapack::ungqr_scratchpad_size<std::complex<double>>(queue, m, n, k, lda);
-}
-template <>
-std::int64_t ungtr_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::ungtr_scratchpad_size<std::complex<float>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t ungtr_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return ::oneapi::mkl::lapack::ungtr_scratchpad_size<std::complex<double>>(queue, uplo, n, lda);
-}
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::unmrq_scratchpad_size<std::complex<float>>(queue, side, trans, m,
-                                                                             n, k, lda, ldc);
-}
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::unmrq_scratchpad_size<std::complex<double>>(queue, side, trans, m,
-                                                                              n, k, lda, ldc);
-}
-template <>
-std::int64_t unmqr_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::unmqr_scratchpad_size<std::complex<float>>(queue, side, trans, m,
-                                                                             n, k, lda, ldc);
-}
-template <>
-std::int64_t unmqr_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::unmqr_scratchpad_size<std::complex<double>>(queue, side, trans, m,
-                                                                              n, k, lda, ldc);
-}
-template <>
-std::int64_t unmtr_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::unmtr_scratchpad_size<std::complex<float>>(queue, side, uplo,
-                                                                             trans, m, n, lda, ldc);
-}
-template <>
-std::int64_t unmtr_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldc) {
-    return ::oneapi::mkl::lapack::unmtr_scratchpad_size<std::complex<double>>(
-        queue, side, uplo, trans, m, n, lda, ldc);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<float>(queue, m, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_ipiv,
-                                                 std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<double>(queue, m, n, lda, stride_a,
-                                                                      stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_ipiv,
-                                                              std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<std::complex<float>>(
-        queue, m, n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_ipiv,
-                                                               std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<std::complex<double>>(
-        queue, m, n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t n,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<float>(queue, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t n,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_ipiv,
-                                                 std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<double>(queue, n, lda, stride_a,
-                                                                      stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t n,
-                                                              std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_ipiv,
-                                                              std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<std::complex<float>>(
-        queue, n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t n,
-                                                               std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_ipiv,
-                                                               std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<std::complex<double>>(
-        queue, n, lda, stride_a, stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                std::int64_t ldb, std::int64_t stride_b,
-                                                std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<float>(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_ipiv, std::int64_t ldb,
-                                                 std::int64_t stride_b, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<double>(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<std::complex<float>>(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<std::complex<double>>(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t stride_tau, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<float>(queue, m, n, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_tau, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<double>(queue, m, n, lda, stride_a,
-                                                                      stride_tau, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_tau,
-                                                              std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<std::complex<float>>(
-        queue, m, n, lda, stride_a, stride_tau, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_tau,
-                                                               std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<std::complex<double>>(
-        queue, m, n, lda, stride_a, stride_tau, batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<float>(queue, uplo, n, lda, stride_a,
-                                                                     batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<double>(queue, uplo, n, lda, stride_a,
-                                                                      batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                              oneapi::mkl::uplo uplo,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<std::complex<float>>(
-        queue, uplo, n, lda, stride_a, batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                               oneapi::mkl::uplo uplo,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<std::complex<double>>(
-        queue, uplo, n, lda, stride_a, batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t ldb,
-                                                std::int64_t stride_b, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<float>(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t ldb, std::int64_t stride_b,
-                                                 std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<double>(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<std::complex<float>>(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<std::complex<double>>(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t k, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_tau,
-                                                std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::orgqr_batch_scratchpad_size<float>(queue, m, n, k, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t k, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t stride_tau,
-                                                 std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::orgqr_batch_scratchpad_size<double>(queue, m, n, k, lda, stride_a,
-                                                                      stride_tau, batch_size);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size<std::complex<float>>(
-        queue, m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size<std::complex<double>>(
-        queue, m, n, k, lda, stride_a, stride_tau, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<float>(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                 std::int64_t *n, std::int64_t *lda,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<double>(queue, m, n, lda, group_count,
-                                                                      group_sizes);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<std::complex<float>>(
-        queue, m, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size<std::complex<double>>(
-        queue, m, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *n,
-                                                std::int64_t *lda, std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<float>(queue, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *n,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<double>(queue, n, lda, group_count,
-                                                                      group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *n,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<std::complex<float>>(
-        queue, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *n,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getri_batch_scratchpad_size<std::complex<double>>(
-        queue, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                                std::int64_t *n, std::int64_t *nrhs,
-                                                std::int64_t *lda, std::int64_t *ldb,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<float>(queue, trans, n, nrhs, lda,
-                                                                     ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                                 std::int64_t *n, std::int64_t *nrhs,
-                                                 std::int64_t *lda, std::int64_t *ldb,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<double>(
-        queue, trans, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<std::complex<float>>(
-        queue, trans, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size<std::complex<double>>(
-        queue, trans, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<float>(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                 std::int64_t *n, std::int64_t *lda,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<double>(queue, m, n, lda, group_count,
-                                                                      group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<std::complex<float>>(
-        queue, m, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size<std::complex<double>>(
-        queue, m, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                std::int64_t *n, std::int64_t *k, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::orgqr_batch_scratchpad_size<float>(queue, m, n, k, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                 std::int64_t *n, std::int64_t *k,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::orgqr_batch_scratchpad_size<double>(queue, m, n, k, lda,
-                                                                      group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<float>(queue, uplo, n, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                 std::int64_t *n, std::int64_t *lda,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<double>(queue, uplo, n, lda,
-                                                                      group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                              oneapi::mkl::uplo *uplo,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<std::complex<float>>(
-        queue, uplo, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                               oneapi::mkl::uplo *uplo,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size<std::complex<double>>(
-        queue, uplo, n, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                std::int64_t *n, std::int64_t *nrhs,
-                                                std::int64_t *lda, std::int64_t *ldb,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<float>(queue, uplo, n, nrhs, lda, ldb,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                 std::int64_t *n, std::int64_t *nrhs,
-                                                 std::int64_t *lda, std::int64_t *ldb,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<double>(
-        queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<std::complex<float>>(
-        queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size<std::complex<double>>(
-        queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *k,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size<std::complex<float>>(
-        queue, m, n, k, lda, group_count, group_sizes);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *k,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size<std::complex<double>>(
-        queue, m, n, k, lda, group_count, group_sizes);
-}
diff --git a/src/lapack/backends/mkl_common/mkl_lapack_backend.hpp b/src/lapack/backends/mkl_common/mkl_lapack_backend.hpp
deleted file mode 100644
index 1932bb959..000000000
--- a/src/lapack/backends/mkl_common/mkl_lapack_backend.hpp
+++ /dev/null
@@ -1,1263 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<std::complex<float>> &tauq, sycl::buffer<std::complex<float>> &taup,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &d, sycl::buffer<double> &e,
-           sycl::buffer<double> &tauq, sycl::buffer<double> &taup, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<float> &tauq, sycl::buffer<float> &taup, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<std::complex<double>> &tauq,
-           sycl::buffer<std::complex<double>> &taup, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<double> &b, std::int64_t ldb, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-           std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &s,
-           sycl::buffer<double> &u, std::int64_t ldu, sycl::buffer<double> &vt, std::int64_t ldvt,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &s,
-           sycl::buffer<float> &u, std::int64_t ldu, sycl::buffer<float> &vt, std::int64_t ldvt,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<float> &s, sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-           sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-           std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<double> &s, sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-           sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<float> &w,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &w,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb, sycl::buffer<float> &w,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb, sycl::buffer<double> &w,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<float> &d,
-           sycl::buffer<float> &e, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<std::complex<double>> &tau,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &w,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &w,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-           std::int64_t ldb, sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-           std::int64_t n, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-           std::int64_t ldb, sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &d, sycl::buffer<double> &e,
-           sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size);
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size);
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size);
-void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size);
-void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size);
-void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size);
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size);
-void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size);
-void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size);
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size);
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size);
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size);
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size);
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv, sycl::buffer<float> &b,
-                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<std::complex<float>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<std::complex<double>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size);
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size);
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size);
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size);
-void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size);
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, float *d, float *e, std::complex<float> *tauq,
-                  std::complex<float> *taup, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *d, double *e, double *tauq, double *taup, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *d, float *e, float *tauq, float *taup, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, double *d, double *e, std::complex<double> *tauq,
-                  std::complex<double> *taup, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::int64_t *ipiv, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::int64_t *ipiv, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                  std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                  std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *b, std::int64_t ldb, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, double *b,
-                  std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, float *b,
-                  std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *b, std::int64_t ldb, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, double *a, std::int64_t lda, double *s, double *u,
-                  std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, float *a, std::int64_t lda, float *s, float *u,
-                  std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  float *s, std::complex<float> *u, std::int64_t ldu, std::complex<float> *vt,
-                  std::int64_t ldvt, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                  std::int64_t m, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  double *s, std::complex<double> *u, std::int64_t ldu, std::complex<double> *vt,
-                  std::int64_t ldvt, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, float *w,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, double *w,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *b, std::int64_t ldb, float *w,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *b, std::int64_t ldb, double *w,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                  std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                  std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, float *a, std::int64_t lda, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, double *a, std::int64_t lda, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, double *a,
-                  std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, float *a,
-                  std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float *a,
-                  std::int64_t lda, float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double *a,
-                  std::int64_t lda, double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                  float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                  double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                  double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                  float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  float *a, std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  double *a, std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                  std::int64_t ldb, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  double *a, std::int64_t lda, double *w, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  float *a, std::int64_t lda, float *w, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, double *b,
-                  std::int64_t ldb, double *w, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, float *b,
-                  std::int64_t ldb, float *w, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *d, double *e, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *d, float *e, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a,
-                  std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                  std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,
-                  std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *c,
-                  std::int64_t ldc, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *c,
-                  std::int64_t ldc, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *c,
-                  std::int64_t ldc, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *c,
-                  std::int64_t ldc, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies = {});
-sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *c, std::int64_t ldc, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *c, std::int64_t ldc, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, float *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, double *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::complex<float> *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::complex<double> *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a,
-                        std::int64_t *lda, float **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a,
-                        std::int64_t *lda, double **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::complex<float> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::complex<double> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, float **a, std::int64_t *lda, std::int64_t **ipiv,
-                        float **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, double **a, std::int64_t *lda, std::int64_t **ipiv,
-                        double **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<float> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::complex<float> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::complex<double> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float *a, std::int64_t lda, std::int64_t stride_a, float *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        double *a, std::int64_t lda, std::int64_t stride_a, double *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        float **a, std::int64_t *lda, float **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        double **a, std::int64_t *lda, double **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, float **a,
-                        std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, double **a,
-                        std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a,
-                        float *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a,
-                        double *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, float **a, std::int64_t *lda, float **b,
-                        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, double **a, std::int64_t *lda, double **b,
-                        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<float> **a, std::int64_t *lda,
-                        std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda,
-                        std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::complex<float> *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::complex<double> *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        std::complex<float> **a, std::int64_t *lda, std::complex<float> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        std::complex<double> **a, std::int64_t *lda, std::complex<double> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies = {});
-
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldu, std::int64_t ldvt);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda, std::int64_t ldu, std::int64_t ldvt);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                   std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t nrhs, std::int64_t lda, std::int64_t ldb);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type, internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type, internal::is_floating_point<fp_type> = nullptr>
-std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                   std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                   std::int64_t ldb);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::int64_t lda);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type, internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::int64_t lda, std::int64_t ldc);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::int64_t lda,
-                                   std::int64_t ldc);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t stride_ipiv,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t stride_tau, std::int64_t batch_size);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_real_floating_point<fp_type> = nullptr>
-std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-template <typename fp_type, oneapi::mkl::lapack::internal::is_floating_point<fp_type> = nullptr>
-std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes);
-template <typename fp_type,
-          oneapi::mkl::lapack::internal::is_complex_floating_point<fp_type> = nullptr>
-std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes);
-
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/mklcpu/CMakeLists.txt b/src/lapack/backends/mklcpu/CMakeLists.txt
deleted file mode 100644
index fcc60a8e7..000000000
--- a/src/lapack/backends/mklcpu/CMakeLists.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_lapack_mklcpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  mkl_lapack.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: lapack_cpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_lapack ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::LAPACK)
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_SYCL::LAPACK)
-else()
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_DPCPP)
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/lapack/backends/mklcpu/lapack_cpu_wrappers.cpp b/src/lapack/backends/mklcpu/lapack_cpu_wrappers.cpp
deleted file mode 100644
index 4bd0713fa..000000000
--- a/src/lapack/backends/mklcpu/lapack_cpu_wrappers.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "lapack/function_table.hpp"
-#include "oneapi/mkl/lapack/detail/mklcpu/onemkl_lapack_mklcpu.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT lapack_function_table_t mkl_lapack_table = {
-    WRAPPER_VERSION,
-#define LAPACK_BACKEND mklcpu
-#include "../mkl_common/lapack_wrappers.cxx"
-#undef LAPACK_BACKEND
-};
diff --git a/src/lapack/backends/mklcpu/mkl_lapack.cpp b/src/lapack/backends/mklcpu/mkl_lapack.cpp
deleted file mode 100644
index cbd3aaa84..000000000
--- a/src/lapack/backends/mklcpu/mkl_lapack.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/lapack/detail/mklcpu/onemkl_lapack_mklcpu.hpp"
-#include "../mkl_common/mkl_lapack_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace mklcpu {
-
-#include "../mkl_common/mkl_lapack.cxx"
-
-} // namespace mklcpu
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/mklgpu/CMakeLists.txt b/src/lapack/backends/mklgpu/CMakeLists.txt
deleted file mode 100644
index e11592f82..000000000
--- a/src/lapack/backends/mklgpu/CMakeLists.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_lapack_mklgpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  mkl_lapack.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: lapack_gpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_lapack ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::LAPACK)
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_SYCL::LAPACK)
-else()
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_DPCPP)
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/lapack/backends/mklgpu/lapack_gpu_wrappers.cpp b/src/lapack/backends/mklgpu/lapack_gpu_wrappers.cpp
deleted file mode 100644
index cd3933274..000000000
--- a/src/lapack/backends/mklgpu/lapack_gpu_wrappers.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "lapack/function_table.hpp"
-#include "oneapi/mkl/lapack/detail/mklgpu/onemkl_lapack_mklgpu.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT lapack_function_table_t mkl_lapack_table = {
-    WRAPPER_VERSION,
-#define LAPACK_BACKEND mklgpu
-#include "../mkl_common/lapack_wrappers.cxx"
-#undef LAPACK_BACKEND
-};
diff --git a/src/lapack/backends/mklgpu/mkl_lapack.cpp b/src/lapack/backends/mklgpu/mkl_lapack.cpp
deleted file mode 100644
index f7bc2a7e6..000000000
--- a/src/lapack/backends/mklgpu/mkl_lapack.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-#include "oneapi/mkl/lapack/types.hpp"
-#include "oneapi/mkl/lapack/detail/mklgpu/onemkl_lapack_mklgpu.hpp"
-#include "../mkl_common/mkl_lapack_backend.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace mklgpu {
-
-#include "../mkl_common/mkl_lapack.cxx"
-
-} // namespace mklgpu
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/rocsolver/CMakeLists.txt b/src/lapack/backends/rocsolver/CMakeLists.txt
deleted file mode 100644
index c91089118..000000000
--- a/src/lapack/backends/rocsolver/CMakeLists.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-#==========================================================================
-#  Copyright (C) Codeplay Software Limited
-#  Copyright 2022 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  For your convenience, a copy of the License has been included in this
-#  repository.
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-#=========================================================================
-
-set(LIB_NAME onemkl_lapack_rocsolver)
-set(LIB_OBJ ${LIB_NAME}_obj)
-find_package(hip REQUIRED)
-find_package(rocsolver REQUIRED)
-find_package(Threads REQUIRED)
-
-set(SOURCES rocsolver_lapack.cpp
-		rocsolver_batch.cpp
-	        $<$<STREQUAL:${ONEMKL_SYCL_IMPLEMENTATION},dpc++>:rocsolver_scope_handle.cpp>
-          $<$<BOOL:${BUILD_SHARED_LIBS}>: rocsolver_wrappers.cpp>)
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_lapack ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-target_link_libraries(${LIB_OBJ} PRIVATE roc::rocsolver hip::host Threads::Threads)
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL)
-target_compile_features(${LIB_OBJ} PUBLIC cxx_std_17)
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON)
-
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
-
diff --git a/src/lapack/backends/rocsolver/rocsolver_batch.cpp b/src/lapack/backends/rocsolver/rocsolver_batch.cpp
deleted file mode 100644
index 0b4b877e8..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_batch.cpp
+++ /dev/null
@@ -1,1066 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "rocsolver_helper.hpp"
-#include "rocsolver_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-// BATCH BUFFER API
-
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<float> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<double> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri_batch");
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri_batch");
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri_batch");
-}
-void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri_batch");
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv, sycl::buffer<float> &b,
-                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<std::complex<float>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<std::complex<double>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "orgqr_batch");
-}
-void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "orgqr_batch");
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                 std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "ungqr_batch");
-}
-void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "ungqr_batch");
-}
-
-// BATCH USM API
-
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, float *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, double *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::complex<float> *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::complex<double> *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a,
-                        std::int64_t *lda, float **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a,
-                        std::int64_t *lda, double **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::complex<float> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::complex<double> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "geqrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrf_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<float> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex<double> **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, float *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, float **a, std::int64_t *lda, std::int64_t **ipiv,
-                        float **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, double **a, std::int64_t *lda, std::int64_t **ipiv,
-                        double **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<float> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::complex<float> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-                        std::int64_t *nrhs, std::complex<double> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::complex<double> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getrs_batch");
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        float *a, std::int64_t lda, std::int64_t stride_a, float *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "orgqr_batch");
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        double *a, std::int64_t lda, std::int64_t stride_a, double *tau,
-                        std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "orgqr_batch");
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        float **a, std::int64_t *lda, float **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "orgqr_batch");
-}
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        double **a, std::int64_t *lda, double **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "orgqr_batch");
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrf_batch");
-}
-
-template <typename Func, typename T>
-inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::uplo *uplo, std::int64_t *n, T **a, std::int64_t *lda,
-                               std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-
-    int64_t batch_size = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(n[i], lda[i], group_sizes[i]);
-        batch_size += group_sizes[i];
-    }
-
-    int *info = (int *)malloc_device(sizeof(int) * batch_size, queue);
-    T **a_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue);
-    auto done_cpy =
-        queue.submit([&](sycl::handler &h) { h.memcpy(a_dev, a, batch_size * sizeof(T *)); });
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        cgh.depends_on(done_cpy);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            rocblas_status err;
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<rocmDataType **>(a_dev);
-                auto *info_ = reinterpret_cast<rocblas_int *>(info);
-                ROCSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_rocblas_fill_mode(uplo[i]),
-                                       (int)n[i], a_ + offset, (int)lda[i], info_ + offset,
-                                       (int)group_sizes[i]);
-                offset += group_sizes[i];
-            }
-        });
-    });
-    return done;
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRF_BATCH_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                          \
-    sycl::event potrf_batch(                                                                       \
-        sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, TYPE **a, std::int64_t *lda, \
-        std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad,                     \
-        std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {              \
-        return potrf_batch(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda,          \
-                           group_count, group_sizes, scratchpad, scratchpad_size, dependencies);   \
-    }
-
-POTRF_BATCH_LAUNCHER_USM(float, rocsolver_spotrf_batched)
-POTRF_BATCH_LAUNCHER_USM(double, rocsolver_dpotrf_batched)
-POTRF_BATCH_LAUNCHER_USM(std::complex<float>, rocsolver_cpotrf_batched)
-POTRF_BATCH_LAUNCHER_USM(std::complex<double>, rocsolver_zpotrf_batched)
-
-#undef POTRF_BATCH_LAUNCHER_USM
-
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a,
-                        float *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a,
-                        double *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<double> *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "potrs_batch");
-}
-
-template <typename Func, typename T>
-inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &queue,
-                               oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, T **a,
-                               std::int64_t *lda, T **b, std::int64_t *ldb,
-                               std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad,
-                               std::int64_t scratchpad_size,
-                               const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-
-    int64_t batch_size = 0;
-    for (int64_t i = 0; i < group_count; i++) {
-        overflow_check(n[i], lda[i], group_sizes[i]);
-        batch_size += group_sizes[i];
-
-        // rocsolver function only supports nrhs = 1
-        if (nrhs[i] != 1)
-            throw unimplemented("lapack", "potrs_batch",
-                                "rocsolver potrs_batch only supports nrhs = 1");
-    }
-
-    T **a_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue);
-    T **b_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue);
-    auto done_cpy_a =
-        queue.submit([&](sycl::handler &h) { h.memcpy(a_dev, a, batch_size * sizeof(T *)); });
-
-    auto done_cpy_b =
-        queue.submit([&](sycl::handler &h) { h.memcpy(b_dev, b, batch_size * sizeof(T *)); });
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        cgh.depends_on(done_cpy_a);
-        cgh.depends_on(done_cpy_b);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            int64_t offset = 0;
-            rocblas_status err;
-            for (int64_t i = 0; i < group_count; i++) {
-                auto **a_ = reinterpret_cast<rocmDataType **>(a_dev);
-                auto **b_ = reinterpret_cast<rocmDataType **>(b_dev);
-                ROCSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_rocblas_fill_mode(uplo[i]),
-                                       (int)n[i], (int)nrhs[i], a_ + offset, (int)lda[i],
-                                       b_ + offset, (int)ldb[i], (int)group_sizes[i]);
-                offset += group_sizes[i];
-            }
-        });
-    });
-    return done;
-}
-
-// Scratchpad memory not needed as parts of buffer a is used as workspace memory
-#define POTRS_BATCH_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                          \
-    sycl::event potrs_batch(                                                                       \
-        sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,          \
-        TYPE **a, std::int64_t *lda, TYPE **b, std::int64_t *ldb, std::int64_t group_count,        \
-        std::int64_t *group_sizes, TYPE *scratchpad, std::int64_t scratchpad_size,                 \
-        const std::vector<sycl::event> &dependencies) {                                            \
-        return potrs_batch(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, \
-                           ldb, group_count, group_sizes, scratchpad, scratchpad_size,             \
-                           dependencies);                                                          \
-    }
-
-POTRS_BATCH_LAUNCHER_USM(float, rocsolver_spotrs_batched)
-POTRS_BATCH_LAUNCHER_USM(double, rocsolver_dpotrs_batched)
-POTRS_BATCH_LAUNCHER_USM(std::complex<float>, rocsolver_cpotrs_batched)
-POTRS_BATCH_LAUNCHER_USM(std::complex<double>, rocsolver_zpotrs_batched)
-
-#undef POTRS_BATCH_LAUNCHER_USM
-
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::complex<float> *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ungqr_batch");
-}
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::complex<double> *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ungqr_batch");
-}
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        std::complex<float> **a, std::int64_t *lda, std::complex<float> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ungqr_batch");
-}
-sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                        std::complex<double> **a, std::int64_t *lda, std::complex<double> **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ungqr_batch");
-}
-
-// BATCH SCRATCHPAD API
-
-template <>
-std::int64_t getrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t stride_ipiv, std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_ipiv,
-                                                 std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_ipiv,
-                                                              std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_ipiv,
-                                                               std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t n,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t stride_ipiv, std::int64_t batch_size) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t n,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_ipiv,
-                                                 std::int64_t batch_size) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t n,
-                                                              std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_ipiv,
-                                                              std::int64_t batch_size) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t n,
-                                                               std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_ipiv,
-                                                               std::int64_t batch_size) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                std::int64_t ldb, std::int64_t stride_b,
-                                                std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_ipiv, std::int64_t ldb,
-                                                 std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t stride_tau, std::int64_t batch_size) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_tau, std::int64_t batch_size) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t stride_tau,
-                                                              std::int64_t batch_size) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t stride_tau,
-                                                               std::int64_t batch_size) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-
-template <>
-std::int64_t potrf_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue,
-                                                              oneapi::mkl::uplo uplo,
-                                                              std::int64_t n, std::int64_t lda,
-                                                              std::int64_t stride_a,
-                                                              std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue,
-                                                               oneapi::mkl::uplo uplo,
-                                                               std::int64_t n, std::int64_t lda,
-                                                               std::int64_t stride_a,
-                                                               std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t ldb,
-                                                std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                 std::int64_t n, std::int64_t nrhs,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t ldb, std::int64_t stride_b,
-                                                 std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    throw unimplemented("lapack", "potrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t k, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_tau,
-                                                std::int64_t batch_size) {
-    throw unimplemented("lapack", "orgqr_batch_scratchpad_size");
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                 std::int64_t k, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t stride_tau,
-                                                 std::int64_t batch_size) {
-    throw unimplemented("lapack", "orgqr_batch_scratchpad_size");
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    throw unimplemented("lapack", "ungqr_batch_scratchpad_size");
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    throw unimplemented("lapack", "ungqr_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                 std::int64_t *n, std::int64_t *lda,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *n,
-                                                std::int64_t *lda, std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *n,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *n,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *n,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getri_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                                std::int64_t *n, std::int64_t *nrhs,
-                                                std::int64_t *lda, std::int64_t *ldb,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                                 std::int64_t *n, std::int64_t *nrhs,
-                                                 std::int64_t *lda, std::int64_t *ldb,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-    std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "getrs_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                 std::int64_t *n, std::int64_t *lda,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "geqrf_batch_scratchpad_size");
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<float>(sycl::queue &queue, std::int64_t *m,
-                                                std::int64_t *n, std::int64_t *k, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "orgqr_batch_scratchpad_size");
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<double>(sycl::queue &queue, std::int64_t *m,
-                                                 std::int64_t *n, std::int64_t *k,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "orgqr_batch_scratchpad_size");
-}
-
-// rocsolverDnXpotrfBatched does not use scratchpad memory
-#define POTRF_GROUP_LAUNCHER_SCRATCH(TYPE)                                                   \
-    template <>                                                                              \
-    std::int64_t potrf_batch_scratchpad_size<TYPE>(                                          \
-        sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * lda, \
-        std::int64_t group_count, std::int64_t * group_sizes) {                              \
-        return 0;                                                                            \
-    }
-
-POTRF_GROUP_LAUNCHER_SCRATCH(float)
-POTRF_GROUP_LAUNCHER_SCRATCH(double)
-POTRF_GROUP_LAUNCHER_SCRATCH(std::complex<float>)
-POTRF_GROUP_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRF_GROUP_LAUNCHER_SCRATCH
-
-// rocsolverDnXpotrsBatched does not use scratchpad memory
-#define POTRS_GROUP_LAUNCHER_SCRATCH(TYPE)                                                    \
-    template <>                                                                               \
-    std::int64_t potrs_batch_scratchpad_size<TYPE>(                                           \
-        sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * nrhs, \
-        std::int64_t * lda, std::int64_t * ldb, std::int64_t group_count,                     \
-        std::int64_t * group_sizes) {                                                         \
-        return 0;                                                                             \
-    }
-
-POTRS_GROUP_LAUNCHER_SCRATCH(float)
-POTRS_GROUP_LAUNCHER_SCRATCH(double)
-POTRS_GROUP_LAUNCHER_SCRATCH(std::complex<float>)
-POTRS_GROUP_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRS_GROUP_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *k,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "ungqr_batch_scratchpad_size");
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *k,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    throw unimplemented("lapack", "ungqr_batch_scratchpad_size");
-}
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/rocsolver/rocsolver_handle.hpp b/src/lapack/backends/rocsolver/rocsolver_handle.hpp
deleted file mode 100644
index c44463ef4..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_handle.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef ROCSOLVER_HANDLE_HPP
-#define ROCSOLVER_HANDLE_HPP
-#include <atomic>
-#include <unordered_map>
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-template <typename T>
-struct rocsolver_handle {
-    using handle_container_t = std::unordered_map<T, std::atomic<rocblas_handle> *>;
-    handle_container_t rocsolver_handle_mapper_{};
-    ~rocsolver_handle() noexcept(false) {
-        for (auto &handle_pair : rocsolver_handle_mapper_) {
-            rocblas_status err;
-            if (handle_pair.second != nullptr) {
-                auto handle = handle_pair.second->exchange(nullptr);
-                if (handle != nullptr) {
-                    ROCSOLVER_ERROR_FUNC(rocblas_destroy_handle, err, handle);
-                    handle = nullptr;
-                }
-                else {
-                    // if the handle is nullptr it means the handle was already
-                    // destroyed by the ContextCallback and we're free to delete the
-                    // atomic object.
-                    delete handle_pair.second;
-                }
-
-                handle_pair.second = nullptr;
-            }
-        }
-        rocsolver_handle_mapper_.clear();
-    }
-};
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-
-#endif // ROCSOLVER_HANDLE_HPP
diff --git a/src/lapack/backends/rocsolver/rocsolver_helper.hpp b/src/lapack/backends/rocsolver/rocsolver_helper.hpp
deleted file mode 100644
index dade1df64..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_helper.hpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-/**
- * @file solver_*.cpp : contain the implementation of all the routines
- * for HIP backend
- */
-#ifndef _ROCSOLVER_HELPER_HPP_
-#define _ROCSOLVER_HELPER_HPP_
-
-#include <CL/sycl.hpp>
-#include <rocblas/rocblas.h>
-#include <rocsolver/rocsolver.h>
-#include <hip/hip_runtime.h>
-#include <complex>
-
-#include "oneapi/mkl/types.hpp"
-#include "runtime_support_helper.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/lapack/exceptions.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-// The static assert to make sure that all index types used in
-// oneMKL/include/oneapi/mkl/lapack.hpp interface are int64_t
-template <typename... Next>
-struct is_int64 : std::false_type {};
-
-template <typename First>
-struct is_int64<First> : std::is_same<std::int64_t, First> {};
-
-template <typename First, typename... Next>
-struct is_int64<First, Next...>
-        : std::integral_constant<bool, std::is_same<std::int64_t, First>::value &&
-                                           is_int64<Next...>::value> {};
-
-template <typename... T>
-struct Overflow {
-    static void inline check(T...) {}
-};
-
-template <typename Index, typename... T>
-struct Overflow<Index, T...> {
-    static void inline check(Index index, T... next) {
-        if (std::abs(index) >= (1LL << 31)) {
-            throw std::runtime_error(
-                "rocsolver index overflow. rocsolver legacy API does not support 64 bit "
-                "integer as data size. Thus, the data size should not be greater than "
-                "maximum supported size by 32 bit integer.");
-        }
-        Overflow<T...>::check(next...);
-    }
-};
-
-template <typename Index, typename... Next>
-void overflow_check(Index index, Next... indices) {
-    static_assert(is_int64<Index, Next...>::value, "oneMKL index type must be 64 bit integer.");
-    Overflow<Index, Next...>::check(index, indices...);
-}
-
-class rocsolver_error : virtual public std::runtime_error {
-protected:
-    inline const char *rocsolver_error_map(rocblas_status error) {
-        return rocblas_status_to_string(error);
-    }
-
-    int error_number; ///< Error number
-public:
-    /** Constructor (C++ STL string, rocblas_status ).
-   *  @param msg The error message
-   *  @param err_num error number
-   */
-    explicit rocsolver_error(std::string message, rocblas_status result)
-            : std::runtime_error((message + std::string(rocsolver_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~rocsolver_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-class hip_error : virtual public std::runtime_error {
-protected:
-    inline const char *hip_error_map(hipError_t result) {
-        return hipGetErrorName(result);
-    }
-    int error_number; ///< error number
-public:
-    /** Constructor (C++ STL string, hipError_t).
-   *  @param msg The error message
-   *  @param err_num Error number
-   */
-    explicit hip_error(std::string message, hipError_t result)
-            : std::runtime_error((message + std::string(hip_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~hip_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-#define HIP_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                           \
-    if (err != HIP_SUCCESS) {                                          \
-        throw hip_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define ROCSOLVER_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                                 \
-    if (err != rocblas_status_success) {                                     \
-        throw rocsolver_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define ROCSOLVER_ERROR_FUNC_T(name, func, err, ...)                        \
-    err = func(__VA_ARGS__);                                                \
-    if (err != rocblas_status_success) {                                    \
-        throw rocsolver_error(std::string(name) + std::string(" : "), err); \
-    }
-
-#define ROCSOLVER_ERROR_FUNC_T_SYNC(name, func, err, handle, ...)            \
-    err = func(handle, __VA_ARGS__);                                         \
-    if (err != rocblas_status_success) {                                     \
-        throw rocsolver_error(std::string(name) + std::string(" : "), err);  \
-    }                                                                        \
-    hipStream_t currentStreamId;                                             \
-    ROCSOLVER_ERROR_FUNC(rocblas_get_stream, err, handle, &currentStreamId); \
-    hipError_t hip_err;                                                      \
-    HIP_ERROR_FUNC(hipStreamSynchronize, hip_err, currentStreamId);
-
-inline rocblas_eform get_rocsolver_itype(std::int64_t itype) {
-    switch (itype) {
-        case 1: return rocblas_eform_ax;
-        case 2: return rocblas_eform_abx;
-        case 3: return rocblas_eform_bax;
-        default: throw "Wrong itype.";
-    }
-}
-
-inline rocblas_evect get_rocsolver_job(oneapi::mkl::job jobz) {
-    switch (jobz) {
-        case oneapi::mkl::job::V: return rocblas_evect_original;
-        case oneapi::mkl::job::N: return rocblas_evect_none;
-        default: throw "Wrong jobz.";
-    }
-}
-
-inline rocblas_svect get_rocsolver_jobsvd(oneapi::mkl::jobsvd job) {
-    switch (job) {
-        case oneapi::mkl::jobsvd::N: return rocblas_svect_none;
-        case oneapi::mkl::jobsvd::A: return rocblas_svect_all;
-        case oneapi::mkl::jobsvd::O: return rocblas_svect_overwrite;
-        case oneapi::mkl::jobsvd::S: return rocblas_svect_singular;
-        default: throw "Wrong jobsvd.";
-    }
-}
-
-inline rocblas_operation get_rocblas_operation(oneapi::mkl::transpose trn) {
-    switch (trn) {
-        case oneapi::mkl::transpose::nontrans: return rocblas_operation_none;
-        case oneapi::mkl::transpose::trans: return rocblas_operation_transpose;
-        case oneapi::mkl::transpose::conjtrans: return rocblas_operation_conjugate_transpose;
-        default: throw "Wrong transpose Operation.";
-    }
-}
-
-inline rocblas_fill get_rocblas_fill_mode(oneapi::mkl::uplo ul) {
-    switch (ul) {
-        case oneapi::mkl::uplo::upper: return rocblas_fill_upper;
-        case oneapi::mkl::uplo::lower: return rocblas_fill_lower;
-        default: throw "Wrong fill mode.";
-    }
-}
-
-inline rocblas_side get_rocblas_side_mode(oneapi::mkl::side lr) {
-    switch (lr) {
-        case oneapi::mkl::side::left: return rocblas_side_left;
-        case oneapi::mkl::side::right: return rocblas_side_right;
-        default: throw "Wrong side mode.";
-    }
-}
-
-inline rocblas_storev get_rocblas_generate(oneapi::mkl::generate qp) {
-    switch (qp) {
-        case oneapi::mkl::generate::Q: return rocblas_column_wise;
-        case oneapi::mkl::generate::P: return rocblas_row_wise;
-        default: throw "Wrong generate.";
-    }
-}
-
-/*converting std::complex<T> to cu<T>Complex*/
-/*converting sycl::half to __half*/
-template <typename T>
-struct RocmEquivalentType {
-    using Type = T;
-};
-template <>
-struct RocmEquivalentType<sycl::half> {
-    using Type = rocblas_half;
-};
-template <>
-struct RocmEquivalentType<std::complex<float>> {
-    using Type = rocblas_float_complex;
-};
-template <>
-struct RocmEquivalentType<std::complex<double>> {
-    using Type = rocblas_double_complex;
-};
-
-/* devinfo */
-
-inline int get_rocsolver_devinfo(sycl::queue &queue, sycl::buffer<int> &devInfo) {
-    sycl::host_accessor<int, 1, sycl::access::mode::read> dev_info_{ devInfo };
-    return dev_info_[0];
-}
-
-inline int get_rocsolver_devinfo(sycl::queue &queue, const int *devInfo) {
-    int dev_info_;
-    queue.memcpy(&dev_info_, devInfo, sizeof(int));
-    queue.wait();
-    return dev_info_;
-}
-
-template <typename DEVINFO_T>
-inline void lapack_info_check(sycl::queue &queue, DEVINFO_T devinfo, const char *func_name,
-                              const char *cufunc_name) {
-    queue.wait();
-    const int devinfo_ = get_rocsolver_devinfo(queue, devinfo);
-    if (devinfo_ > 0)
-        throw oneapi::mkl::lapack::computation_error(
-            func_name, std::string(cufunc_name) + " failed with info = " + std::to_string(devinfo_),
-            devinfo_);
-}
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-#endif // _ROCSOLVER_HELPER_HPP_
diff --git a/src/lapack/backends/rocsolver/rocsolver_lapack.cpp b/src/lapack/backends/rocsolver/rocsolver_lapack.cpp
deleted file mode 100644
index e5e634ad0..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_lapack.cpp
+++ /dev/null
@@ -1,2807 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "rocsolver_helper.hpp"
-#include "rocsolver_task.hpp"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-// BUFFER APIs
-
-template <typename Func, typename T_A, typename T_B>
-inline void gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T_A> &a, std::int64_t lda, sycl::buffer<T_B> &d,
-                  sycl::buffer<T_B> &e, sycl::buffer<T_A> &tauq, sycl::buffer<T_A> &taup,
-                  sycl::buffer<T_A> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d_acc = d.template get_access<sycl::access::mode::write>(cgh);
-        auto e_acc = e.template get_access<sycl::access::mode::write>(cgh);
-        auto tauq_acc = tauq.template get_access<sycl::access::mode::write>(cgh);
-        auto taup_acc = taup.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType_A *>(a_acc);
-            auto d_ = sc.get_mem<rocmDataType_B *>(d_acc);
-            auto e_ = sc.get_mem<rocmDataType_B *>(e_acc);
-            auto tauq_ = sc.get_mem<rocmDataType_A *>(tauq_acc);
-            auto taup_ = sc.get_mem<rocmDataType_A *>(taup_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_,
-                                        taup_);
-        });
-    });
-}
-
-#define GEBRD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                   \
-    void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE_A> &a, \
-               std::int64_t lda, sycl::buffer<TYPE_B> &d, sycl::buffer<TYPE_B> &e,          \
-               sycl::buffer<TYPE_A> &tauq, sycl::buffer<TYPE_A> &taup,                      \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {            \
-        gebrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup, \
-              scratchpad, scratchpad_size);                                                 \
-    }
-
-GEBRD_LAUNCHER(float, float, rocsolver_sgebrd)
-GEBRD_LAUNCHER(double, double, rocsolver_dgebrd)
-GEBRD_LAUNCHER(std::complex<float>, float, rocsolver_cgebrd)
-GEBRD_LAUNCHER(std::complex<double>, double, rocsolver_zgebrd)
-
-#undef GEBRD_LAUNCHER
-
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "gerqf");
-}
-
-template <typename Func, typename T>
-inline void geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_);
-        });
-    });
-}
-
-#define GEQRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                            \
-    void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE> &a,  \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad,  \
-               std::int64_t scratchpad_size) {                                             \
-        geqrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                            \
-    }
-
-GEQRF_LAUNCHER(float, rocsolver_sgeqrf)
-GEQRF_LAUNCHER(double, rocsolver_dgeqrf)
-GEQRF_LAUNCHER(std::complex<float>, rocsolver_cgeqrf)
-GEQRF_LAUNCHER(std::complex<double>, rocsolver_zgeqrf)
-
-#undef GEQRF_LAUNCHER
-
-template <typename Func, typename T>
-void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    // rocsolver legacy api does not accept 64-bit ints.
-    // To get around the limitation,
-    // create new buffer with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = std::min(n, m);
-    sycl::buffer<int, 1> ipiv32(sycl::range<1>{ ipiv_size });
-    sycl::buffer<int> devInfo{ 1 };
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto ipiv32_ = sc.get_mem<int *>(ipiv32_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, ipiv32_,
-                                        devInfo_);
-        });
-    });
-
-    // Copy from 32-bit buffer to 64-bit
-    queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv_acc[index] = static_cast<std::int64_t>(ipiv32_acc[index]);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define GETRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                    \
-    void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<TYPE> &a,          \
-               std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<TYPE> &scratchpad, \
-               std::int64_t scratchpad_size) {                                                     \
-        getrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad,        \
-              scratchpad_size);                                                                    \
-    }
-
-GETRF_LAUNCHER(float, rocsolver_sgetrf)
-GETRF_LAUNCHER(double, rocsolver_dgetrf)
-GETRF_LAUNCHER(std::complex<float>, rocsolver_cgetrf)
-GETRF_LAUNCHER(std::complex<double>, rocsolver_zgetrf)
-
-#undef GETRF_LAUNCHER
-
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri");
-}
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri");
-}
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri");
-}
-void getri(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "getri");
-}
-
-template <typename Func, typename T>
-inline void getrs(const char *func_name, Func func, sycl::queue &queue,
-                  oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                  sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                  sycl::buffer<T> &b, std::int64_t ldb, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb);
-
-    // rocsolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer and convert 64-bit values.
-    std::uint64_t ipiv_size = ipiv.size();
-    sycl::buffer<int, 1> ipiv32(sycl::range<1>{ ipiv_size });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::read>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv32_acc[index] = static_cast<std::int32_t>(ipiv_acc[index]);
-        });
-    });
-
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto ipiv_ = sc.get_mem<std::int32_t *>(ipiv_acc);
-            auto b_ = sc.get_mem<rocmDataType *>(b_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_operation(trans),
-                                        n, nrhs, a_, lda, ipiv_, b_, ldb);
-        });
-    });
-}
-
-#define GETRS_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,                  \
-               std::int64_t nrhs, sycl::buffer<TYPE> &a, std::int64_t lda,                        \
-               sycl::buffer<std::int64_t> &ipiv, sycl::buffer<TYPE> &b, std::int64_t ldb,         \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                    \
-        getrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, ipiv, b, ldb, \
-              scratchpad, scratchpad_size);                                                       \
-    }
-
-GETRS_LAUNCHER(float, rocsolver_sgetrs)
-GETRS_LAUNCHER(double, rocsolver_dgetrs)
-GETRS_LAUNCHER(std::complex<float>, rocsolver_cgetrs)
-GETRS_LAUNCHER(std::complex<double>, rocsolver_zgetrs)
-
-#undef GETRS_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void gesvd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer<T_A> &a,
-                  std::int64_t lda, sycl::buffer<T_B> &s, sycl::buffer<T_A> &u, std::int64_t ldu,
-                  sycl::buffer<T_A> &vt, std::int64_t ldvt, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(n, m, lda, ldu, ldvt, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto s_acc = s.template get_access<sycl::access::mode::write>(cgh);
-        auto u_acc = u.template get_access<sycl::access::mode::write>(cgh);
-        auto vt_acc = vt.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType_A *>(a_acc);
-            auto s_ = sc.get_mem<rocmDataType_B *>(s_acc);
-            auto u_ = sc.get_mem<rocmDataType_A *>(u_acc);
-            auto vt_ = sc.get_mem<rocmDataType_A *>(vt_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<rocmDataType_B *>(scratch_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_jobsvd(jobu),
-                                        get_rocsolver_jobsvd(jobvt), m, n, a_, lda, s_, u_, ldu,
-                                        vt_, ldvt, scratch_, rocblas_workmode::rocblas_outofplace,
-                                        devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define GESVD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                         \
-    void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,           \
-               std::int64_t m, std::int64_t n, sycl::buffer<TYPE_A> &a, std::int64_t lda,         \
-               sycl::buffer<TYPE_B> &s, sycl::buffer<TYPE_A> &u, std::int64_t ldu,                \
-               sycl::buffer<TYPE_A> &vt, std::int64_t ldvt, sycl::buffer<TYPE_A> &scratchpad,     \
-               std::int64_t scratchpad_size) {                                                    \
-        gesvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, u, ldu, \
-              vt, ldvt, scratchpad, scratchpad_size);                                             \
-    }
-
-GESVD_LAUNCHER(float, float, rocsolver_sgesvd)
-GESVD_LAUNCHER(double, double, rocsolver_dgesvd)
-GESVD_LAUNCHER(std::complex<float>, float, rocsolver_cgesvd)
-GESVD_LAUNCHER(std::complex<double>, double, rocsolver_zgesvd)
-
-#undef GESVD_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void heevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<T_A> &a, std::int64_t lda,
-                  sycl::buffer<T_B> &w, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType_A *>(a_acc);
-            auto w_ = sc.get_mem<rocmDataType_B *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<rocmDataType_B *>(scratch_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_job(jobz),
-                                        get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                        devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define HEEVD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                         \
-    void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \
-               sycl::buffer<TYPE_A> &a, std::int64_t lda, sycl::buffer<TYPE_B> &w,                \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {                  \
-        heevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad, \
-              scratchpad_size);                                                                   \
-    }
-
-HEEVD_LAUNCHER(std::complex<float>, float, rocsolver_cheevd)
-HEEVD_LAUNCHER(std::complex<double>, double, rocsolver_zheevd)
-
-#undef HEEVD_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  sycl::buffer<T_A> &a, std::int64_t lda, sycl::buffer<T_A> &b, std::int64_t ldb,
-                  sycl::buffer<T_B> &w, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType_A *>(a_acc);
-            auto b_ = sc.get_mem<rocmDataType_A *>(b_acc);
-            auto w_ = sc.get_mem<rocmDataType_B *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<rocmDataType_B *>(scratch_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype),
-                                        get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_,
-                                        lda, b_, ldb, w_, scratch_, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define HEGVD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                         \
-    void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,                     \
-               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE_A> &a, std::int64_t lda, \
-               sycl::buffer<TYPE_A> &b, std::int64_t ldb, sycl::buffer<TYPE_B> &w,                \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {                  \
-        hegvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, ldb, \
-              w, scratchpad, scratchpad_size);                                                    \
-    }
-
-HEGVD_LAUNCHER(std::complex<float>, float, rocsolver_chegvd)
-HEGVD_LAUNCHER(std::complex<double>, double, rocsolver_zhegvd)
-
-#undef HEGVD_LAUNCHER
-
-template <typename Func, typename T_A, typename T_B>
-inline void hetrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T_A> &a, std::int64_t lda, sycl::buffer<T_B> &d,
-                  sycl::buffer<T_B> &e, sycl::buffer<T_A> &tau, sycl::buffer<T_A> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d_acc = d.template get_access<sycl::access::mode::write>(cgh);
-        auto e_acc = e.template get_access<sycl::access::mode::write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType_A *>(a_acc);
-            auto d_ = sc.get_mem<rocmDataType_B *>(d_acc);
-            auto e_ = sc.get_mem<rocmDataType_B *>(e_acc);
-            auto tau_ = sc.get_mem<rocmDataType_A *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, d_, e_, tau_);
-        });
-    });
-}
-
-#define HETRD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                               \
-    void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,              \
-               sycl::buffer<TYPE_A> &a, std::int64_t lda, sycl::buffer<TYPE_B> &d,      \
-               sycl::buffer<TYPE_B> &e, sycl::buffer<TYPE_A> &tau,                      \
-               sycl::buffer<TYPE_A> &scratchpad, std::int64_t scratchpad_size) {        \
-        hetrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \
-              scratchpad, scratchpad_size);                                             \
-    }
-
-HETRD_LAUNCHER(std::complex<float>, float, rocsolver_chetrd)
-HETRD_LAUNCHER(std::complex<double>, double, rocsolver_zhetrd)
-
-#undef HETRD_LAUNCHER
-
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "hetrf");
-}
-void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "hetrf");
-}
-
-template <typename Func, typename T>
-inline void orgbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T> &a,
-                  std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m,
-                                        n, k, a_, lda, tau_);
-        });
-    });
-}
-
-#define ORGBR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                    \
-    void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,      \
-               std::int64_t k, sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,   \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                     \
-        orgbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                                    \
-    }
-
-ORGBR_LAUNCHER(float, rocsolver_sorgbr)
-ORGBR_LAUNCHER(double, rocsolver_dorgbr)
-
-#undef ORGBR_LAUNCHER
-
-template <typename Func, typename T>
-inline void orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_);
-        });
-    });
-}
-
-#define ORGQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                               \
-    void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,            \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,              \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                \
-        orgqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                               \
-    }
-
-ORGQR_LAUNCHER(float, rocsolver_sorgqr)
-ORGQR_LAUNCHER(double, rocsolver_dorgqr)
-
-#undef ORGQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void orgtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, tau_);
-        });
-    });
-}
-
-#define ORGTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad,         \
-               std::int64_t scratchpad_size) {                                                    \
-        orgtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad,     \
-              scratchpad_size);                                                                   \
-    }
-
-ORGTR_LAUNCHER(float, rocsolver_sorgtr)
-ORGTR_LAUNCHER(double, rocsolver_dorgtr)
-
-#undef ORGTR_LAUNCHER
-
-template <typename Func, typename T>
-inline void ormtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &c, std::int64_t ldc, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read_write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            auto c_ = sc.get_mem<rocmDataType *>(c_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_fill_mode(uplo), get_rocblas_operation(trans),
-                                        m, n, a_, lda, tau_, c_, ldc);
-        });
-    });
-}
-
-#define ORMTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,                \
-               oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,                      \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,                  \
-               sycl::buffer<TYPE> &c, std::int64_t ldc, sycl::buffer<TYPE> &scratchpad,           \
-               std::int64_t scratchpad_size) {                                                    \
-        ormtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, \
-              c, ldc, scratchpad, scratchpad_size);                                               \
-    }
-
-ORMTR_LAUNCHER(float, rocsolver_sormtr)
-ORMTR_LAUNCHER(double, rocsolver_dormtr)
-
-#undef ORMTR_LAUNCHER
-
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &tau, sycl::buffer<float> &c, std::int64_t ldc,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "ormrq");
-}
-void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<double> &tau, sycl::buffer<double> &c, std::int64_t ldc,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "ormrq");
-}
-
-template <typename Func, typename T>
-inline void ormqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &c,
-                  std::int64_t ldc, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldc, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::read>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            auto c_ = sc.get_mem<rocmDataType *>(c_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_,
-                                        ldc);
-        });
-    });
-}
-
-#define ORMQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                    \
-    void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,           \
-               std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<TYPE> &a,              \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &c, std::int64_t ldc, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                     \
-        ormqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c,  \
-              ldc, scratchpad, scratchpad_size);                                                   \
-    }
-
-ORMQR_LAUNCHER(float, rocsolver_sormqr)
-ORMQR_LAUNCHER(double, rocsolver_dormqr)
-
-#undef ORMQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void potrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define POTRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {  \
-        potrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad,          \
-              scratchpad_size);                                                                   \
-    }
-
-POTRF_LAUNCHER(float, rocsolver_spotrf)
-POTRF_LAUNCHER(double, rocsolver_dpotrf)
-POTRF_LAUNCHER(std::complex<float>, rocsolver_cpotrf)
-POTRF_LAUNCHER(std::complex<double>, rocsolver_zpotrf)
-
-#undef POTRF_LAUNCHER
-
-template <typename Func, typename T>
-inline void potri(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define POTRI_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {  \
-        potri(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad,          \
-              scratchpad_size);                                                                   \
-    }
-
-POTRI_LAUNCHER(float, rocsolver_spotri)
-POTRI_LAUNCHER(double, rocsolver_dpotri)
-POTRI_LAUNCHER(std::complex<float>, rocsolver_cpotri)
-POTRI_LAUNCHER(std::complex<double>, rocsolver_zpotri)
-
-#undef POTRI_LAUNCHER
-
-template <typename Func, typename T>
-inline void potrs(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t nrhs, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &b, std::int64_t ldb, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocmDataType *>(b_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, nrhs, a_, lda, b_, ldb);
-        });
-    });
-}
-
-#define POTRS_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                  \
-    void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,    \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &b, std::int64_t ldb, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                   \
-        potrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb,       \
-              scratchpad, scratchpad_size);                                                      \
-    }
-
-POTRS_LAUNCHER(float, rocsolver_spotrs)
-POTRS_LAUNCHER(double, rocsolver_dpotrs)
-POTRS_LAUNCHER(std::complex<float>, rocsolver_cpotrs)
-POTRS_LAUNCHER(std::complex<double>, rocsolver_zpotrs)
-
-#undef POTRS_LAUNCHER
-
-template <typename Func, typename T>
-inline void syevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &w, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto w_ = sc.get_mem<rocmDataType *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<rocmDataType *>(scratch_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_job(jobz),
-                                        get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                        devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define SYEVD_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &w,                    \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                    \
-        syevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad, \
-              scratchpad_size);                                                                   \
-    }
-
-SYEVD_LAUNCHER(float, rocsolver_ssyevd)
-SYEVD_LAUNCHER(double, rocsolver_dsyevd)
-
-#undef SYEVD_LAUNCHER
-
-template <typename Func, typename T>
-inline void sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<T> &a,
-                  std::int64_t lda, sycl::buffer<T> &b, std::int64_t ldb, sycl::buffer<T> &w,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto b_acc = b.template get_access<sycl::access::mode::read_write>(cgh);
-        auto w_acc = w.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        auto scratch_acc = scratchpad.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto b_ = sc.get_mem<rocmDataType *>(b_acc);
-            auto w_ = sc.get_mem<rocmDataType *>(w_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            auto scratch_ = sc.get_mem<rocmDataType *>(scratch_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype),
-                                        get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_,
-                                        lda, b_, ldb, w_, scratch_, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define SYGVD_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,                     \
-               oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, std::int64_t lda,   \
-               sycl::buffer<TYPE> &b, std::int64_t ldb, sycl::buffer<TYPE> &w,                    \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                    \
-        sygvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, ldb, \
-              w, scratchpad, scratchpad_size);                                                    \
-    }
-
-SYGVD_LAUNCHER(float, rocsolver_ssygvd)
-SYGVD_LAUNCHER(double, rocsolver_dsygvd)
-
-#undef SYGVD_LAUNCH
-
-template <typename Func, typename T>
-inline void sytrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &d,
-                  sycl::buffer<T> &e, sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto d_acc = d.template get_access<sycl::access::mode::write>(cgh);
-        auto e_acc = e.template get_access<sycl::access::mode::write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto d_ = sc.get_mem<rocmDataType *>(d_acc);
-            auto e_ = sc.get_mem<rocmDataType *>(e_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, d_, e_, tau_);
-        });
-    });
-}
-
-#define SYTRD_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &d, sycl::buffer<TYPE> &e,                    \
-               sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad,                           \
-               std::int64_t scratchpad_size) {                                                    \
-        sytrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau,           \
-              scratchpad, scratchpad_size);                                                       \
-    }
-
-SYTRD_LAUNCHER(float, rocsolver_ssytrd)
-SYTRD_LAUNCHER(double, rocsolver_dsytrd)
-
-#undef SYTRD_LAUNCHER
-
-template <typename Func, typename T>
-inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<std::int64_t> &ipiv, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    sycl::buffer<int> devInfo{ 1 };
-
-    // rocsolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = n;
-    sycl::buffer<int, 1> ipiv32(sycl::range<1>{ ipiv_size });
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::write>(cgh);
-        auto devInfo_acc = devInfo.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto ipiv32_ = sc.get_mem<int *>(ipiv32_acc);
-            auto devInfo_ = sc.get_mem<int *>(devInfo_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, ipiv32_, devInfo_);
-        });
-    });
-
-    // Copy from 32-bit buffer to 64-bit
-    queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        auto ipiv32_acc = ipiv32.template get_access<sycl::access::mode::read>(cgh);
-        auto ipiv_acc = ipiv.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv_acc[index] = static_cast<std::int64_t>(ipiv32_acc[index]);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-}
-
-#define SYTRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                    \
-    void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a,  \
-               std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<TYPE> &scratchpad, \
-               std::int64_t scratchpad_size) {                                                     \
-        sytrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv, scratchpad,     \
-              scratchpad_size);                                                                    \
-    }
-
-SYTRF_LAUNCHER(float, rocsolver_ssytrf)
-SYTRF_LAUNCHER(double, rocsolver_dsytrf)
-SYTRF_LAUNCHER(std::complex<float>, rocsolver_csytrf)
-SYTRF_LAUNCHER(std::complex<double>, rocsolver_zsytrf)
-
-#undef SYTRF_LAUNCHER
-
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-           oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "trtrs");
-}
-
-template <typename Func, typename T>
-inline void ungbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<T> &a,
-                  std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m,
-                                        n, k, a_, lda, tau_);
-        });
-    });
-}
-
-#define UNGBR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                    \
-    void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n,      \
-               std::int64_t k, sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,   \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                     \
-        ungbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                                    \
-    }
-
-UNGBR_LAUNCHER(std::complex<float>, rocsolver_cungbr)
-UNGBR_LAUNCHER(std::complex<double>, rocsolver_zungbr)
-
-#undef UNGBR_LAUNCHER
-
-template <typename Func, typename T>
-inline void ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                  std::int64_t n, std::int64_t k, sycl::buffer<T> &a, std::int64_t lda,
-                  sycl::buffer<T> &tau, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_);
-        });
-    });
-}
-
-#define UNGQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                               \
-    void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,            \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,              \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                \
-        ungqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \
-              scratchpad_size);                                                               \
-    }
-
-UNGQR_LAUNCHER(std::complex<float>, rocsolver_cungqr)
-UNGQR_LAUNCHER(std::complex<double>, rocsolver_zungqr)
-
-#undef UNGQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void ungtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, tau_);
-        });
-    });
-}
-
-#define UNGTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<TYPE> &a, \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &scratchpad,         \
-               std::int64_t scratchpad_size) {                                                    \
-        ungtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad,     \
-              scratchpad_size);                                                                   \
-    }
-
-UNGTR_LAUNCHER(std::complex<float>, rocsolver_cungtr)
-UNGTR_LAUNCHER(std::complex<double>, rocsolver_zungtr)
-
-#undef UNGTR_LAUNCHER
-
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "unmrq");
-}
-void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-           std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    throw unimplemented("lapack", "unmrq");
-}
-
-template <typename Func, typename T>
-inline void unmqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau, sycl::buffer<T> &c,
-                  std::int64_t ldc, sycl::buffer<T> &scratchpad, std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            auto c_ = sc.get_mem<rocmDataType *>(c_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_,
-                                        ldc);
-        });
-    });
-}
-
-#define UNMQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                    \
-    void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,           \
-               std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<TYPE> &a,              \
-               std::int64_t lda, sycl::buffer<TYPE> &tau, sycl::buffer<TYPE> &c, std::int64_t ldc, \
-               sycl::buffer<TYPE> &scratchpad, std::int64_t scratchpad_size) {                     \
-        unmqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c,  \
-              ldc, scratchpad, scratchpad_size);                                                   \
-    }
-
-UNMQR_LAUNCHER(std::complex<float>, rocsolver_cunmqr)
-UNMQR_LAUNCHER(std::complex<double>, rocsolver_zunmqr)
-
-#undef UNMQR_LAUNCHER
-
-template <typename Func, typename T>
-inline void unmtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, sycl::buffer<T> &a, std::int64_t lda, sycl::buffer<T> &tau,
-                  sycl::buffer<T> &c, std::int64_t ldc, sycl::buffer<T> &scratchpad,
-                  std::int64_t scratchpad_size) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    queue.submit([&](sycl::handler &cgh) {
-        auto a_acc = a.template get_access<sycl::access::mode::read_write>(cgh);
-        auto tau_acc = tau.template get_access<sycl::access::mode::write>(cgh);
-        auto c_acc = c.template get_access<sycl::access::mode::read_write>(cgh);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = sc.get_mem<rocmDataType *>(a_acc);
-            auto tau_ = sc.get_mem<rocmDataType *>(tau_acc);
-            auto c_ = sc.get_mem<rocmDataType *>(c_acc);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_fill_mode(uplo), get_rocblas_operation(trans),
-                                        m, n, a_, lda, tau_, c_, ldc);
-        });
-    });
-}
-
-#define UNMTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE)                                                   \
-    void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,                \
-               oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,                      \
-               sycl::buffer<TYPE> &a, std::int64_t lda, sycl::buffer<TYPE> &tau,                  \
-               sycl::buffer<TYPE> &c, std::int64_t ldc, sycl::buffer<TYPE> &scratchpad,           \
-               std::int64_t scratchpad_size) {                                                    \
-        unmtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, \
-              c, ldc, scratchpad, scratchpad_size);                                               \
-    }
-
-UNMTR_LAUNCHER(std::complex<float>, rocsolver_cunmtr)
-UNMTR_LAUNCHER(std::complex<double>, rocsolver_zunmtr)
-
-#undef UNMTR_LAUNCHER
-
-// USM APIs
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, T_A *a, std::int64_t lda, T_B *d, T_B *e, T_A *tauq,
-                         T_A *taup, T_A *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType_A *>(a);
-            auto d_ = reinterpret_cast<rocmDataType_B *>(d);
-            auto e_ = reinterpret_cast<rocmDataType_B *>(e);
-            auto tauq_ = reinterpret_cast<rocmDataType_A *>(tauq);
-            auto taup_ = reinterpret_cast<rocmDataType_A *>(taup);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_,
-                                        taup_);
-        });
-    });
-    return done;
-}
-
-#define GEBRD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                      \
-    sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE_A *a,               \
-                      std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tauq, TYPE_A *taup,          \
-                      TYPE_A *scratchpad, std::int64_t scratchpad_size,                            \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return gebrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup, \
-                     scratchpad, scratchpad_size, dependencies);                                   \
-    }
-
-GEBRD_LAUNCHER_USM(float, float, rocsolver_sgebrd)
-GEBRD_LAUNCHER_USM(double, double, rocsolver_dgebrd)
-GEBRD_LAUNCHER_USM(std::complex<float>, float, rocsolver_cgebrd)
-GEBRD_LAUNCHER_USM(std::complex<double>, double, rocsolver_zgebrd)
-
-#undef GEBRD_LAUNCHER_USM
-
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                  float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                  double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "gerqf");
-}
-
-template <typename Func, typename T>
-inline sycl::event geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_);
-        });
-    });
-    return done;
-}
-
-#define GEQRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                                \
-    sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a,                 \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return geqrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad,  \
-                     scratchpad_size, dependencies);                                               \
-    }
-
-GEQRF_LAUNCHER_USM(float, rocsolver_sgeqrf)
-GEQRF_LAUNCHER_USM(double, rocsolver_dgeqrf)
-GEQRF_LAUNCHER_USM(std::complex<float>, rocsolver_cgeqrf)
-GEQRF_LAUNCHER_USM(std::complex<double>, rocsolver_zgeqrf)
-
-#undef GEQRF_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, scratchpad_size);
-
-    // rocsolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Allocate memory with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = std::min(n, m);
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, ipiv_,
-                                        devInfo_);
-        });
-    });
-
-    // Copy from 32-bit USM to 64-bit
-    auto done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv[index] = static_cast<std::int64_t>(ipiv32[index]);
-        });
-    });
-
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(ipiv32, queue);
-    free(devInfo, queue);
-    return done_casting;
-}
-
-#define GETRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                                \
-    sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a,                 \
-                      std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad,                      \
-                      std::int64_t scratchpad_size,                                                \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return getrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad, \
-                     scratchpad_size, dependencies);                                               \
-    }
-
-GETRF_LAUNCHER_USM(float, rocsolver_sgetrf)
-GETRF_LAUNCHER_USM(double, rocsolver_dgetrf)
-GETRF_LAUNCHER_USM(std::complex<float>, rocsolver_cgetrf)
-GETRF_LAUNCHER_USM(std::complex<double>, rocsolver_zgetrf)
-
-#undef GETRF_LAUNCHER_USM
-
-sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri");
-}
-sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                  std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri");
-}
-sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                  std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri");
-}
-sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "getri");
-}
-
-template <typename Func, typename T>
-inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T *a,
-                         std::int64_t lda, std::int64_t *ipiv, T *b, std::int64_t ldb,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb, scratchpad_size);
-
-    // rocsolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Create new buffer and convert 64-bit values.
-    std::uint64_t ipiv_size = n;
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-
-    auto done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv32[index] = static_cast<std::int32_t>(ipiv[index]);
-        });
-    });
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        cgh.depends_on(done_casting);
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            auto b_ = reinterpret_cast<rocmDataType *>(b);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_operation(trans),
-                                        n, nrhs, a_, lda, ipiv_, b_, ldb);
-        });
-    });
-
-    queue.wait();
-
-    free(ipiv32, queue);
-
-    return done;
-}
-
-#define GETRS_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                              \
-    sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,          \
-                      std::int64_t nrhs, TYPE *a, std::int64_t lda, std::int64_t *ipiv, TYPE *b, \
-                      std::int64_t ldb, TYPE *scratchpad, std::int64_t scratchpad_size,          \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return getrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, ipiv, \
-                     b, ldb, scratchpad, scratchpad_size, dependencies);                         \
-    }
-
-GETRS_LAUNCHER_USM(float, rocsolver_sgetrs)
-GETRS_LAUNCHER_USM(double, rocsolver_dgetrs)
-GETRS_LAUNCHER_USM(std::complex<float>, rocsolver_cgetrs)
-GETRS_LAUNCHER_USM(std::complex<double>, rocsolver_zgetrs)
-
-#undef GETRS_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event gesvd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                         std::int64_t n, T_A *a, std::int64_t lda, T_B *s, T_A *u, std::int64_t ldu,
-                         T_A *vt, std::int64_t ldvt, T_A *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(m, n, lda, ldu, ldvt, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType_A *>(a);
-            auto s_ = reinterpret_cast<rocmDataType_B *>(s);
-            auto u_ = reinterpret_cast<rocmDataType_A *>(u);
-            auto vt_ = reinterpret_cast<rocmDataType_A *>(vt);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<rocmDataType_B *>(scratchpad);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_jobsvd(jobu),
-                                        get_rocsolver_jobsvd(jobvt), m, n, a_, lda, s_, u_, ldu,
-                                        vt_, ldvt, scratch_, rocblas_workmode::rocblas_outofplace,
-                                        devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define GESVD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                    \
-    sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,   \
-                      std::int64_t m, std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *s,    \
-                      TYPE_A *u, std::int64_t ldu, TYPE_A *vt, std::int64_t ldvt,                \
-                      TYPE_A *scratchpad, std::int64_t scratchpad_size,                          \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return gesvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, \
-                     u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies);               \
-    }
-
-GESVD_LAUNCHER_USM(float, float, rocsolver_sgesvd)
-GESVD_LAUNCHER_USM(double, double, rocsolver_dgesvd)
-GESVD_LAUNCHER_USM(std::complex<float>, float, rocsolver_cgesvd)
-GESVD_LAUNCHER_USM(std::complex<double>, double, rocsolver_zgesvd)
-
-#undef GESVD_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event heevd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a,
-                         std::int64_t lda, T_B *&w, T_A *&scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType_A *>(a);
-            auto w_ = reinterpret_cast<rocmDataType_B *>(w);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<rocmDataType_B *>(scratchpad);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_job(jobz),
-                                        get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                        devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define HEEVD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                     \
-    sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,          \
-                      std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *w, TYPE_A *scratchpad, \
-                      std::int64_t scratchpad_size,                                               \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return heevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w,      \
-                     scratchpad, scratchpad_size, dependencies);                                  \
-    }
-
-HEEVD_LAUNCHER_USM(std::complex<float>, float, rocsolver_cheevd)
-HEEVD_LAUNCHER_USM(std::complex<double>, double, rocsolver_zheevd)
-
-#undef HEEVD_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a,
-                         std::int64_t lda, T_A *&b, std::int64_t ldb, T_B *&w, T_A *&scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType_A *>(a);
-            auto b_ = reinterpret_cast<rocmDataType_A *>(b);
-            auto w_ = reinterpret_cast<rocmDataType_B *>(w);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<rocmDataType_B *>(scratchpad);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype),
-                                        get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_,
-                                        lda, b_, ldb, w_, scratch_, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define HEGVD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                    \
-    sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,             \
-                      oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A *a, std::int64_t lda,       \
-                      TYPE_A *b, std::int64_t ldb, TYPE_B *w, TYPE_A *scratchpad,                \
-                      std::int64_t scratchpad_size,                                              \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return hegvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, \
-                     b, ldb, w, scratchpad, scratchpad_size, dependencies);                      \
-    }
-
-HEGVD_LAUNCHER_USM(std::complex<float>, float, rocsolver_chegvd)
-HEGVD_LAUNCHER_USM(std::complex<double>, double, rocsolver_zhegvd)
-
-#undef HEGVD_LAUNCHER_USM
-
-template <typename Func, typename T_A, typename T_B>
-inline sycl::event hetrd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T_A *a, std::int64_t lda, T_B *d,
-                         T_B *e, T_A *tau, T_A *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType_A = typename RocmEquivalentType<T_A>::Type;
-    using rocmDataType_B = typename RocmEquivalentType<T_B>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType_A *>(a);
-            auto d_ = reinterpret_cast<rocmDataType_B *>(d);
-            auto e_ = reinterpret_cast<rocmDataType_B *>(e);
-            auto tau_ = reinterpret_cast<rocmDataType_A *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, d_, e_, tau_);
-        });
-    });
-    return done;
-}
-
-#define HETRD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE)                                  \
-    sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A *a,   \
-                      std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tau, TYPE_A *scratchpad, \
-                      std::int64_t scratchpad_size,                                            \
-                      const std::vector<sycl::event> &dependencies) {                          \
-        return hetrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \
-                     scratchpad, scratchpad_size, dependencies);                               \
-    }
-
-HETRD_LAUNCHER_USM(std::complex<float>, float, rocsolver_chetrd)
-HETRD_LAUNCHER_USM(std::complex<double>, double, rocsolver_zhetrd)
-
-#undef HETRD_LAUNCHER_USM
-
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "hetrf");
-}
-sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "hetrf");
-}
-
-template <typename Func, typename T>
-inline sycl::event orgbr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         T *a, std::int64_t lda, T *tau, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m,
-                                        n, k, a_, lda, tau_);
-        });
-    });
-    return done;
-}
-
-#define ORGBR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                           \
-    sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,          \
-                      std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau,   \
-                      TYPE *scratchpad, std::int64_t scratchpad_size,                         \
-                      const std::vector<sycl::event> &dependencies) {                         \
-        return orgbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \
-                     scratchpad, scratchpad_size, dependencies);                              \
-    }
-
-ORGBR_LAUNCHER_USM(float, rocsolver_sorgbr)
-ORGBR_LAUNCHER_USM(double, rocsolver_dorgbr)
-
-#undef ORGBR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_);
-        });
-    });
-    return done;
-}
-
-#define ORGQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                                \
-    sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return orgqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau,           \
-                     scratchpad, scratchpad_size, dependencies);                                   \
-    }
-
-ORGQR_LAUNCHER_USM(float, rocsolver_sorgqr)
-ORGQR_LAUNCHER_USM(double, rocsolver_dorgqr)
-
-#undef ORGQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event orgtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, tau_);
-        });
-    });
-    return done;
-}
-
-#define ORGTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                                \
-    sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,         \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return orgtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau,           \
-                     scratchpad, scratchpad_size, dependencies);                                   \
-    }
-
-ORGTR_LAUNCHER_USM(float, rocsolver_sorgtr)
-ORGTR_LAUNCHER_USM(double, rocsolver_dorgtr)
-
-#undef ORGTR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ormtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a,
-                         std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            auto c_ = reinterpret_cast<rocmDataType *>(c);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_fill_mode(uplo), get_rocblas_operation(trans),
-                                        m, n, a_, lda, tau_, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define ORMTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                             \
-    sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,       \
-                      oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a,    \
-                      std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \
-                      std::int64_t scratchpad_size,                                             \
-                      const std::vector<sycl::event> &dependencies) {                           \
-        return ormtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a,  \
-                     lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies);              \
-    }
-
-ORMTR_LAUNCHER_USM(float, rocsolver_sormtr)
-ORMTR_LAUNCHER_USM(double, rocsolver_dormtr)
-
-#undef ORMTR_LAUNCHER_USM
-
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                  float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ormrq");
-}
-sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                  double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "ormrq");
-}
-
-template <typename Func, typename T>
-inline sycl::event ormqr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c,
-                         std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, ldc, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            auto c_ = reinterpret_cast<rocmDataType *>(c);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_,
-                                        ldc);
-        });
-    });
-    return done;
-}
-
-#define ORMQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                              \
-    sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,  \
-                      std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \
-                      TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad,                    \
-                      std::int64_t scratchpad_size,                                              \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return ormqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, \
-                     tau, c, ldc, scratchpad, scratchpad_size, dependencies);                    \
-    }
-
-ORMQR_LAUNCHER_USM(float, rocsolver_sormqr)
-ORMQR_LAUNCHER_USM(double, rocsolver_dormqr)
-
-#undef ORMQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potrf(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define POTRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                             \
-    sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,      \
-                      std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size,         \
-                      const std::vector<sycl::event> &dependencies) {                           \
-        return potrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \
-                     scratchpad_size, dependencies);                                            \
-    }
-
-POTRF_LAUNCHER_USM(float, rocsolver_spotrf)
-POTRF_LAUNCHER_USM(double, rocsolver_dpotrf)
-POTRF_LAUNCHER_USM(std::complex<float>, rocsolver_cpotrf)
-POTRF_LAUNCHER_USM(std::complex<double>, rocsolver_zpotrf)
-
-#undef POTRF_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potri(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto scratch_ = reinterpret_cast<rocmDataType *>(scratchpad);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define POTRI_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                             \
-    sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,      \
-                      std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size,         \
-                      const std::vector<sycl::event> &dependencies) {                           \
-        return potri(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \
-                     scratchpad_size, dependencies);                                            \
-    }
-
-POTRI_LAUNCHER_USM(float, rocsolver_spotri)
-POTRI_LAUNCHER_USM(double, rocsolver_dpotri)
-POTRI_LAUNCHER_USM(std::complex<float>, rocsolver_cpotri)
-POTRI_LAUNCHER_USM(std::complex<double>, rocsolver_zpotri)
-
-#undef POTRI_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event potrs(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T *a,
-                         std::int64_t lda, T *b, std::int64_t ldb, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, nrhs, lda, ldb, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto b_ = reinterpret_cast<rocmDataType *>(b);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, nrhs, a_, lda, b_, ldb);
-        });
-    });
-    return done;
-}
-
-#define POTRS_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                               \
-    sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,                 \
-                      std::int64_t nrhs, TYPE *a, std::int64_t lda, TYPE *b, std::int64_t ldb,    \
-                      TYPE *scratchpad, std::int64_t scratchpad_size,                             \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return potrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb, \
-                     scratchpad, scratchpad_size, dependencies);                                  \
-    }
-
-POTRS_LAUNCHER_USM(float, rocsolver_spotrs)
-POTRS_LAUNCHER_USM(double, rocsolver_dpotrs)
-POTRS_LAUNCHER_USM(std::complex<float>, rocsolver_cpotrs)
-POTRS_LAUNCHER_USM(std::complex<double>, rocsolver_zpotrs)
-
-#undef POTRS_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event syevd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a,
-                         std::int64_t lda, T *w, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto w_ = reinterpret_cast<rocmDataType *>(w);
-            auto scratch_ = reinterpret_cast<rocmDataType *>(scratchpad);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_job(jobz),
-                                        get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_,
-                                        devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define SYEVD_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                          \
-    sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,     \
-                      std::int64_t n, TYPE *a, std::int64_t lda, TYPE *w, TYPE *scratchpad,  \
-                      std::int64_t scratchpad_size,                                          \
-                      const std::vector<sycl::event> &dependencies) {                        \
-        return syevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, \
-                     scratchpad, scratchpad_size, dependencies);                             \
-    }
-
-SYEVD_LAUNCHER_USM(float, rocsolver_ssyevd)
-SYEVD_LAUNCHER_USM(double, rocsolver_dsyevd)
-
-#undef SYEVD_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype,
-                         oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a,
-                         std::int64_t lda, T *b, std::int64_t ldb, T *w, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, ldb, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto b_ = reinterpret_cast<rocmDataType *>(b);
-            auto w_ = reinterpret_cast<rocmDataType *>(w);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            auto scratch_ = reinterpret_cast<rocmDataType *>(scratchpad);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype),
-                                        get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_,
-                                        lda, b_, ldb, w_, scratch_, devInfo_);
-        });
-    });
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(devInfo, queue);
-    return done;
-}
-
-#define SYGVD_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                               \
-    sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,              \
-                      oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, std::int64_t lda, TYPE *b, \
-                      std::int64_t ldb, TYPE *w, TYPE *scratchpad, std::int64_t scratchpad_size,  \
-                      const std::vector<sycl::event> &dependencies) {                             \
-        return sygvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda,  \
-                     b, ldb, w, scratchpad, scratchpad_size, dependencies);                       \
-    }
-
-SYGVD_LAUNCHER_USM(float, rocsolver_ssygvd)
-SYGVD_LAUNCHER_USM(double, rocsolver_dsygvd)
-
-#undef SYGVD_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sytrd(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *d, T *e,
-                         T *tau, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto d_ = reinterpret_cast<rocmDataType *>(d);
-            auto e_ = reinterpret_cast<rocmDataType *>(e);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, d_, e_, tau_);
-        });
-    });
-    return done;
-}
-
-#define SYTRD_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                            \
-    sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,     \
-                      std::int64_t lda, TYPE *d, TYPE *e, TYPE *tau, TYPE *scratchpad,         \
-                      std::int64_t scratchpad_size,                                            \
-                      const std::vector<sycl::event> &dependencies) {                          \
-        return sytrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \
-                     scratchpad, scratchpad_size, dependencies);                               \
-    }
-
-SYTRD_LAUNCHER_USM(float, rocsolver_ssytrd)
-SYTRD_LAUNCHER_USM(double, rocsolver_dsytrd)
-
-#undef SYTRD_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event sytrf(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda,
-                         std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    int *devInfo = (int *)malloc_device(sizeof(int), queue);
-
-    // rocsolver legacy api does not accept 64-bit ints.
-    // To get around the limitation.
-    // Allocate memory with 32-bit ints then copy over results
-    std::uint64_t ipiv_size = n;
-    int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue);
-
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto ipiv_ = reinterpret_cast<int *>(ipiv32);
-            auto devInfo_ = reinterpret_cast<int *>(devInfo);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, ipiv_, devInfo_);
-        });
-    });
-
-    // Copy from 32-bit USM to 64-bit
-    auto done_casting = queue.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(done);
-        cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) {
-            ipiv[index] = static_cast<std::int64_t>(ipiv32[index]);
-        });
-    });
-
-    lapack_info_check(queue, devInfo, __func__, func_name);
-    free(ipiv32, queue);
-    free(devInfo, queue);
-    return done_casting;
-}
-
-#define SYTRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                        \
-    sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \
-                      std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad,              \
-                      std::int64_t scratchpad_size,                                        \
-                      const std::vector<sycl::event> &dependencies) {                      \
-        return sytrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv,  \
-                     scratchpad, scratchpad_size, dependencies);                           \
-    }
-
-SYTRF_LAUNCHER_USM(float, rocsolver_ssytrf)
-SYTRF_LAUNCHER_USM(double, rocsolver_dsytrf)
-SYTRF_LAUNCHER_USM(std::complex<float>, rocsolver_csytrf)
-SYTRF_LAUNCHER_USM(std::complex<double>, rocsolver_zsytrf)
-
-#undef SYTRF_LAUNCHER_USM
-
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a,
-                  std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a,
-                  std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                  oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                  std::int64_t ldb, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "trtrs");
-}
-
-template <typename Func, typename T>
-inline sycl::event ungbr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k,
-                         T *a, std::int64_t lda, T *tau, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m,
-                                        n, k, a_, lda, tau_);
-        });
-    });
-    return done;
-}
-
-#define UNGBR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                           \
-    sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,          \
-                      std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau,   \
-                      TYPE *scratchpad, std::int64_t scratchpad_size,                         \
-                      const std::vector<sycl::event> &dependencies) {                         \
-        return ungbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \
-                     scratchpad, scratchpad_size, dependencies);                              \
-    }
-
-UNGBR_LAUNCHER_USM(std::complex<float>, rocsolver_cungbr)
-UNGBR_LAUNCHER_USM(std::complex<double>, rocsolver_zungbr)
-
-#undef UNGBR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, k, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_);
-        });
-    });
-    return done;
-}
-
-#define UNGQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                                \
-    sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return ungqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau,           \
-                     scratchpad, scratchpad_size, dependencies);                                   \
-    }
-
-UNGQR_LAUNCHER_USM(std::complex<float>, rocsolver_cungqr)
-UNGQR_LAUNCHER_USM(std::complex<double>, rocsolver_zungqr)
-
-#undef UNGQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event ungtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau,
-                         T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo),
-                                        n, a_, lda, tau_);
-        });
-    });
-    return done;
-}
-
-#define UNGTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                                \
-    sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a,         \
-                      std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \
-                      const std::vector<sycl::event> &dependencies) {                              \
-        return ungtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau,           \
-                     scratchpad, scratchpad_size, dependencies);                                   \
-    }
-
-UNGTR_LAUNCHER_USM(std::complex<float>, rocsolver_cungtr)
-UNGTR_LAUNCHER_USM(std::complex<double>, rocsolver_zungtr)
-
-#undef UNGTR_LAUNCHER_USM
-
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *c,
-                  std::int64_t ldc, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "unmrq");
-}
-sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *c,
-                  std::int64_t ldc, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    throw unimplemented("lapack", "unmrq");
-}
-
-template <typename Func, typename T>
-inline sycl::event unmqr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m,
-                         std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c,
-                         std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(n, lda, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            auto c_ = reinterpret_cast<rocmDataType *>(c);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_,
-                                        ldc);
-        });
-    });
-    return done;
-}
-
-#define UNMQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                              \
-    sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,  \
-                      std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \
-                      TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad,                    \
-                      std::int64_t scratchpad_size,                                              \
-                      const std::vector<sycl::event> &dependencies) {                            \
-        return unmqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, \
-                     tau, c, ldc, scratchpad, scratchpad_size, dependencies);                    \
-    }
-
-UNMQR_LAUNCHER_USM(std::complex<float>, rocsolver_cunmqr)
-UNMQR_LAUNCHER_USM(std::complex<double>, rocsolver_zunmqr)
-
-#undef UNMQR_LAUNCHER_USM
-
-template <typename Func, typename T>
-inline sycl::event unmtr(const char *func_name, Func func, sycl::queue &queue,
-                         oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                         oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a,
-                         std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad,
-                         std::int64_t scratchpad_size,
-                         const std::vector<sycl::event> &dependencies) {
-    using rocmDataType = typename RocmEquivalentType<T>::Type;
-    overflow_check(m, n, lda, ldc, scratchpad_size);
-    auto done = queue.submit([&](sycl::handler &cgh) {
-        int64_t num_events = dependencies.size();
-        for (int64_t i = 0; i < num_events; i++) {
-            cgh.depends_on(dependencies[i]);
-        }
-        onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) {
-            auto handle = sc.get_handle(queue);
-            auto a_ = reinterpret_cast<rocmDataType *>(a);
-            auto tau_ = reinterpret_cast<rocmDataType *>(tau);
-            auto c_ = reinterpret_cast<rocmDataType *>(c);
-            rocblas_status err;
-            ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side),
-                                        get_rocblas_fill_mode(uplo), get_rocblas_operation(trans),
-                                        m, n, a_, lda, tau_, c_, ldc);
-        });
-    });
-    return done;
-}
-
-#define UNMTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE)                                             \
-    sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,       \
-                      oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a,    \
-                      std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \
-                      std::int64_t scratchpad_size,                                             \
-                      const std::vector<sycl::event> &dependencies) {                           \
-        return unmtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a,  \
-                     lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies);              \
-    }
-
-UNMTR_LAUNCHER_USM(std::complex<float>, rocsolver_cunmtr)
-UNMTR_LAUNCHER_USM(std::complex<double>, rocsolver_zunmtr)
-
-#undef UNMTR_LAUNCHER_USM
-
-// SCRATCHPAD APIs
-
-#define GEBRD_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t gebrd_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t lda) {                                  \
-        return 0;                                                                                 \
-    }
-
-GEBRD_LAUNCHER_SCRATCH(float)
-GEBRD_LAUNCHER_SCRATCH(double)
-GEBRD_LAUNCHER_SCRATCH(std::complex<float>)
-GEBRD_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GEBRD_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t gerqf_scratchpad_size<float>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                          std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-template <>
-std::int64_t gerqf_scratchpad_size<double>(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                           std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "gerqf_scratchpad_size");
-}
-
-#define GEQRF_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t geqrf_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t lda) {                                  \
-        return 0;                                                                                 \
-    }
-
-GEQRF_LAUNCHER_SCRATCH(float)
-GEQRF_LAUNCHER_SCRATCH(double)
-GEQRF_LAUNCHER_SCRATCH(std::complex<float>)
-GEQRF_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GEQRF_LAUNCHER_SCRATCH
-
-#define GESVD_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t gesvd_scratchpad_size<TYPE>(                                                     \
-        sycl::queue & queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, \
-        std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) {                  \
-        return std::min(m, n) - 1;                                                                \
-    }
-
-GESVD_LAUNCHER_SCRATCH(float)
-GESVD_LAUNCHER_SCRATCH(double)
-GESVD_LAUNCHER_SCRATCH(std::complex<float>)
-GESVD_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GESVD_LAUNCHER_SCRATCH
-
-#define GETRF_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t getrf_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t lda) {                                  \
-        return 0;                                                                                 \
-    }
-
-GETRF_LAUNCHER_SCRATCH(float)
-GETRF_LAUNCHER_SCRATCH(double)
-GETRF_LAUNCHER_SCRATCH(std::complex<float>)
-GETRF_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRF_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t getri_scratchpad_size<float>(sycl::queue &queue, std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "getri_scratchpad_size");
-}
-template <>
-std::int64_t getri_scratchpad_size<double>(sycl::queue &queue, std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "getri_scratchpad_size");
-}
-template <>
-std::int64_t getri_scratchpad_size<std::complex<float>>(sycl::queue &queue, std::int64_t n,
-                                                        std::int64_t lda) {
-    throw unimplemented("lapack", "getri_scratchpad_size");
-}
-template <>
-std::int64_t getri_scratchpad_size<std::complex<double>>(sycl::queue &queue, std::int64_t n,
-                                                         std::int64_t lda) {
-    throw unimplemented("lapack", "getri_scratchpad_size");
-}
-
-#define GETRS_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t getrs_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::transpose trans,   \
-                                             std::int64_t n, std::int64_t nrhs, std::int64_t lda, \
-                                             std::int64_t ldb) {                                  \
-        return 0;                                                                                 \
-    }
-
-GETRS_LAUNCHER_SCRATCH(float)
-GETRS_LAUNCHER_SCRATCH(double)
-GETRS_LAUNCHER_SCRATCH(std::complex<float>)
-GETRS_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef GETRS_LAUNCHER_SCRATCH
-
-#define HEEVD_LAUNCHER_SCRATCH(TYPE)                                                     \
-    template <>                                                                          \
-    std::int64_t heevd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::job jobz, \
-                                             oneapi::mkl::uplo uplo, std::int64_t n,     \
-                                             std::int64_t lda) {                         \
-        return n;                                                                        \
-    }
-
-HEEVD_LAUNCHER_SCRATCH(std::complex<float>)
-HEEVD_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef HEEVD_LAUNCHER_SCRATCH
-
-#define HEGVD_LAUNCHER_SCRATCH(TYPE)                                                               \
-    template <>                                                                                    \
-    std::int64_t hegvd_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t itype,              \
-                                             oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,        \
-                                             std::int64_t n, std::int64_t lda, std::int64_t ldb) { \
-        return n;                                                                                  \
-    }
-
-HEGVD_LAUNCHER_SCRATCH(std::complex<float>)
-HEGVD_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef HEGVD_LAUNCHER_SCRATCH
-
-#define HETRD_LAUNCHER_SCRATCH(TYPE)                                                      \
-    template <>                                                                           \
-    std::int64_t hetrd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        return 0;                                                                         \
-    }
-
-HETRD_LAUNCHER_SCRATCH(std::complex<float>)
-HETRD_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef HETRD_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "hetrf_scratchpad_size");
-}
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    throw unimplemented("lapack", "hetrf_scratchpad_size");
-}
-
-#define ORGBR_LAUNCHER_SCRATCH(TYPE)                                                         \
-    template <>                                                                              \
-    std::int64_t orgbr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::generate vec, \
-                                             std::int64_t m, std::int64_t n, std::int64_t k, \
-                                             std::int64_t lda) {                             \
-        return 0;                                                                            \
-    }
-
-ORGBR_LAUNCHER_SCRATCH(float)
-ORGBR_LAUNCHER_SCRATCH(double)
-
-#undef ORGBR_LAUNCHER_SCRATCH
-
-#define ORGTR_LAUNCHER_SCRATCH(TYPE)                                                      \
-    template <>                                                                           \
-    std::int64_t orgtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        return 0;                                                                         \
-    }
-
-ORGTR_LAUNCHER_SCRATCH(float)
-ORGTR_LAUNCHER_SCRATCH(double)
-
-#undef ORGTR_LAUNCHER_SCRATCH
-
-#define ORGQR_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t orgqr_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t k, std::int64_t lda) {                  \
-        return 0;                                                                                 \
-    }
-
-ORGQR_LAUNCHER_SCRATCH(float)
-ORGQR_LAUNCHER_SCRATCH(double)
-
-#undef ORGQR_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t ormrq_scratchpad_size<float>(sycl::queue &queue, oneapi::mkl::side side,
-                                          oneapi::mkl::transpose trans, std::int64_t m,
-                                          std::int64_t n, std::int64_t k, std::int64_t lda,
-                                          std::int64_t ldc) {
-    throw unimplemented("lapack", "ormrq_scratchpad_size");
-}
-template <>
-std::int64_t ormrq_scratchpad_size<double>(sycl::queue &queue, oneapi::mkl::side side,
-                                           oneapi::mkl::transpose trans, std::int64_t m,
-                                           std::int64_t n, std::int64_t k, std::int64_t lda,
-                                           std::int64_t ldc) {
-    throw unimplemented("lapack", "ormrq_scratchpad_size");
-}
-
-#define ORMQRF_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                    \
-    std::int64_t ormqr_scratchpad_size<TYPE>(                                                      \
-        sycl::queue & queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, \
-        std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) {                      \
-        return 0;                                                                                  \
-    }
-
-ORMQRF_LAUNCHER_SCRATCH(float)
-ORMQRF_LAUNCHER_SCRATCH(double)
-
-#undef ORMQRF_LAUNCHER_SCRATCH
-
-#define ORMTR_LAUNCHER_SCRATCH(TYPE)                                                               \
-    template <>                                                                                    \
-    std::int64_t ormtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::side side,          \
-                                             oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, \
-                                             std::int64_t m, std::int64_t n, std::int64_t lda,     \
-                                             std::int64_t ldc) {                                   \
-        return 0;                                                                                  \
-    }
-
-ORMTR_LAUNCHER_SCRATCH(float)
-ORMTR_LAUNCHER_SCRATCH(double)
-
-#undef ORMTR_LAUNCHER_SCRATCH
-
-#define POTRF_LAUNCHER_SCRATCH(TYPE)                                                      \
-    template <>                                                                           \
-    std::int64_t potrf_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        return 0;                                                                         \
-    }
-
-POTRF_LAUNCHER_SCRATCH(float)
-POTRF_LAUNCHER_SCRATCH(double)
-POTRF_LAUNCHER_SCRATCH(std::complex<float>)
-POTRF_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRF_LAUNCHER_SCRATCH
-
-#define POTRS_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t potrs_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo,         \
-                                             std::int64_t n, std::int64_t nrhs, std::int64_t lda, \
-                                             std::int64_t ldb) {                                  \
-        return 0;                                                                                 \
-    }
-
-POTRS_LAUNCHER_SCRATCH(float)
-POTRS_LAUNCHER_SCRATCH(double)
-POTRS_LAUNCHER_SCRATCH(std::complex<float>)
-POTRS_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRS_LAUNCHER_SCRATCH
-
-#define POTRI_LAUNCHER_SCRATCH(TYPE)                                                      \
-    template <>                                                                           \
-    std::int64_t potri_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        return 0;                                                                         \
-    }
-
-POTRI_LAUNCHER_SCRATCH(float)
-POTRI_LAUNCHER_SCRATCH(double)
-POTRI_LAUNCHER_SCRATCH(std::complex<float>)
-POTRI_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef POTRI_LAUNCHER_SCRATCH
-
-#define SYTRF_LAUNCHER_SCRATCH(TYPE)                                                      \
-    template <>                                                                           \
-    std::int64_t sytrf_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        return 0;                                                                         \
-    }
-
-SYTRF_LAUNCHER_SCRATCH(float)
-SYTRF_LAUNCHER_SCRATCH(double)
-SYTRF_LAUNCHER_SCRATCH(std::complex<float>)
-SYTRF_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef SYTRF_LAUNCHER_SCRATCH
-
-#define SYEVD_LAUNCHER_SCRATCH(TYPE)                                                     \
-    template <>                                                                          \
-    std::int64_t syevd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::job jobz, \
-                                             oneapi::mkl::uplo uplo, std::int64_t n,     \
-                                             std::int64_t lda) {                         \
-        return n;                                                                        \
-    }
-
-SYEVD_LAUNCHER_SCRATCH(float)
-SYEVD_LAUNCHER_SCRATCH(double)
-
-#undef SYEVD_LAUNCHER_SCRATCH
-
-#define SYGVD_LAUNCHER_SCRATCH(TYPE)                                                               \
-    template <>                                                                                    \
-    std::int64_t sygvd_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t itype,              \
-                                             oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,        \
-                                             std::int64_t n, std::int64_t lda, std::int64_t ldb) { \
-        return n;                                                                                  \
-    }
-
-SYGVD_LAUNCHER_SCRATCH(float)
-SYGVD_LAUNCHER_SCRATCH(double)
-
-#undef SYGVD_LAUNCHER_SCRATCH
-
-#define SYTRD_LAUNCHER_SCRATCH(TYPE)                                                      \
-    template <>                                                                           \
-    std::int64_t sytrd_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        return 0;                                                                         \
-    }
-
-SYTRD_LAUNCHER_SCRATCH(float)
-SYTRD_LAUNCHER_SCRATCH(double)
-
-#undef SYTRD_LAUNCHER_SCRATCH
-
-#define TRTRS_LAUNCHER_SCRATCH(TYPE)                                                               \
-    template <>                                                                                    \
-    std::int64_t trtrs_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo,          \
-                                             oneapi::mkl::transpose trans, oneapi::mkl::diag diag, \
-                                             std::int64_t n, std::int64_t nrhs, std::int64_t lda,  \
-                                             std::int64_t ldb) {                                   \
-        return 0;                                                                                  \
-    }
-
-TRTRS_LAUNCHER_SCRATCH(float)
-TRTRS_LAUNCHER_SCRATCH(double)
-TRTRS_LAUNCHER_SCRATCH(std::complex<float>)
-TRTRS_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef TRTRS_LAUNCHER_SCRATCH
-
-#define UNGBR_LAUNCHER_SCRATCH(TYPE)                                                         \
-    template <>                                                                              \
-    std::int64_t ungbr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::generate vec, \
-                                             std::int64_t m, std::int64_t n, std::int64_t k, \
-                                             std::int64_t lda) {                             \
-        return 0;                                                                            \
-    }
-
-UNGBR_LAUNCHER_SCRATCH(std::complex<float>)
-UNGBR_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef UNGBR_LAUNCHER_SCRATCH
-
-#define UNGQR_LAUNCHER_SCRATCH(TYPE)                                                              \
-    template <>                                                                                   \
-    std::int64_t ungqr_scratchpad_size<TYPE>(sycl::queue & queue, std::int64_t m, std::int64_t n, \
-                                             std::int64_t k, std::int64_t lda) {                  \
-        return 0;                                                                                 \
-    }
-
-UNGQR_LAUNCHER_SCRATCH(std::complex<float>)
-UNGQR_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef UNGQR_LAUNCHER_SCRATCH
-
-#define UNGTR_LAUNCHER_SCRATCH(TYPE)                                                      \
-    template <>                                                                           \
-    std::int64_t ungtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::uplo uplo, \
-                                             std::int64_t n, std::int64_t lda) {          \
-        return 0;                                                                         \
-    }
-
-UNGTR_LAUNCHER_SCRATCH(std::complex<float>)
-UNGTR_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef UNGTR_LAUNCHER_SCRATCH
-
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<float>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc) {
-    throw unimplemented("lapack", "unmrq_scratchpad_size");
-}
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<double>>(sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc) {
-    throw unimplemented("lapack", "unmrq_scratchpad_size");
-}
-
-#define UNMQR_LAUNCHER_SCRATCH(TYPE)                                                               \
-    template <>                                                                                    \
-    std::int64_t unmqr_scratchpad_size<TYPE>(                                                      \
-        sycl::queue & queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, \
-        std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) {                      \
-        return 0;                                                                                  \
-    }
-
-UNMQR_LAUNCHER_SCRATCH(std::complex<float>)
-UNMQR_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef UNMQR_LAUNCHER_SCRATCH
-
-#define UNMTR_LAUNCHER_SCRATCH(TYPE)                                                               \
-    template <>                                                                                    \
-    std::int64_t unmtr_scratchpad_size<TYPE>(sycl::queue & queue, oneapi::mkl::side side,          \
-                                             oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, \
-                                             std::int64_t m, std::int64_t n, std::int64_t lda,     \
-                                             std::int64_t ldc) {                                   \
-        return 0;                                                                                  \
-    }
-
-UNMTR_LAUNCHER_SCRATCH(std::complex<float>)
-UNMTR_LAUNCHER_SCRATCH(std::complex<double>)
-
-#undef UNMTR_LAUNCHER_SCRATCH
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp b/src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp
deleted file mode 100644
index 42e262e7b..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "rocsolver_scope_handle.hpp"
-#if __has_include(<sycl/detail/common.hpp>)
-#include <sycl/detail/common.hpp>
-#else
-#include <CL/sycl/detail/common.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-/**
- * Inserts a new element in the map if its key is unique. This new element
- * is constructed in place using args as the arguments for the construction
- * of a value_type (which is an object of a pair type). The insertion only
- * takes place if no other element in the container has a key equivalent to
- * the one being emplaced (keys in a map container are unique).
- */
-thread_local rocsolver_handle<pi_context> RocsolverScopedContextHandler::handle_helper =
-    rocsolver_handle<pi_context>{};
-
-RocsolverScopedContextHandler::RocsolverScopedContextHandler(sycl::queue queue,
-                                                             sycl::interop_handle &ih)
-        : ih(ih),
-          needToRecover_(false) {
-    placedContext_ = new sycl::context(queue.get_context());
-    auto hipDevice = ih.get_native_device<sycl::backend::ext_oneapi_hip>();
-    hipError_t err;
-    hipCtx_t desired;
-    HIP_ERROR_FUNC(hipCtxGetCurrent, err, &original_);
-    HIP_ERROR_FUNC(hipDevicePrimaryCtxRetain, err, &desired, hipDevice);
-    if (original_ != desired) {
-        // Sets the desired context as the active one for the thread
-        HIP_ERROR_FUNC(hipCtxSetCurrent, err, desired);
-        // No context is installed and the suggested context is primary
-        // This is the most common case. We can activate the context in the
-        // thread and leave it there until all the PI context referring to the
-        // same underlying rocblas primary context are destroyed. This emulates
-        // the behaviour of the rocblas runtime api, and avoids costly context
-        // switches. No action is required on this side of the if.
-        needToRecover_ = !(original_ == nullptr);
-    }
-}
-
-RocsolverScopedContextHandler::~RocsolverScopedContextHandler() noexcept(false) {
-    if (needToRecover_) {
-        hipError_t err;
-        HIP_ERROR_FUNC(hipCtxSetCurrent, err, original_);
-    }
-    delete placedContext_;
-}
-
-void ContextCallback(void *userData) {
-    auto *ptr = static_cast<std::atomic<rocblas_handle> *>(userData);
-    if (!ptr) {
-        return;
-    }
-    auto handle = ptr->exchange(nullptr);
-    if (handle != nullptr) {
-        rocblas_status err1;
-        ROCSOLVER_ERROR_FUNC(rocblas_destroy_handle, err1, handle);
-        handle = nullptr;
-    }
-    else {
-        // if the handle is nullptr it means the handle was already destroyed by
-        // the rocblas_handle destructor and we're free to delete the atomic
-        // object.
-        delete ptr;
-    }
-}
-
-rocblas_handle RocsolverScopedContextHandler::get_handle(const sycl::queue &queue) {
-    auto hipDevice = ih.get_native_device<sycl::backend::ext_oneapi_hip>();
-    hipError_t hipErr;
-    hipCtx_t desired;
-    HIP_ERROR_FUNC(hipDevicePrimaryCtxRetain, hipErr, &desired, hipDevice);
-    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
-    hipStream_t streamId = get_stream(queue);
-    rocblas_status err;
-    auto it = handle_helper.rocsolver_handle_mapper_.find(piPlacedContext_);
-    if (it != handle_helper.rocsolver_handle_mapper_.end()) {
-        if (it->second == nullptr) {
-            handle_helper.rocsolver_handle_mapper_.erase(it);
-        }
-        else {
-            auto handle = it->second->load();
-            if (handle != nullptr) {
-                hipStream_t currentStreamId;
-                ROCSOLVER_ERROR_FUNC(rocblas_get_stream, err, handle, &currentStreamId);
-                if (currentStreamId != streamId) {
-                    ROCSOLVER_ERROR_FUNC(rocblas_set_stream, err, handle, streamId);
-                }
-                return handle;
-            }
-            else {
-                handle_helper.rocsolver_handle_mapper_.erase(it);
-            }
-        }
-    }
-
-    rocblas_handle handle;
-
-    ROCSOLVER_ERROR_FUNC(rocblas_create_handle, err, &handle);
-    ROCSOLVER_ERROR_FUNC(rocblas_set_stream, err, handle, streamId);
-
-    auto insert_iter = handle_helper.rocsolver_handle_mapper_.insert(
-        std::make_pair(piPlacedContext_, new std::atomic<rocblas_handle>(handle)));
-
-    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
-                                                insert_iter.first->second);
-
-    return handle;
-}
-
-hipStream_t RocsolverScopedContextHandler::get_stream(const sycl::queue &queue) {
-    return sycl::get_native<sycl::backend::ext_oneapi_hip>(queue);
-}
-sycl::context RocsolverScopedContextHandler::get_context(const sycl::queue &queue) {
-    return queue.get_context();
-}
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp b/src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp
deleted file mode 100644
index 9f1bc068a..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#ifndef _ROCSOLVER_SCOPED_HANDLE_HPP_
-#define _ROCSOLVER_SCOPED_HANDLE_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <atomic>
-#include <memory>
-#include <thread>
-#include <unordered_map>
-#include "rocsolver_helper.hpp"
-#include "rocsolver_handle.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-class RocsolverScopedContextHandler {
-    hipCtx_t original_;
-    sycl::context *placedContext_;
-    bool needToRecover_;
-    sycl::interop_handle &ih;
-    static thread_local rocsolver_handle<pi_context> handle_helper;
-    hipStream_t get_stream(const sycl::queue &queue);
-    sycl::context get_context(const sycl::queue &queue);
-
-public:
-    RocsolverScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
-
-    ~RocsolverScopedContextHandler() noexcept(false);
-
-    rocblas_handle get_handle(const sycl::queue &queue);
-    // This is a work-around function for reinterpret_casting the memory. This
-    // will be fixed when SYCL-2020 has been implemented for Pi backend.
-    template <typename T, typename U>
-    inline T get_mem(U acc) {
-        hipDeviceptr_t hipPtr = ih.get_native_mem<sycl::backend::ext_oneapi_hip>(acc);
-        return reinterpret_cast<T>(hipPtr);
-    }
-};
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-#endif //_ROCSOLVER_SCOPED_HANDLE_HPP_
diff --git a/src/lapack/backends/rocsolver/rocsolver_task.hpp b/src/lapack/backends/rocsolver/rocsolver_task.hpp
deleted file mode 100644
index 08f8e5cea..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_task.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef _MKL_LAPACK_ROCSOLVER_TASK_HPP_
-#define _MKL_LAPACK_ROCSOLVER_TASK_HPP_
-#include <hip/hip_runtime.h>
-#include <rocblas/rocblas.h>
-#include <rocsolver/rocsolver.h>
-#include <complex>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl/types.hpp"
-#include "rocsolver_scope_handle.hpp"
-#if __has_include(<sycl/detail/pi.hpp>)
-#include <sycl/detail/pi.hpp>
-#else
-#include <CL/sycl/detail/pi.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace rocsolver {
-
-template <typename H, typename F>
-static inline void host_task_internal(H &cgh, sycl::queue queue, F f) {
-    cgh.host_task([f, queue](cl::sycl::interop_handle ih) {
-        auto sc = RocsolverScopedContextHandler(queue, ih);
-        f(sc);
-    });
-}
-
-template <typename H, typename F>
-static inline void onemkl_rocsolver_host_task(H &cgh, sycl::queue queue, F f) {
-    (void)host_task_internal(cgh, queue, f);
-}
-
-} // namespace rocsolver
-} // namespace lapack
-} // namespace mkl
-} // namespace oneapi
-#endif // _MKL_LAPACK_ROCSOLVER_TASK_HPP_
diff --git a/src/lapack/backends/rocsolver/rocsolver_wrappers.cpp b/src/lapack/backends/rocsolver/rocsolver_wrappers.cpp
deleted file mode 100644
index 8613cc05e..000000000
--- a/src/lapack/backends/rocsolver/rocsolver_wrappers.cpp
+++ /dev/null
@@ -1,428 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Copyright 2022 Intel Corporation
-*
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-#include "lapack/function_table.hpp"
-#include "oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT lapack_function_table_t mkl_lapack_table = {
-    WRAPPER_VERSION,
-#define LAPACK_BACKEND rocsolver
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::heevd,
-    oneapi::mkl::lapack::rocsolver::heevd,
-    oneapi::mkl::lapack::rocsolver::hegvd,
-    oneapi::mkl::lapack::rocsolver::hegvd,
-    oneapi::mkl::lapack::rocsolver::hetrd,
-    oneapi::mkl::lapack::rocsolver::hetrd,
-    oneapi::mkl::lapack::rocsolver::hetrf,
-    oneapi::mkl::lapack::rocsolver::hetrf,
-    oneapi::mkl::lapack::rocsolver::orgbr,
-    oneapi::mkl::lapack::rocsolver::orgbr,
-    oneapi::mkl::lapack::rocsolver::orgqr,
-    oneapi::mkl::lapack::rocsolver::orgqr,
-    oneapi::mkl::lapack::rocsolver::orgtr,
-    oneapi::mkl::lapack::rocsolver::orgtr,
-    oneapi::mkl::lapack::rocsolver::ormtr,
-    oneapi::mkl::lapack::rocsolver::ormtr,
-    oneapi::mkl::lapack::rocsolver::ormrq,
-    oneapi::mkl::lapack::rocsolver::ormrq,
-    oneapi::mkl::lapack::rocsolver::ormqr,
-    oneapi::mkl::lapack::rocsolver::ormqr,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::syevd,
-    oneapi::mkl::lapack::rocsolver::syevd,
-    oneapi::mkl::lapack::rocsolver::sygvd,
-    oneapi::mkl::lapack::rocsolver::sygvd,
-    oneapi::mkl::lapack::rocsolver::sytrd,
-    oneapi::mkl::lapack::rocsolver::sytrd,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::ungbr,
-    oneapi::mkl::lapack::rocsolver::ungbr,
-    oneapi::mkl::lapack::rocsolver::ungqr,
-    oneapi::mkl::lapack::rocsolver::ungqr,
-    oneapi::mkl::lapack::rocsolver::ungtr,
-    oneapi::mkl::lapack::rocsolver::ungtr,
-    oneapi::mkl::lapack::rocsolver::unmrq,
-    oneapi::mkl::lapack::rocsolver::unmrq,
-    oneapi::mkl::lapack::rocsolver::unmqr,
-    oneapi::mkl::lapack::rocsolver::unmqr,
-    oneapi::mkl::lapack::rocsolver::unmtr,
-    oneapi::mkl::lapack::rocsolver::unmtr,
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gebrd,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::gerqf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::geqrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getrf,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getri,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::getrs,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::gesvd,
-    oneapi::mkl::lapack::rocsolver::heevd,
-    oneapi::mkl::lapack::rocsolver::heevd,
-    oneapi::mkl::lapack::rocsolver::hegvd,
-    oneapi::mkl::lapack::rocsolver::hegvd,
-    oneapi::mkl::lapack::rocsolver::hetrd,
-    oneapi::mkl::lapack::rocsolver::hetrd,
-    oneapi::mkl::lapack::rocsolver::hetrf,
-    oneapi::mkl::lapack::rocsolver::hetrf,
-    oneapi::mkl::lapack::rocsolver::orgbr,
-    oneapi::mkl::lapack::rocsolver::orgbr,
-    oneapi::mkl::lapack::rocsolver::orgqr,
-    oneapi::mkl::lapack::rocsolver::orgqr,
-    oneapi::mkl::lapack::rocsolver::orgtr,
-    oneapi::mkl::lapack::rocsolver::orgtr,
-    oneapi::mkl::lapack::rocsolver::ormtr,
-    oneapi::mkl::lapack::rocsolver::ormtr,
-    oneapi::mkl::lapack::rocsolver::ormrq,
-    oneapi::mkl::lapack::rocsolver::ormrq,
-    oneapi::mkl::lapack::rocsolver::ormqr,
-    oneapi::mkl::lapack::rocsolver::ormqr,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potrf,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potri,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::potrs,
-    oneapi::mkl::lapack::rocsolver::syevd,
-    oneapi::mkl::lapack::rocsolver::syevd,
-    oneapi::mkl::lapack::rocsolver::sygvd,
-    oneapi::mkl::lapack::rocsolver::sygvd,
-    oneapi::mkl::lapack::rocsolver::sytrd,
-    oneapi::mkl::lapack::rocsolver::sytrd,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::sytrf,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::trtrs,
-    oneapi::mkl::lapack::rocsolver::ungbr,
-    oneapi::mkl::lapack::rocsolver::ungbr,
-    oneapi::mkl::lapack::rocsolver::ungqr,
-    oneapi::mkl::lapack::rocsolver::ungqr,
-    oneapi::mkl::lapack::rocsolver::ungtr,
-    oneapi::mkl::lapack::rocsolver::ungtr,
-    oneapi::mkl::lapack::rocsolver::unmrq,
-    oneapi::mkl::lapack::rocsolver::unmrq,
-    oneapi::mkl::lapack::rocsolver::unmqr,
-    oneapi::mkl::lapack::rocsolver::unmqr,
-    oneapi::mkl::lapack::rocsolver::unmtr,
-    oneapi::mkl::lapack::rocsolver::unmtr,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getrf_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getri_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::getrs_batch,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrf_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::potrs_batch,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch,
-    oneapi::mkl::lapack::rocsolver::gebrd_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::gebrd_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::gebrd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::gebrd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::gerqf_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::gerqf_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::gerqf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::gerqf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::geqrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::geqrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::geqrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::geqrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::gesvd_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::gesvd_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::gesvd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::gesvd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getri_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getri_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getri_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getri_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::heevd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::heevd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::hegvd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::hegvd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::hetrd_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::hetrd_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::hetrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::hetrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::orgbr_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::orgbr_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::orgtr_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::orgtr_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::orgqr_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::orgqr_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::ormrq_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::ormrq_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::ormqr_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::ormqr_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::ormtr_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::ormtr_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::potrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::potrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::potrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::potrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::potrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::potri_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::potri_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potri_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::potri_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::sytrf_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::sytrf_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::sytrf_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::sytrf_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::syevd_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::syevd_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::sygvd_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::sygvd_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::sytrd_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::sytrd_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::trtrs_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::trtrs_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::trtrs_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::trtrs_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::ungbr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::ungbr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::ungqr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::ungqr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::ungtr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::ungtr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::unmrq_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::unmrq_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::unmqr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::unmqr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::unmtr_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::unmtr_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::orgqr_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<float>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<double>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size<std::complex<double>>,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch_scratchpad_size<std::complex<float>>,
-    oneapi::mkl::lapack::rocsolver::ungqr_batch_scratchpad_size<std::complex<double>>
-#undef LAPACK_BACKEND
-};
diff --git a/src/lapack/function_table.hpp b/src/lapack/function_table.hpp
deleted file mode 100644
index e034fe357..000000000
--- a/src/lapack/function_table.hpp
+++ /dev/null
@@ -1,1839 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-
-typedef struct {
-    int version;
-    void (*cgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<float> &d, sycl::buffer<float> &e,
-                        sycl::buffer<std::complex<float>> &tauq,
-                        sycl::buffer<std::complex<float>> &taup,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<double> &d, sycl::buffer<double> &e,
-                        sycl::buffer<double> &tauq, sycl::buffer<double> &taup,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-                        sycl::buffer<float> &tauq, sycl::buffer<float> &taup,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*zgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<double> &d, sycl::buffer<double> &e,
-                        sycl::buffer<std::complex<double>> &tauq,
-                        sycl::buffer<std::complex<double>> &taup,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*sgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<double> &tau,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*cgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<double> &tau,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*zgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                        std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*zgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                        std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cgetrs_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-                        std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dgetrs_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b, std::int64_t ldb,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sgetrs_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*zgetrs_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &b,
-                        std::int64_t ldb, sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                        std::int64_t m, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                        sycl::buffer<double> &s, sycl::buffer<double> &u, std::int64_t ldu,
-                        sycl::buffer<double> &vt, std::int64_t ldvt,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                        std::int64_t m, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                        sycl::buffer<float> &s, sycl::buffer<float> &u, std::int64_t ldu,
-                        sycl::buffer<float> &vt, std::int64_t ldvt, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                        std::int64_t m, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-                        std::int64_t lda, sycl::buffer<float> &s,
-                        sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-                        sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                        std::int64_t m, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-                        std::int64_t lda, sycl::buffer<double> &s,
-                        sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-                        sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cheevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                        std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zheevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                        std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*chegvd_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                        oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                        sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zhegvd_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                        oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                        sycl::buffer<double> &w, sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*chetrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<float> &d, sycl::buffer<float> &e,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zhetrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<double> &d, sycl::buffer<double> &e,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*chetrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zhetrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*sorgbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                        std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-                        sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dorgbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                        std::int64_t n, std::int64_t k, sycl::buffer<double> &a, std::int64_t lda,
-                        sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dorgqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sorgqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*sorgtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*dorgtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*sormtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                        oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-                        sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dormtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                        oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-                        sycl::buffer<double> &c, std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*sormrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                        std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dormrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                        std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dormqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &c,
-                        std::int64_t ldc, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*sormqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<float> &tau, sycl::buffer<float> &c,
-                        std::int64_t ldc, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*spotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dpotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cpotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zpotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*spotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dpotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cpotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zpotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*spotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                        sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dpotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                        sycl::buffer<double> &b, std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cpotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zpotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dsyevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                        std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-                        sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*ssyevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                        std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-                        sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dsygvd_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                        oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-                        std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-                        sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*ssygvd_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                        oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-                        std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-                        sycl::buffer<float> &w, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dsytrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-                        sycl::buffer<double> &e, sycl::buffer<double> &tau,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*ssytrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &d,
-                        sycl::buffer<float> &e, sycl::buffer<float> &tau,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*ssytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*dsytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*csytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zsytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::int64_t> &ipiv,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*ctrtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*dtrtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                        sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-                        std::int64_t ldb, sycl::buffer<double> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*strtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                        sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-                        std::int64_t ldb, sycl::buffer<float> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*ztrtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                        oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cungbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                        std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zungbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                        std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-                        std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cungqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zungqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cungtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zungtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cunmrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zunmrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cunmqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zunmqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                        std::int64_t m, std::int64_t n, std::int64_t k,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*cunmtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                        oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<float>> &tau,
-                        sycl::buffer<std::complex<float>> &c, std::int64_t ldc,
-                        sycl::buffer<std::complex<float>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    void (*zunmtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                        oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                        sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                        sycl::buffer<std::complex<double>> &tau,
-                        sycl::buffer<std::complex<double>> &c, std::int64_t ldc,
-                        sycl::buffer<std::complex<double>> &scratchpad,
-                        std::int64_t scratchpad_size);
-    sycl::event (*cgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                   std::complex<float> *tauq, std::complex<float> *taup,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                   std::int64_t lda, double *d, double *e, double *tauq,
-                                   double *taup, double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                   std::int64_t lda, float *d, float *e, float *tauq, float *taup,
-                                   float *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                   std::complex<double> *tauq, std::complex<double> *taup,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                   std::int64_t lda, float *tau, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                   std::int64_t lda, double *tau, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *tau, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *tau, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *tau, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                   std::int64_t lda, double *tau, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                   std::int64_t lda, float *tau, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *tau, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a,
-                                   std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a,
-                                   std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                   std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda,
-                                   std::int64_t *ipiv, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda,
-                                   std::int64_t *ipiv, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, std::complex<double> *a,
-                                   std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                   std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                   std::int64_t *ipiv, std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                   std::int64_t nrhs, double *a, std::int64_t lda,
-                                   std::int64_t *ipiv, double *b, std::int64_t ldb,
-                                   double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                   std::int64_t nrhs, float *a, std::int64_t lda,
-                                   std::int64_t *ipiv, float *b, std::int64_t ldb,
-                                   float *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                                   std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                   std::int64_t *ipiv, std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   double *a, std::int64_t lda, double *s, double *u,
-                                   std::int64_t ldu, double *vt, std::int64_t ldvt,
-                                   double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   float *a, std::int64_t lda, float *s, float *u, std::int64_t ldu,
-                                   float *vt, std::int64_t ldvt, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda, float *s,
-                                   std::complex<float> *u, std::int64_t ldu,
-                                   std::complex<float> *vt, std::int64_t ldvt,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                   oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda, double *s,
-                                   std::complex<double> *u, std::int64_t ldu,
-                                   std::complex<double> *vt, std::int64_t ldvt,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cheevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                   std::int64_t lda, float *w, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zheevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                   std::int64_t lda, double *w, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*chegvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   float *w, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zhegvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   double *w, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*chetrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                                   std::complex<float> *tau, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zhetrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                                   std::complex<double> *tau, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*chetrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zhetrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sorgbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                                   float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dorgbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                                   double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dorgqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, double *a, std::int64_t lda, double *tau,
-                                   double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sorgqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, float *a, std::int64_t lda, float *tau,
-                                   float *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sorgtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   float *a, std::int64_t lda, float *tau, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dorgtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   double *a, std::int64_t lda, double *tau, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sormtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, float *a, std::int64_t lda,
-                                   float *tau, float *c, std::int64_t ldc, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dormtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, double *a, std::int64_t lda,
-                                   double *tau, double *c, std::int64_t ldc, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sormrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                   std::int64_t ldc, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dormrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, double *a, std::int64_t lda, double *tau,
-                                   double *c, std::int64_t ldc, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dormqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, double *a, std::int64_t lda, double *tau,
-                                   double *c, std::int64_t ldc, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*sormqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, float *a, std::int64_t lda, float *tau, float *c,
-                                   std::int64_t ldc, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*spotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   float *a, std::int64_t lda, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dpotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   double *a, std::int64_t lda, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cpotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zpotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*spotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   float *a, std::int64_t lda, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dpotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   double *a, std::int64_t lda, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cpotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zpotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*spotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t nrhs, float *a, std::int64_t lda, float *b,
-                                   std::int64_t ldb, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dpotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t nrhs, double *a, std::int64_t lda, double *b,
-                                   std::int64_t ldb, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cpotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zpotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dsyevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                                   std::int64_t lda, double *w, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*ssyevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                                   std::int64_t lda, float *w, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dsygvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                                   std::int64_t lda, double *b, std::int64_t ldb, double *w,
-                                   double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*ssygvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                                   std::int64_t lda, float *b, std::int64_t ldb, float *w,
-                                   float *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dsytrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   double *a, std::int64_t lda, double *d, double *e, double *tau,
-                                   double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*ssytrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   float *a, std::int64_t lda, float *d, float *e, float *tau,
-                                   float *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*ssytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   float *a, std::int64_t lda, std::int64_t *ipiv,
-                                   float *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dsytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   double *a, std::int64_t lda, std::int64_t *ipiv,
-                                   double *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*csytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zsytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*ctrtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                   std::int64_t n, std::int64_t nrhs, std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *b, std::int64_t ldb,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*dtrtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                   std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda,
-                                   double *b, std::int64_t ldb, double *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*strtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                   std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                                   float *b, std::int64_t ldb, float *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*ztrtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                   oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                                   std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *b, std::int64_t ldb,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cungbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *tau,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zungbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m,
-                                   std::int64_t n, std::int64_t k, std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *tau,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cungqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *tau, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zungqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *tau, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cungtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *tau, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zungtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *tau, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cunmrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *tau, std::complex<float> *c,
-                                   std::int64_t ldc, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zunmrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *tau, std::complex<double> *c,
-                                   std::int64_t ldc, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cunmqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                   std::complex<float> *tau, std::complex<float> *c,
-                                   std::int64_t ldc, std::complex<float> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zunmqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-                                   std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                   std::complex<double> *tau, std::complex<double> *c,
-                                   std::int64_t ldc, std::complex<double> *scratchpad,
-                                   std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*cunmtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<float> *a,
-                                   std::int64_t lda, std::complex<float> *tau,
-                                   std::complex<float> *c, std::int64_t ldc,
-                                   std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    sycl::event (*zunmtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                   oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                   std::int64_t m, std::int64_t n, std::complex<double> *a,
-                                   std::int64_t lda, std::complex<double> *tau,
-                                   std::complex<double> *c, std::int64_t ldc,
-                                   std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                                   const std::vector<sycl::event> &dependencies);
-    void (*sgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<float> &tau, std::int64_t stride_tau,
-                              std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*dgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<double> &tau, std::int64_t stride_tau,
-                              std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*cgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                              std::int64_t stride_tau, std::int64_t batch_size,
-                              sycl::buffer<std::complex<float>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*zgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                              std::int64_t stride_tau, std::int64_t batch_size,
-                              sycl::buffer<std::complex<double>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*sgetri_batch_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                              std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*dgetri_batch_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                              std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*cgetri_batch_sycl)(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                              std::int64_t stride_ipiv, std::int64_t batch_size,
-                              sycl::buffer<std::complex<float>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*zgetri_batch_sycl)(sycl::queue &queue, std::int64_t n,
-                              sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                              std::int64_t stride_ipiv, std::int64_t batch_size,
-                              sycl::buffer<std::complex<double>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*sgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                              std::int64_t stride_ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*dgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                              std::int64_t stride_ipiv, sycl::buffer<double> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*cgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                              sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<std::complex<float>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*zgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                              sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<std::complex<double>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*sgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                              std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*dgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                              std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*cgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                              std::int64_t stride_ipiv, std::int64_t batch_size,
-                              sycl::buffer<std::complex<float>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*zgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                              sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                              std::int64_t stride_ipiv, std::int64_t batch_size,
-                              sycl::buffer<std::complex<double>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*sorgqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                              sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<float> &tau, std::int64_t stride_tau,
-                              std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*dorgqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                              sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<double> &tau, std::int64_t stride_tau,
-                              std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*spotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                              std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*dpotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                              std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*cpotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::int64_t batch_size,
-                              sycl::buffer<std::complex<float>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*zpotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                              std::int64_t stride_a, std::int64_t batch_size,
-                              sycl::buffer<std::complex<double>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*spotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<float> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size);
-    void (*dpotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<double> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size);
-    void (*cpotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<std::complex<float>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*zpotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                              std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                              std::int64_t lda, std::int64_t stride_a,
-                              sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                              std::int64_t stride_b, std::int64_t batch_size,
-                              sycl::buffer<std::complex<double>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*cungqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                              sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                              std::int64_t stride_tau, std::int64_t batch_size,
-                              sycl::buffer<std::complex<float>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    void (*zungqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-                              sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                              std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                              std::int64_t stride_tau, std::int64_t batch_size,
-                              sycl::buffer<std::complex<double>> &scratchpad,
-                              std::int64_t scratchpad_size);
-    sycl::event (*sgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         float *a, std::int64_t lda, std::int64_t stride_a,
-                                         float *tau, std::int64_t stride_tau,
-                                         std::int64_t batch_size, float *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         double *a, std::int64_t lda, std::int64_t stride_a,
-                                         double *tau, std::int64_t stride_tau,
-                                         std::int64_t batch_size, double *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> *tau,
-                                         std::int64_t stride_tau, std::int64_t batch_size,
-                                         std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> *tau,
-                                         std::int64_t stride_tau, std::int64_t batch_size,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         float *a, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size, float *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         double *a, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size, double *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t *ipiv,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size,
-                                         std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t *ipiv,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n, float *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size, float *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n, double *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size, double *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n, std::complex<float> *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t *ipiv, std::int64_t stride_ipiv,
-                                         std::int64_t batch_size, std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n,
-                                         std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t *ipiv,
-                                         std::int64_t stride_ipiv, std::int64_t batch_size,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t n, std::int64_t nrhs, float *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t *ipiv, std::int64_t stride_ipiv, float *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size, float *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                         std::int64_t n, std::int64_t nrhs, double *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t *ipiv, std::int64_t stride_ipiv, double *b,
-                                         std::int64_t ldb, std::int64_t stride_b,
-                                         std::int64_t batch_size, double *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetrs_batch_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-        std::int64_t stride_ipiv, std::complex<float> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetrs_batch_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-        std::int64_t stride_ipiv, std::complex<double> *b, std::int64_t ldb, std::int64_t stride_b,
-        std::int64_t batch_size, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*sorgqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, float *a, std::int64_t lda,
-                                         std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                                         std::int64_t batch_size, float *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dorgqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, double *a, std::int64_t lda,
-                                         std::int64_t stride_a, double *tau,
-                                         std::int64_t stride_tau, std::int64_t batch_size,
-                                         double *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*spotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         float *a, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t batch_size, float *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dpotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         double *a, std::int64_t lda, std::int64_t stride_a,
-                                         std::int64_t batch_size, double *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cpotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t batch_size,
-                                         std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zpotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::int64_t batch_size,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*spotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t nrhs, float *a, std::int64_t lda,
-                                         std::int64_t stride_a, float *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         float *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dpotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t nrhs, double *a, std::int64_t lda,
-                                         std::int64_t stride_a, double *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         double *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cpotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t nrhs, std::complex<float> *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::complex<float> *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zpotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-                                         std::int64_t nrhs, std::complex<double> *a,
-                                         std::int64_t lda, std::int64_t stride_a,
-                                         std::complex<double> *b, std::int64_t ldb,
-                                         std::int64_t stride_b, std::int64_t batch_size,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cungqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<float> *tau,
-                                         std::int64_t stride_tau, std::int64_t batch_size,
-                                         std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zungqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                         std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                                         std::int64_t stride_a, std::complex<double> *tau,
-                                         std::int64_t stride_tau, std::int64_t batch_size,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         float **a, std::int64_t *lda, float **tau,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         float *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         double **a, std::int64_t *lda, double **tau,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         double *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::complex<float> **a, std::int64_t *lda,
-                                         std::complex<float> **tau, std::int64_t group_count,
-                                         std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::complex<double> **a, std::int64_t *lda,
-                                         std::complex<double> **tau, std::int64_t group_count,
-                                         std::int64_t *group_sizes,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         float **a, std::int64_t *lda, std::int64_t **ipiv,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         float *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         double **a, std::int64_t *lda, std::int64_t **ipiv,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         double *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::complex<float> **a, std::int64_t *lda,
-                                         std::int64_t **ipiv, std::int64_t group_count,
-                                         std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::complex<double> **a, std::int64_t *lda,
-                                         std::int64_t **ipiv, std::int64_t group_count,
-                                         std::int64_t *group_sizes,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n, float **a,
-                                         std::int64_t *lda, std::int64_t **ipiv,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         float *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n, double **a,
-                                         std::int64_t *lda, std::int64_t **ipiv,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         double *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n,
-                                         std::complex<float> **a, std::int64_t *lda,
-                                         std::int64_t **ipiv, std::int64_t group_count,
-                                         std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n,
-                                         std::complex<double> **a, std::int64_t *lda,
-                                         std::int64_t **ipiv, std::int64_t group_count,
-                                         std::int64_t *group_sizes,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*sgetrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *n, std::int64_t *nrhs, float **a,
-                                         std::int64_t *lda, std::int64_t **ipiv, float **b,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes, float *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dgetrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *n, std::int64_t *nrhs, double **a,
-                                         std::int64_t *lda, std::int64_t **ipiv, double **b,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes, double *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cgetrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose *trans,
-                                         std::int64_t *n, std::int64_t *nrhs,
-                                         std::complex<float> **a, std::int64_t *lda,
-                                         std::int64_t **ipiv, std::complex<float> **b,
-                                         std::int64_t *ldb, std::int64_t group_count,
-                                         std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zgetrs_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-        std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv, std::complex<double> **b,
-        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes,
-        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-        const std::vector<sycl::event> &dependencies);
-    sycl::event (*sorgqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, float **a, std::int64_t *lda, float **tau,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         float *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dorgqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, double **a, std::int64_t *lda,
-                                         double **tau, std::int64_t group_count,
-                                         std::int64_t *group_sizes, double *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*spotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, float **a, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         float *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dpotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, double **a, std::int64_t *lda,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         double *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cpotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::complex<float> **a,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zpotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::complex<double> **a,
-                                         std::int64_t *lda, std::int64_t group_count,
-                                         std::int64_t *group_sizes,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*spotrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *nrhs, float **a,
-                                         std::int64_t *lda, float **b, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         float *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*dpotrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                         std::int64_t *n, std::int64_t *nrhs, double **a,
-                                         std::int64_t *lda, double **b, std::int64_t *ldb,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         double *scratchpad, std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*cpotrs_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-        std::complex<float> **a, std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *group_sizes, std::complex<float> *scratchpad,
-        std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*zpotrs_group_usm_sycl)(
-        sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs,
-        std::complex<double> **a, std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb,
-        std::int64_t group_count, std::int64_t *group_sizes, std::complex<double> *scratchpad,
-        std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies);
-    sycl::event (*cungqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::complex<float> **a,
-                                         std::int64_t *lda, std::complex<float> **tau,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         std::complex<float> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-    sycl::event (*zungqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-                                         std::int64_t *k, std::complex<double> **a,
-                                         std::int64_t *lda, std::complex<double> **tau,
-                                         std::int64_t group_count, std::int64_t *group_sizes,
-                                         std::complex<double> *scratchpad,
-                                         std::int64_t scratchpad_size,
-                                         const std::vector<sycl::event> &dependencies);
-
-    std::int64_t (*sgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*dgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*cgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*zgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*sgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*dgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*cgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*zgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*sgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*dgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*cgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*zgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*sgesvd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                                oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldu,
-                                                std::int64_t ldvt);
-    std::int64_t (*dgesvd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                                oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldu,
-                                                std::int64_t ldvt);
-    std::int64_t (*cgesvd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                                oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldu,
-                                                std::int64_t ldvt);
-    std::int64_t (*zgesvd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                                                oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldu,
-                                                std::int64_t ldvt);
-    std::int64_t (*sgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*dgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*cgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*zgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*sgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*dgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*cgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*zgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*sgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*dgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*cgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*zgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*cheevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                oneapi::mkl::uplo uplo, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*zheevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                oneapi::mkl::uplo uplo, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*chegvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype,
-                                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldb);
-    std::int64_t (*zhegvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype,
-                                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldb);
-    std::int64_t (*chetrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*zhetrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*chetrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*zhetrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*sorgbr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                                std::int64_t lda);
-    std::int64_t (*dorgbr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                                std::int64_t lda);
-    std::int64_t (*sorgtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*dorgtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*sorgqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t k, std::int64_t lda);
-    std::int64_t (*dorgqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t k, std::int64_t lda);
-    std::int64_t (*sormrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*dormrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*sormqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*dormqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*sormtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldc);
-    std::int64_t (*dormtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldc);
-    std::int64_t (*spotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*dpotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*cpotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*zpotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*spotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*dpotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*cpotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*zpotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*spotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*dpotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*cpotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*zpotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*ssytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*dsytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*csytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*zsytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*ssyevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                oneapi::mkl::uplo uplo, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*dsyevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz,
-                                                oneapi::mkl::uplo uplo, std::int64_t n,
-                                                std::int64_t lda);
-    std::int64_t (*ssygvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype,
-                                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldb);
-    std::int64_t (*dsygvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype,
-                                                oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldb);
-    std::int64_t (*ssytrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*dsytrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*strtrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans,
-                                                oneapi::mkl::diag diag, std::int64_t n,
-                                                std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*dtrtrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans,
-                                                oneapi::mkl::diag diag, std::int64_t n,
-                                                std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*ctrtrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans,
-                                                oneapi::mkl::diag diag, std::int64_t n,
-                                                std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*ztrtrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans,
-                                                oneapi::mkl::diag diag, std::int64_t n,
-                                                std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t ldb);
-    std::int64_t (*cungbr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                                std::int64_t lda);
-    std::int64_t (*zungbr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::generate vect,
-                                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                                std::int64_t lda);
-    std::int64_t (*cungqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t k, std::int64_t lda);
-    std::int64_t (*zungqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n,
-                                                std::int64_t k, std::int64_t lda);
-    std::int64_t (*cungtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*zungtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                std::int64_t n, std::int64_t lda);
-    std::int64_t (*cunmrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*zunmrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*cunmqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*zunmqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t k, std::int64_t lda,
-                                                std::int64_t ldc);
-    std::int64_t (*cunmtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldc);
-    std::int64_t (*zunmtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side,
-                                                oneapi::mkl::uplo uplo,
-                                                oneapi::mkl::transpose trans, std::int64_t m,
-                                                std::int64_t n, std::int64_t lda, std::int64_t ldc);
-    std::int64_t (*sgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*dgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*cgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*zgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*sgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*dgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*cgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*zgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_ipiv,
-                                                      std::int64_t batch_size);
-    std::int64_t (*sgetrs_batch_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-        std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    std::int64_t (*dgetrs_batch_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-        std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    std::int64_t (*cgetrs_batch_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-        std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    std::int64_t (*zgetrs_batch_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-        std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb,
-        std::int64_t stride_b, std::int64_t batch_size);
-    std::int64_t (*sgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*dgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*cgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*zgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*spotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t batch_size);
-    std::int64_t (*dpotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t batch_size);
-    std::int64_t (*cpotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t batch_size);
-    std::int64_t (*zpotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t lda,
-                                                      std::int64_t stride_a,
-                                                      std::int64_t batch_size);
-    std::int64_t (*spotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t nrhs,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t ldb, std::int64_t stride_b,
-                                                      std::int64_t batch_size);
-    std::int64_t (*dpotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t nrhs,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t ldb, std::int64_t stride_b,
-                                                      std::int64_t batch_size);
-    std::int64_t (*cpotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t nrhs,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t ldb, std::int64_t stride_b,
-                                                      std::int64_t batch_size);
-    std::int64_t (*zpotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                      std::int64_t n, std::int64_t nrhs,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t ldb, std::int64_t stride_b,
-                                                      std::int64_t batch_size);
-    std::int64_t (*sorgqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t k,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*dorgqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t k,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*cungqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t k,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*zungqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m,
-                                                      std::int64_t n, std::int64_t k,
-                                                      std::int64_t lda, std::int64_t stride_a,
-                                                      std::int64_t stride_tau,
-                                                      std::int64_t batch_size);
-    std::int64_t (*sgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*dgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*cgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*zgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*sgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*dgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*cgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*zgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*sgetrs_group_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-        std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-    std::int64_t (*dgetrs_group_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-        std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-    std::int64_t (*cgetrs_group_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-        std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-    std::int64_t (*zgetrs_group_scratchpad_size_sycl)(
-        sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-        std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes);
-    std::int64_t (*sgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*dgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*cgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*zgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*sorgqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *k,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*dorgqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *k,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*spotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*dpotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*cpotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*zpotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *lda,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*spotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *nrhs,
-                                                      std::int64_t *lda, std::int64_t *ldb,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*dpotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *nrhs,
-                                                      std::int64_t *lda, std::int64_t *ldb,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*cpotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *nrhs,
-                                                      std::int64_t *lda, std::int64_t *ldb,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*zpotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                                                      std::int64_t *n, std::int64_t *nrhs,
-                                                      std::int64_t *lda, std::int64_t *ldb,
-                                                      std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*cungqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *k,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-    std::int64_t (*zungqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m,
-                                                      std::int64_t *n, std::int64_t *k,
-                                                      std::int64_t *lda, std::int64_t group_count,
-                                                      std::int64_t *group_sizes);
-
-} lapack_function_table_t;
diff --git a/src/lapack/lapack_loader.cpp b/src/lapack/lapack_loader.cpp
deleted file mode 100644
index 43fe349d1..000000000
--- a/src/lapack/lapack_loader.cpp
+++ /dev/null
@@ -1,3004 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/lapack/detail/lapack_loader.hpp"
-
-#include "function_table_initializer.hpp"
-#include "lapack/function_table.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace lapack {
-namespace detail {
-
-static oneapi::mkl::detail::table_initializer<domain::lapack, lapack_function_table_t>
-    function_tables;
-
-void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<float> &d,
-           sycl::buffer<float> &e, sycl::buffer<std::complex<float>> &tauq,
-           sycl::buffer<std::complex<float>> &taup, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size);
-}
-void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<double> &tauq, sycl::buffer<double> &taup,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size);
-}
-void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<float> &tauq, sycl::buffer<float> &taup, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].sgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size);
-}
-void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<std::complex<double>> &tauq,
-           sycl::buffer<std::complex<double>> &taup, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad,
-                                        scratchpad_size);
-}
-void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].sgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-           std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<float>> &b,
-           std::int64_t ldb, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-           std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-           std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-           std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-           std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-           oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &s, sycl::buffer<double> &u, std::int64_t ldu,
-           sycl::buffer<double> &vt, std::int64_t ldvt, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size);
-}
-void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-           oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &s, sycl::buffer<float> &u, std::int64_t ldu,
-           sycl::buffer<float> &vt, std::int64_t ldvt, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].sgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size);
-}
-void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-           oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<float> &s,
-           sycl::buffer<std::complex<float>> &u, std::int64_t ldu,
-           sycl::buffer<std::complex<float>> &vt, std::int64_t ldvt,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size);
-}
-void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-           oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &s,
-           sycl::buffer<std::complex<double>> &u, std::int64_t ldu,
-           sycl::buffer<std::complex<double>> &vt, std::int64_t ldvt,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt,
-                                        scratchpad, scratchpad_size);
-}
-void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-           oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<float> &w, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cheevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size);
-}
-void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-           oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<std::complex<double>> &a,
-           std::int64_t lda, sycl::buffer<double> &w,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zheevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size);
-}
-void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-           oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb, sycl::buffer<float> &w,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].chegvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size);
-}
-void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-           oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb, sycl::buffer<double> &w,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zhegvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size);
-}
-void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<float> &d,
-           sycl::buffer<float> &e, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].chetrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size);
-}
-void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<std::complex<double>> &tau,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zhetrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size);
-}
-void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].chetrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zhetrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-           std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].sorgbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size);
-}
-void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-           std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dorgbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size);
-}
-void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dorgqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sorgqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sorgtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dorgtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].sormtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &c, std::int64_t ldc, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dormtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].sormrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &c, std::int64_t ldc, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dormrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &tau,
-           sycl::buffer<double> &c, std::int64_t ldc, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dormqr_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &tau,
-           sycl::buffer<float> &c, std::int64_t ldc, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].sormqr_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].spotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dpotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cpotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zpotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].spotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dpotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cpotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zpotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size);
-}
-void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b,
-           std::int64_t ldb, sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].spotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b,
-           std::int64_t ldb, sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dpotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           std::int64_t nrhs, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cpotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           std::int64_t nrhs, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zpotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                        scratchpad_size);
-}
-void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-           oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a, std::int64_t lda,
-           sycl::buffer<double> &w, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dsyevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size);
-}
-void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-           oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a, std::int64_t lda,
-           sycl::buffer<float> &w, sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].ssyevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                        scratchpad_size);
-}
-void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-           oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<double> &a,
-           std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb, sycl::buffer<double> &w,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dsygvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size);
-}
-void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-           oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer<float> &a,
-           std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb, sycl::buffer<float> &w,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].ssygvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad,
-                                        scratchpad_size);
-}
-void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &d,
-           sycl::buffer<double> &e, sycl::buffer<double> &tau, sycl::buffer<double> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].dsytrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size);
-}
-void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &d, sycl::buffer<float> &e,
-           sycl::buffer<float> &tau, sycl::buffer<float> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].ssytrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                        scratchpad_size);
-}
-void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].ssytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dsytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda, sycl::buffer<std::int64_t> &ipiv,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].csytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::int64_t> &ipiv, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zsytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size);
-}
-void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].ctrtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size);
-}
-void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<double> &a, std::int64_t lda, sycl::buffer<double> &b, std::int64_t ldb,
-           sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dtrtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size);
-}
-void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<float> &a, std::int64_t lda, sycl::buffer<float> &b, std::int64_t ldb,
-           sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].strtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size);
-}
-void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-           oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].ztrtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb,
-                                        scratchpad, scratchpad_size);
-}
-void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-           std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<float>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<float>> &tau,
-           sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cungbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size);
-}
-void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-           std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer<std::complex<double>> &a,
-           std::int64_t lda, sycl::buffer<std::complex<double>> &tau,
-           sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zungbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                        scratchpad_size);
-}
-void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cungqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-           std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zungqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cungtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zungtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size);
-}
-void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cunmrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zunmrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cunmqr_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zunmqr_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<float>> &tau, sycl::buffer<std::complex<float>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<float>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].cunmtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-           oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n,
-           sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-           sycl::buffer<std::complex<double>> &tau, sycl::buffer<std::complex<double>> &c,
-           std::int64_t ldc, sycl::buffer<std::complex<double>> &scratchpad,
-           std::int64_t scratchpad_size) {
-    function_tables[libkey].zunmtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc,
-                                        scratchpad, scratchpad_size);
-}
-sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                  std::complex<float> *tauq, std::complex<float> *taup,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  double *a, std::int64_t lda, double *d, double *e, double *tauq, double *taup,
-                  double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  float *a, std::int64_t lda, float *d, float *e, float *tauq, float *taup,
-                  float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                  std::complex<double> *tauq, std::complex<double> *taup,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  float *a, std::int64_t lda, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  double *a, std::int64_t lda, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  double *a, std::int64_t lda, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  float *a, std::int64_t lda, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *a,
-                  std::int64_t lda, std::int64_t *ipiv, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *a,
-                  std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                  std::int64_t n, std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<float> *b, std::int64_t ldb,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                  std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda,
-                  std::int64_t *ipiv, double *b, std::int64_t ldb, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                  std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv,
-                  float *b, std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                  std::int64_t n, std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                  std::int64_t *ipiv, std::complex<double> *b, std::int64_t ldb,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, double *a,
-                  std::int64_t lda, double *s, double *u, std::int64_t ldu, double *vt,
-                  std::int64_t ldvt, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                                                   ldvt, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a,
-                  std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt,
-                  std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                                                   ldvt, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::complex<float> *a,
-                  std::int64_t lda, float *s, std::complex<float> *u, std::int64_t ldu,
-                  std::complex<float> *vt, std::int64_t ldvt, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                                                   ldvt, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, double *s, std::complex<double> *u,
-                  std::int64_t ldu, std::complex<double> *vt, std::int64_t ldvt,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt,
-                                                   ldvt, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  float *w, std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cheevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  double *w, std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zheevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *b,
-                  std::int64_t ldb, float *w, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].chegvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *b,
-                  std::int64_t ldb, double *w, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zhegvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<float> *a, std::int64_t lda, float *d, float *e,
-                  std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].chetrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<double> *a, std::int64_t lda, double *d, double *e,
-                  std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zhetrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].chetrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zhetrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                  float *tau, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sorgbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                  double *tau, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dorgbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::int64_t k, double *a, std::int64_t lda, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dorgqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::int64_t k, float *a, std::int64_t lda, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sorgqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, float *a, std::int64_t lda, float *tau, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sorgtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, double *a, std::int64_t lda, double *tau, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dorgtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, float *a, std::int64_t lda, float *tau, float *c,
-                  std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sormtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c,
-                                                   ldc, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, double *a, std::int64_t lda, double *tau, double *c,
-                  std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dormtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c,
-                                                   ldc, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                  float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sormrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  double *a, std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                  double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dormrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  double *a, std::int64_t lda, double *tau, double *c, std::int64_t ldc,
-                  double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dormqr_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc,
-                  float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sormqr_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, float *a, std::int64_t lda, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].spotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, double *a, std::int64_t lda, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dpotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cpotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zpotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, float *a, std::int64_t lda, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].spotri_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, double *a, std::int64_t lda, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dpotri_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cpotri_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zpotri_usm_sycl(queue, uplo, n, a, lda, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, float *b,
-                  std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].spotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, double *b,
-                  std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dpotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *b, std::int64_t ldb, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cpotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *b, std::int64_t ldb, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zpotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, double *w,
-                  double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dsyevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, float *w,
-                  float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].ssyevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, double *a,
-                  std::int64_t lda, double *b, std::int64_t ldb, double *w, double *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dsygvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype,
-                  oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, float *a,
-                  std::int64_t lda, float *b, std::int64_t ldb, float *w, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].ssygvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, double *a, std::int64_t lda, double *d, double *e, double *tau,
-                  double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dsytrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, float *a, std::int64_t lda, float *d, float *e, float *tau,
-                  float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].ssytrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].ssytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv,
-                  double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dsytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<float> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].csytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<double> *a, std::int64_t lda, std::int64_t *ipiv,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zsytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                  std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *b, std::int64_t ldb, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].ctrtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b,
-                                                   ldb, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                  std::int64_t nrhs, double *a, std::int64_t lda, double *b, std::int64_t ldb,
-                  double *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dtrtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b,
-                                                   ldb, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                  std::int64_t nrhs, float *a, std::int64_t lda, float *b, std::int64_t ldb,
-                  float *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].strtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b,
-                                                   ldb, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n,
-                  std::int64_t nrhs, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *b, std::int64_t ldb, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].ztrtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b,
-                                                   ldb, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<float> *a,
-                  std::int64_t lda, std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cungbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec,
-                  std::int64_t m, std::int64_t n, std::int64_t k, std::complex<double> *a,
-                  std::int64_t lda, std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zungbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cungqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                  std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zungqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *tau, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cungtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *tau, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zungtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad,
-                                                   scratchpad_size, dependencies);
-}
-sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *c, std::int64_t ldc, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cunmrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *c, std::int64_t ldc, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zunmrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<float> *a, std::int64_t lda, std::complex<float> *tau,
-                  std::complex<float> *c, std::int64_t ldc, std::complex<float> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cunmqr_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k,
-                  std::complex<double> *a, std::int64_t lda, std::complex<double> *tau,
-                  std::complex<double> *c, std::int64_t ldc, std::complex<double> *scratchpad,
-                  std::int64_t scratchpad_size, const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zunmqr_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc,
-                                                   scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                  std::complex<float> *tau, std::complex<float> *c, std::int64_t ldc,
-                  std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cunmtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c,
-                                                   ldc, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side,
-                  oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m,
-                  std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                  std::complex<double> *tau, std::complex<double> *c, std::int64_t ldc,
-                  std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                  const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zunmtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c,
-                                                   ldc, scratchpad, scratchpad_size, dependencies);
-}
-void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<float>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].cgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::complex<double>> &tau, std::int64_t stride_tau,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].zgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].sgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].dgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].cgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].zgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<float> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].sgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size);
-}
-void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 sycl::buffer<double> &b, std::int64_t ldb, std::int64_t stride_b,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].dgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size);
-}
-void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, sycl::buffer<std::complex<float>> &b, std::int64_t ldb,
-                 std::int64_t stride_b, std::int64_t batch_size,
-                 sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size);
-}
-void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::int64_t> &ipiv,
-                 std::int64_t stride_ipiv, sycl::buffer<std::complex<double>> &b, std::int64_t ldb,
-                 std::int64_t stride_b, std::int64_t batch_size,
-                 sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv,
-                                              stride_ipiv, b, ldb, stride_b, batch_size, scratchpad,
-                                              scratchpad_size);
-}
-void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].sgetrf_batch_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].dgetrf_batch_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<float>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<float>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].cgetrf_batch_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 sycl::buffer<std::complex<double>> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<std::int64_t> &ipiv, std::int64_t stride_ipiv,
-                 std::int64_t batch_size, sycl::buffer<std::complex<double>> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].zgetrf_batch_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::int64_t k, sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<float> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<float> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].sorgqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::int64_t k, sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 sycl::buffer<double> &tau, std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<double> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].dorgqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, sycl::buffer<float> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].spotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size);
-}
-void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, sycl::buffer<double> &a, std::int64_t lda, std::int64_t stride_a,
-                 std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].dpotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size);
-}
-void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                 std::int64_t stride_a, std::int64_t batch_size,
-                 sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cpotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size);
-}
-void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                 std::int64_t stride_a, std::int64_t batch_size,
-                 sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zpotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size,
-                                              scratchpad, scratchpad_size);
-}
-void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<float> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<float> &b, std::int64_t ldb,
-                 std::int64_t stride_b, std::int64_t batch_size, sycl::buffer<float> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].spotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<double> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<double> &b, std::int64_t ldb,
-                 std::int64_t stride_b, std::int64_t batch_size, sycl::buffer<double> &scratchpad,
-                 std::int64_t scratchpad_size) {
-    function_tables[libkey].dpotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<float>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<float>> &b,
-                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                 sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cpotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                 std::int64_t n, std::int64_t nrhs, sycl::buffer<std::complex<double>> &a,
-                 std::int64_t lda, std::int64_t stride_a, sycl::buffer<std::complex<double>> &b,
-                 std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                 sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zpotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb,
-                                              stride_b, batch_size, scratchpad, scratchpad_size);
-}
-void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::int64_t k, sycl::buffer<std::complex<float>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::complex<float>> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<std::complex<float>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].cungqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-                 std::int64_t k, sycl::buffer<std::complex<double>> &a, std::int64_t lda,
-                 std::int64_t stride_a, sycl::buffer<std::complex<double>> &tau,
-                 std::int64_t stride_tau, std::int64_t batch_size,
-                 sycl::buffer<std::complex<double>> &scratchpad, std::int64_t scratchpad_size) {
-    function_tables[libkey].zungqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau,
-                                              batch_size, scratchpad, scratchpad_size);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, float *a, std::int64_t lda, std::int64_t stride_a,
-                        float *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgeqrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, double *a, std::int64_t lda, std::int64_t stride_a,
-                        double *tau, std::int64_t stride_tau, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgeqrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<float> *tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgeqrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<double> *tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgeqrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, float *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, double *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv,
-                                                         stride_ipiv, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, float *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, float *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetrs_batch_usm_sycl(
-        queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, double *a,
-                        std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv,
-                        std::int64_t stride_ipiv, double *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetrs_batch_usm_sycl(
-        queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                        std::complex<float> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<float> *b,
-                        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetrs_batch_usm_sycl(
-        queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs,
-                        std::complex<double> *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex<double> *b,
-                        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetrs_batch_usm_sycl(
-        queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size,
-        scratchpad, scratchpad_size, dependencies);
-}
-sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::int64_t k, float *a, std::int64_t lda,
-                        std::int64_t stride_a, float *tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sorgqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::int64_t k, double *a, std::int64_t lda,
-                        std::int64_t stride_a, double *tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dorgqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, float *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].spotrf_batch_usm_sycl(
-        queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, double *a, std::int64_t lda, std::int64_t stride_a,
-                        std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dpotrf_batch_usm_sycl(
-        queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cpotrf_batch_usm_sycl(
-        queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zpotrf_batch_usm_sycl(
-        queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda,
-                        std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].spotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b,
-                                                         ldb, stride_b, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda,
-                        std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b,
-                        std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dpotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b,
-                                                         ldb, stride_b, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, std::int64_t nrhs, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<float> *b, std::int64_t ldb,
-                        std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cpotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b,
-                                                         ldb, stride_b, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo,
-                        std::int64_t n, std::int64_t nrhs, std::complex<double> *a,
-                        std::int64_t lda, std::int64_t stride_a, std::complex<double> *b,
-                        std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zpotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b,
-                                                         ldb, stride_b, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<float> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<float> *tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cungqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m,
-                        std::int64_t n, std::int64_t k, std::complex<double> *a, std::int64_t lda,
-                        std::int64_t stride_a, std::complex<double> *tau, std::int64_t stride_tau,
-                        std::int64_t batch_size, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zungqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau,
-                                                         stride_tau, batch_size, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, float **a, std::int64_t *lda, float **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, double **a, std::int64_t *lda, double **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                        std::complex<float> **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                        std::complex<double> **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, float **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, double **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes, double *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                        std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, float **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, double **a,
-                        std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                        std::complex<float> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n,
-                        std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-                        float **a, std::int64_t *lda, std::int64_t **ipiv, float **b,
-                        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes,
-                        float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b,
-                                                         ldb, group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-                        double **a, std::int64_t *lda, std::int64_t **ipiv, double **b,
-                        std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b,
-                                                         ldb, group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-                        std::complex<float> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::complex<float> **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b,
-                                                         ldb, group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue,
-                        oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs,
-                        std::complex<double> **a, std::int64_t *lda, std::int64_t **ipiv,
-                        std::complex<double> **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b,
-                                                         ldb, group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::int64_t *k, float **a, std::int64_t *lda, float **tau,
-                        std::int64_t group_count, std::int64_t *group_sizes, float *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].sorgqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::int64_t *k, double **a, std::int64_t *lda,
-                        double **tau, std::int64_t group_count, std::int64_t *group_sizes,
-                        double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dorgqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, float **a, std::int64_t *lda, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].spotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, double **a, std::int64_t *lda, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dpotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, std::complex<float> **a, std::int64_t *lda,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cpotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, std::complex<double> **a, std::int64_t *lda,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zpotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, std::int64_t *nrhs, float **a, std::int64_t *lda,
-                        float **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].spotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb,
-                                                         group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, std::int64_t *nrhs, double **a, std::int64_t *lda,
-                        double **b, std::int64_t *ldb, std::int64_t group_count,
-                        std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].dpotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb,
-                                                         group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, std::int64_t *nrhs, std::complex<float> **a,
-                        std::int64_t *lda, std::complex<float> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<float> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cpotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb,
-                                                         group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo,
-                        std::int64_t *n, std::int64_t *nrhs, std::complex<double> **a,
-                        std::int64_t *lda, std::complex<double> **b, std::int64_t *ldb,
-                        std::int64_t group_count, std::int64_t *group_sizes,
-                        std::complex<double> *scratchpad, std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zpotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb,
-                                                         group_count, group_sizes, scratchpad,
-                                                         scratchpad_size, dependencies);
-}
-sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::int64_t *k, std::complex<float> **a,
-                        std::int64_t *lda, std::complex<float> **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<float> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].cungqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m,
-                        std::int64_t *n, std::int64_t *k, std::complex<double> **a,
-                        std::int64_t *lda, std::complex<double> **tau, std::int64_t group_count,
-                        std::int64_t *group_sizes, std::complex<double> *scratchpad,
-                        std::int64_t scratchpad_size,
-                        const std::vector<sycl::event> &dependencies) {
-    return function_tables[libkey].zungqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count,
-                                                         group_sizes, scratchpad, scratchpad_size,
-                                                         dependencies);
-}
-
-template <>
-std::int64_t gebrd_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].sgebrd_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gebrd_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].dgebrd_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gebrd_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].cgebrd_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gebrd_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zgebrd_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].sgerqf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].dgerqf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].cgerqf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gerqf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zgerqf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].sgeqrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].dgeqrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].cgeqrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t geqrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zgeqrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                          std::int64_t m, std::int64_t n, std::int64_t lda,
-                                          std::int64_t ldu, std::int64_t ldvt) {
-    return function_tables[libkey].sgesvd_scratchpad_size_sycl(queue, jobu, jobvt, m, n, lda, ldu,
-                                                               ldvt);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                                           std::int64_t m, std::int64_t n, std::int64_t lda,
-                                           std::int64_t ldu, std::int64_t ldvt) {
-    return function_tables[libkey].dgesvd_scratchpad_size_sycl(queue, jobu, jobvt, m, n, lda, ldu,
-                                                               ldvt);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue,
-                                                        oneapi::mkl::jobsvd jobu,
-                                                        oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda,
-                                                        std::int64_t ldu, std::int64_t ldvt) {
-    return function_tables[libkey].cgesvd_scratchpad_size_sycl(queue, jobu, jobvt, m, n, lda, ldu,
-                                                               ldvt);
-}
-template <>
-std::int64_t gesvd_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue,
-                                                         oneapi::mkl::jobsvd jobu,
-                                                         oneapi::mkl::jobsvd jobvt, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda,
-                                                         std::int64_t ldu, std::int64_t ldvt) {
-    return function_tables[libkey].zgesvd_scratchpad_size_sycl(queue, jobu, jobvt, m, n, lda, ldu,
-                                                               ldvt);
-}
-template <>
-std::int64_t getrf_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].sgetrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t getrf_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           std::int64_t m, std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].dgetrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t getrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].cgetrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t getrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zgetrf_scratchpad_size_sycl(queue, m, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].sgetri_scratchpad_size_sycl(queue, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].dgetri_scratchpad_size_sycl(queue, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t n,
-                                                        std::int64_t lda) {
-    return function_tables[libkey].cgetri_scratchpad_size_sycl(queue, n, lda);
-}
-template <>
-std::int64_t getri_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t n,
-                                                         std::int64_t lda) {
-    return function_tables[libkey].zgetri_scratchpad_size_sycl(queue, n, lda);
-}
-template <>
-std::int64_t getrs_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::transpose trans, std::int64_t n,
-                                          std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].sgetrs_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t getrs_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::transpose trans, std::int64_t n,
-                                           std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].dgetrs_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t getrs_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].cgetrs_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t getrs_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].zgetrs_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t heevd_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda) {
-    return function_tables[libkey].cheevd_scratchpad_size_sycl(queue, jobz, uplo, n, lda);
-}
-template <>
-std::int64_t heevd_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda) {
-    return function_tables[libkey].zheevd_scratchpad_size_sycl(queue, jobz, uplo, n, lda);
-}
-template <>
-std::int64_t hegvd_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t itype,
-                                                        oneapi::mkl::job jobz,
-                                                        oneapi::mkl::uplo uplo, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].chegvd_scratchpad_size_sycl(queue, itype, jobz, uplo, n, lda,
-                                                               ldb);
-}
-template <>
-std::int64_t hegvd_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t itype,
-                                                         oneapi::mkl::job jobz,
-                                                         oneapi::mkl::uplo uplo, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].zhegvd_scratchpad_size_sycl(queue, itype, jobz, uplo, n, lda,
-                                                               ldb);
-}
-template <>
-std::int64_t hetrd_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].chetrd_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t hetrd_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zhetrd_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].chetrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t hetrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zhetrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t orgbr_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::generate vect, std::int64_t m,
-                                          std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return function_tables[libkey].sorgbr_scratchpad_size_sycl(queue, vect, m, n, k, lda);
-}
-template <>
-std::int64_t orgbr_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::generate vect, std::int64_t m,
-                                           std::int64_t n, std::int64_t k, std::int64_t lda) {
-    return function_tables[libkey].dorgbr_scratchpad_size_sycl(queue, vect, m, n, k, lda);
-}
-template <>
-std::int64_t orgtr_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::uplo uplo, std::int64_t n,
-                                          std::int64_t lda) {
-    return function_tables[libkey].sorgtr_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t orgtr_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::uplo uplo, std::int64_t n,
-                                           std::int64_t lda) {
-    return function_tables[libkey].dorgtr_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t orgqr_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          std::int64_t m, std::int64_t n, std::int64_t k,
-                                          std::int64_t lda) {
-    return function_tables[libkey].sorgqr_scratchpad_size_sycl(queue, m, n, k, lda);
-}
-template <>
-std::int64_t orgqr_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           std::int64_t m, std::int64_t n, std::int64_t k,
-                                           std::int64_t lda) {
-    return function_tables[libkey].dorgqr_scratchpad_size_sycl(queue, m, n, k, lda);
-}
-template <>
-std::int64_t ormrq_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                          std::int64_t m, std::int64_t n, std::int64_t k,
-                                          std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].sormrq_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormrq_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                           std::int64_t m, std::int64_t n, std::int64_t k,
-                                           std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].dormrq_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormqr_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                          std::int64_t m, std::int64_t n, std::int64_t k,
-                                          std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].sormqr_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormqr_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::side side, oneapi::mkl::transpose trans,
-                                           std::int64_t m, std::int64_t n, std::int64_t k,
-                                           std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].dormqr_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormtr_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                          oneapi::mkl::transpose trans, std::int64_t m,
-                                          std::int64_t n, std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].sormtr_scratchpad_size_sycl(queue, side, uplo, trans, m, n, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t ormtr_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::side side, oneapi::mkl::uplo uplo,
-                                           oneapi::mkl::transpose trans, std::int64_t m,
-                                           std::int64_t n, std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].dormtr_scratchpad_size_sycl(queue, side, uplo, trans, m, n, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t potrf_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::uplo uplo, std::int64_t n,
-                                          std::int64_t lda) {
-    return function_tables[libkey].spotrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrf_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::uplo uplo, std::int64_t n,
-                                           std::int64_t lda) {
-    return function_tables[libkey].dpotrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].cpotrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zpotrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potrs_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs,
-                                          std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].spotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t potrs_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::uplo uplo, std::int64_t n,
-                                           std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].dpotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t potrs_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t nrhs,
-                                                        std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].cpotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t potrs_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t nrhs,
-                                                         std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].zpotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb);
-}
-template <>
-std::int64_t potri_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::uplo uplo, std::int64_t n,
-                                          std::int64_t lda) {
-    return function_tables[libkey].spotri_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potri_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::uplo uplo, std::int64_t n,
-                                           std::int64_t lda) {
-    return function_tables[libkey].dpotri_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potri_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].cpotri_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t potri_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zpotri_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::uplo uplo, std::int64_t n,
-                                          std::int64_t lda) {
-    return function_tables[libkey].ssytrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::uplo uplo, std::int64_t n,
-                                           std::int64_t lda) {
-    return function_tables[libkey].dsytrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].csytrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrf_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zsytrf_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t syevd_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                          std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].ssyevd_scratchpad_size_sycl(queue, jobz, uplo, n, lda);
-}
-template <>
-std::int64_t syevd_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                                           std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].dsyevd_scratchpad_size_sycl(queue, jobz, uplo, n, lda);
-}
-template <>
-std::int64_t sygvd_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          std::int64_t itype, oneapi::mkl::job jobz,
-                                          oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                          std::int64_t ldb) {
-    return function_tables[libkey].ssygvd_scratchpad_size_sycl(queue, itype, jobz, uplo, n, lda,
-                                                               ldb);
-}
-template <>
-std::int64_t sygvd_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           std::int64_t itype, oneapi::mkl::job jobz,
-                                           oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                                           std::int64_t ldb) {
-    return function_tables[libkey].dsygvd_scratchpad_size_sycl(queue, itype, jobz, uplo, n, lda,
-                                                               ldb);
-}
-template <>
-std::int64_t sytrd_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::uplo uplo, std::int64_t n,
-                                          std::int64_t lda) {
-    return function_tables[libkey].ssytrd_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t sytrd_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::uplo uplo, std::int64_t n,
-                                           std::int64_t lda) {
-    return function_tables[libkey].dsytrd_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                          oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                          oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs,
-                                          std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].strtrs_scratchpad_size_sycl(queue, uplo, trans, diag, n, nrhs,
-                                                               lda, ldb);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                           oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                                           oneapi::mkl::diag diag, std::int64_t n,
-                                           std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) {
-    return function_tables[libkey].dtrtrs_scratchpad_size_sycl(queue, uplo, trans, diag, n, nrhs,
-                                                               lda, ldb);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        oneapi::mkl::diag diag, std::int64_t n,
-                                                        std::int64_t nrhs, std::int64_t lda,
-                                                        std::int64_t ldb) {
-    return function_tables[libkey].ctrtrs_scratchpad_size_sycl(queue, uplo, trans, diag, n, nrhs,
-                                                               lda, ldb);
-}
-template <>
-std::int64_t trtrs_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         oneapi::mkl::diag diag, std::int64_t n,
-                                                         std::int64_t nrhs, std::int64_t lda,
-                                                         std::int64_t ldb) {
-    return function_tables[libkey].ztrtrs_scratchpad_size_sycl(queue, uplo, trans, diag, n, nrhs,
-                                                               lda, ldb);
-}
-template <>
-std::int64_t ungbr_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue,
-                                                        oneapi::mkl::generate vect, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda) {
-    return function_tables[libkey].cungbr_scratchpad_size_sycl(queue, vect, m, n, k, lda);
-}
-template <>
-std::int64_t ungbr_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue,
-                                                         oneapi::mkl::generate vect, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda) {
-    return function_tables[libkey].zungbr_scratchpad_size_sycl(queue, vect, m, n, k, lda);
-}
-template <>
-std::int64_t ungqr_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, std::int64_t m,
-                                                        std::int64_t n, std::int64_t k,
-                                                        std::int64_t lda) {
-    return function_tables[libkey].cungqr_scratchpad_size_sycl(queue, m, n, k, lda);
-}
-template <>
-std::int64_t ungqr_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, std::int64_t m,
-                                                         std::int64_t n, std::int64_t k,
-                                                         std::int64_t lda) {
-    return function_tables[libkey].zungqr_scratchpad_size_sycl(queue, m, n, k, lda);
-}
-template <>
-std::int64_t ungtr_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                        std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].cungtr_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t ungtr_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::uplo uplo,
-                                                         std::int64_t n, std::int64_t lda) {
-    return function_tables[libkey].zungtr_scratchpad_size_sycl(queue, uplo, n, lda);
-}
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc) {
-    return function_tables[libkey].cunmrq_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t unmrq_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc) {
-    return function_tables[libkey].zunmrq_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t unmqr_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t k, std::int64_t lda,
-                                                        std::int64_t ldc) {
-    return function_tables[libkey].cunmqr_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t unmqr_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t k, std::int64_t lda,
-                                                         std::int64_t ldc) {
-    return function_tables[libkey].zunmqr_scratchpad_size_sycl(queue, side, trans, m, n, k, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t unmtr_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                        sycl::queue &queue, oneapi::mkl::side side,
-                                                        oneapi::mkl::uplo uplo,
-                                                        oneapi::mkl::transpose trans,
-                                                        std::int64_t m, std::int64_t n,
-                                                        std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].cunmtr_scratchpad_size_sycl(queue, side, uplo, trans, m, n, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t unmtr_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                         sycl::queue &queue, oneapi::mkl::side side,
-                                                         oneapi::mkl::uplo uplo,
-                                                         oneapi::mkl::transpose trans,
-                                                         std::int64_t m, std::int64_t n,
-                                                         std::int64_t lda, std::int64_t ldc) {
-    return function_tables[libkey].zunmtr_scratchpad_size_sycl(queue, side, uplo, trans, m, n, lda,
-                                                               ldc);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t m, std::int64_t n, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                std::int64_t batch_size) {
-    return function_tables[libkey].sgetrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t m, std::int64_t n, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                 std::int64_t batch_size) {
-    return function_tables[libkey].dgetrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return function_tables[libkey].cgetrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return function_tables[libkey].zgetrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t n, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                std::int64_t batch_size) {
-    return function_tables[libkey].sgetri_batch_scratchpad_size_sycl(queue, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t n, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                 std::int64_t batch_size) {
-    return function_tables[libkey].dgetri_batch_scratchpad_size_sycl(queue, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return function_tables[libkey].cgetri_batch_scratchpad_size_sycl(queue, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda,
-    std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) {
-    return function_tables[libkey].zgetri_batch_scratchpad_size_sycl(queue, n, lda, stride_a,
-                                                                     stride_ipiv, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                oneapi::mkl::transpose trans, std::int64_t n,
-                                                std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                std::int64_t ldb, std::int64_t stride_b,
-                                                std::int64_t batch_size) {
-    return function_tables[libkey].sgetrs_batch_scratchpad_size_sycl(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 oneapi::mkl::transpose trans, std::int64_t n,
-                                                 std::int64_t nrhs, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t stride_ipiv,
-                                                 std::int64_t ldb, std::int64_t stride_b,
-                                                 std::int64_t batch_size) {
-    return function_tables[libkey].dgetrs_batch_scratchpad_size_sycl(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv,
-    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    return function_tables[libkey].cgetrs_batch_scratchpad_size_sycl(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv,
-    std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) {
-    return function_tables[libkey].zgetrs_batch_scratchpad_size_sycl(
-        queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t m, std::int64_t n, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t stride_tau,
-                                                std::int64_t batch_size) {
-    return function_tables[libkey].sgeqrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t m, std::int64_t n, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t stride_tau,
-                                                 std::int64_t batch_size) {
-    return function_tables[libkey].dgeqrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    return function_tables[libkey].cgeqrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    return function_tables[libkey].zgeqrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                oneapi::mkl::uplo uplo, std::int64_t n,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t batch_size) {
-    return function_tables[libkey].spotrf_batch_scratchpad_size_sycl(queue, uplo, n, lda, stride_a,
-                                                                     batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 oneapi::mkl::uplo uplo, std::int64_t n,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t batch_size) {
-    return function_tables[libkey].dpotrf_batch_scratchpad_size_sycl(queue, uplo, n, lda, stride_a,
-                                                                     batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) {
-    return function_tables[libkey].cpotrf_batch_scratchpad_size_sycl(queue, uplo, n, lda, stride_a,
-                                                                     batch_size);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) {
-    return function_tables[libkey].zpotrf_batch_scratchpad_size_sycl(queue, uplo, n, lda, stride_a,
-                                                                     batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                oneapi::mkl::uplo uplo, std::int64_t n,
-                                                std::int64_t nrhs, std::int64_t lda,
-                                                std::int64_t stride_a, std::int64_t ldb,
-                                                std::int64_t stride_b, std::int64_t batch_size) {
-    return function_tables[libkey].spotrs_batch_scratchpad_size_sycl(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 oneapi::mkl::uplo uplo, std::int64_t n,
-                                                 std::int64_t nrhs, std::int64_t lda,
-                                                 std::int64_t stride_a, std::int64_t ldb,
-                                                 std::int64_t stride_b, std::int64_t batch_size) {
-    return function_tables[libkey].dpotrs_batch_scratchpad_size_sycl(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size) {
-    return function_tables[libkey].cpotrs_batch_scratchpad_size_sycl(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n,
-    std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb,
-    std::int64_t stride_b, std::int64_t batch_size) {
-    return function_tables[libkey].zpotrs_batch_scratchpad_size_sycl(
-        queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t m, std::int64_t n, std::int64_t k,
-                                                std::int64_t lda, std::int64_t stride_a,
-                                                std::int64_t stride_tau, std::int64_t batch_size) {
-    return function_tables[libkey].sorgqr_batch_scratchpad_size_sycl(queue, m, n, k, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t m, std::int64_t n, std::int64_t k,
-                                                 std::int64_t lda, std::int64_t stride_a,
-                                                 std::int64_t stride_tau, std::int64_t batch_size) {
-    return function_tables[libkey].dorgqr_batch_scratchpad_size_sycl(queue, m, n, k, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    return function_tables[libkey].cungqr_batch_scratchpad_size_sycl(queue, m, n, k, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k,
-    std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) {
-    return function_tables[libkey].zungqr_batch_scratchpad_size_sycl(queue, m, n, k, lda, stride_a,
-                                                                     stride_tau, batch_size);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return function_tables[libkey].sgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t *m, std::int64_t *n,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return function_tables[libkey].dgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                              sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return function_tables[libkey].cgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getrf_batch_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                               sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return function_tables[libkey].zgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return function_tables[libkey].sgetri_group_scratchpad_size_sycl(queue, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t *n, std::int64_t *lda,
-                                                 std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return function_tables[libkey].dgetri_group_scratchpad_size_sycl(queue, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                              sycl::queue &queue, std::int64_t *n,
-                                                              std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return function_tables[libkey].cgetri_group_scratchpad_size_sycl(queue, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getri_batch_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                               sycl::queue &queue, std::int64_t *n,
-                                                               std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return function_tables[libkey].zgetri_group_scratchpad_size_sycl(queue, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                oneapi::mkl::transpose *trans, std::int64_t *n,
-                                                std::int64_t *nrhs, std::int64_t *lda,
-                                                std::int64_t *ldb, std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return function_tables[libkey].sgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda,
-                                                                     ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 oneapi::mkl::transpose *trans, std::int64_t *n,
-                                                 std::int64_t *nrhs, std::int64_t *lda,
-                                                 std::int64_t *ldb, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return function_tables[libkey].dgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda,
-                                                                     ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes) {
-    return function_tables[libkey].cgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda,
-                                                                     ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t getrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes) {
-    return function_tables[libkey].zgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda,
-                                                                     ldb, group_count, group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t *m, std::int64_t *n, std::int64_t *lda,
-                                                std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return function_tables[libkey].sgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t *m, std::int64_t *n,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return function_tables[libkey].dgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<float>>(oneapi::mkl::device libkey,
-                                                              sycl::queue &queue, std::int64_t *m,
-                                                              std::int64_t *n, std::int64_t *lda,
-                                                              std::int64_t group_count,
-                                                              std::int64_t *group_sizes) {
-    return function_tables[libkey].cgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t geqrf_batch_scratchpad_size<std::complex<double>>(oneapi::mkl::device libkey,
-                                                               sycl::queue &queue, std::int64_t *m,
-                                                               std::int64_t *n, std::int64_t *lda,
-                                                               std::int64_t group_count,
-                                                               std::int64_t *group_sizes) {
-    return function_tables[libkey].zgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count,
-                                                                     group_sizes);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                                std::int64_t *lda, std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return function_tables[libkey].sorgqr_group_scratchpad_size_sycl(queue, m, n, k, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t orgqr_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 std::int64_t *m, std::int64_t *n, std::int64_t *k,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return function_tables[libkey].dorgqr_group_scratchpad_size_sycl(queue, m, n, k, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                                std::int64_t *lda, std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return function_tables[libkey].spotrf_group_scratchpad_size_sycl(queue, uplo, n, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                                 std::int64_t *lda, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return function_tables[libkey].dpotrf_group_scratchpad_size_sycl(queue, uplo, n, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) {
-    return function_tables[libkey].cpotrf_group_scratchpad_size_sycl(queue, uplo, n, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrf_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) {
-    return function_tables[libkey].zpotrf_group_scratchpad_size_sycl(queue, uplo, n, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<float>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                                std::int64_t *nrhs, std::int64_t *lda,
-                                                std::int64_t *ldb, std::int64_t group_count,
-                                                std::int64_t *group_sizes) {
-    return function_tables[libkey].spotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<double>(oneapi::mkl::device libkey, sycl::queue &queue,
-                                                 oneapi::mkl::uplo *uplo, std::int64_t *n,
-                                                 std::int64_t *nrhs, std::int64_t *lda,
-                                                 std::int64_t *ldb, std::int64_t group_count,
-                                                 std::int64_t *group_sizes) {
-    return function_tables[libkey].dpotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes) {
-    return function_tables[libkey].cpotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t potrs_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n,
-    std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count,
-    std::int64_t *group_sizes) {
-    return function_tables[libkey].zpotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<float>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) {
-    return function_tables[libkey].cungqr_group_scratchpad_size_sycl(queue, m, n, k, lda,
-                                                                     group_count, group_sizes);
-}
-template <>
-std::int64_t ungqr_batch_scratchpad_size<std::complex<double>>(
-    oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n,
-    std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) {
-    return function_tables[libkey].zungqr_group_scratchpad_size_sycl(queue, m, n, k, lda,
-                                                                     group_count, group_sizes);
-}
-
-} //namespace detail
-} //namespace lapack
-} //namespace mkl
-} //namespace oneapi
diff --git a/src/rng/CMakeLists.txt b/src/rng/CMakeLists.txt
deleted file mode 100644
index 30df39403..000000000
--- a/src/rng/CMakeLists.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build backends
-add_subdirectory(backends)
-
-# Recipe for RNG loader object
-if(BUILD_SHARED_LIBS)
-add_library(onemkl_rng OBJECT)
-target_sources(onemkl_rng PRIVATE rng_loader.cpp)
-target_include_directories(onemkl_rng
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${PROJECT_SOURCE_DIR}/src/include
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-          $<TARGET_FILE_DIR:onemkl>
-)
-
-target_compile_options(onemkl_rng PRIVATE ${ONEMKL_BUILD_COPT})
-
-set_target_properties(onemkl_rng PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET onemkl_rng SOURCES rng_loader.cpp)
-else()
-  target_link_libraries(onemkl_rng PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
-endif()
diff --git a/src/rng/backends/CMakeLists.txt b/src/rng/backends/CMakeLists.txt
deleted file mode 100644
index 9045f7e75..000000000
--- a/src/rng/backends/CMakeLists.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_custom_target(onemkl_backend_libs_rng)
-add_dependencies(onemkl_backend_libs onemkl_backend_libs_rng)
-
-if(ENABLE_MKLCPU_BACKEND)
-  add_subdirectory(mklcpu)
-endif()
-
-if(ENABLE_MKLGPU_BACKEND)
-  add_subdirectory(mklgpu)
-endif()
-
-if(ENABLE_CURAND_BACKEND AND UNIX)
-  add_subdirectory(curand)
-endif()
-
-if(ENABLE_ROCRAND_BACKEND AND UNIX)
-  add_subdirectory(rocrand)
-endif()
-
diff --git a/src/rng/backends/curand/CMakeLists.txt b/src/rng/backends/curand/CMakeLists.txt
deleted file mode 100644
index f37a34f1d..000000000
--- a/src/rng/backends/curand/CMakeLists.txt
+++ /dev/null
@@ -1,103 +0,0 @@
-#=================================================================================
-# cuRAND back-end Copyright (c) 2021, The Regents of the University of
-# California, through Lawrence Berkeley National Laboratory (subject to receipt
-# of any required approvals from the U.S. Dept. of Energy). All rights
-# reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# (1) Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-#
-# (2) Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-#
-# (3) Neither the name of the University of California, Lawrence Berkeley
-# National Laboratory, U.S. Dept. of Energy nor the names of its contributors
-# may be used to endorse or promote products derived from this software
-# without specific prior written permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-# POSSIBILITY OF SUCH DAMAGE.
-#
-# You are under no obligation whatsoever to provide any bug fixes, patches,
-# or upgrades to the features, functionality or performance of the source
-# code ("Enhancements") to anyone; however, if you choose to make your
-# Enhancements available either publicly, or directly to Lawrence Berkeley
-# National Laboratory, without imposing a separate written license agreement
-# for such Enhancements, then you hereby grant the following license: a
-# non-exclusive, royalty-free perpetual license to install, use, modify,
-# prepare derivative works, incorporate into other computer software,
-# distribute, and sublicense such enhancements or derivative works thereof,
-# in binary and source code form.
-#
-# If you have questions about your rights to use or distribute this software,
-# please contact Berkeley Lab's Intellectual Property Office at
-# IPO@lbl.gov.
-#
-# NOTICE.  This Software was developed under funding from the U.S. Department
-# of Energy and the U.S. Government consequently retains certain rights.  As
-# such, the U.S. Government has been granted for itself and others acting on
-# its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
-# Software to reproduce, distribute copies to the public, prepare derivative
-# works, and perform publicly and display publicly, and to permit others to do
-# so.
-#=================================================================================
-
-set(LIB_NAME onemkl_rng_curand)
-set(LIB_OBJ ${LIB_NAME}_obj)
-find_package(cuRAND REQUIRED)
-
-set(SOURCES philox4x32x10.cpp
-  mrg32k3a.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_rng_curand_wrappers.cpp>)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_rng ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${MKL_INCLUDE}
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL ONEMKL::cuRAND::cuRAND)
-target_compile_features(${LIB_OBJ} PUBLIC cxx_std_11)
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/rng/backends/curand/curand_helper.hpp b/src/rng/backends/curand/curand_helper.hpp
deleted file mode 100644
index 3926e6283..000000000
--- a/src/rng/backends/curand/curand_helper.hpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/*******************************************************************************
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-/**
- * @file curand_helper.cpp : contains the implementation of all the routines
- * for CUDA backend
- */
-#ifndef _MKL_RNG_CURAND_HELPER_HPP_
-#define _MKL_RNG_CURAND_HELPER_HPP_
-#include <cuda.h>
-#include <curand.h>
-
-#include <complex>
-
-#include "oneapi/mkl/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace curand {
-
-class curand_error : virtual public std::runtime_error {
-protected:
-    inline const char* curand_error_map(curandStatus_t error) {
-        switch (error) {
-            case CURAND_STATUS_SUCCESS: return "CURAND_STATUS_SUCCESS";
-
-            case CURAND_STATUS_VERSION_MISMATCH: return "CURAND_STATUS_VERSION_MISMATCH";
-
-            case CURAND_STATUS_NOT_INITIALIZED: return "CURAND_STATUS_NOT_INITIALIZED";
-
-            case CURAND_STATUS_ALLOCATION_FAILED: return "CURAND_STATUS_ALLOCATION_FAILED";
-
-            case CURAND_STATUS_TYPE_ERROR: return "CURAND_STATUS_TYPE_ERROR";
-
-            case CURAND_STATUS_OUT_OF_RANGE: return "CURAND_STATUS_OUT_OF_RANGE";
-
-            case CURAND_STATUS_LENGTH_NOT_MULTIPLE: return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-
-            case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-                return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-
-            case CURAND_STATUS_LAUNCH_FAILURE: return "CURAND_STATUS_LAUNCH_FAILURE";
-
-            case CURAND_STATUS_PREEXISTING_FAILURE: return "CURAND_STATUS_PREEXISTING_FAILURE";
-
-            case CURAND_STATUS_INITIALIZATION_FAILED: return "CURAND_STATUS_INITIALIZATION_FAILED";
-
-            case CURAND_STATUS_ARCH_MISMATCH: return "CURAND_STATUS_ARCH_MISMATCH";
-
-            case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR";
-
-            default: return "<unknown>";
-        }
-    }
-
-    int error_number; ///< Error number
-public:
-    /** Constructor (C++ STL string, curandStatus_t).
-   *  @param msg The error message
-   *  @param err_num error number
-   */
-    explicit curand_error(std::string message, curandStatus_t result)
-            : std::runtime_error((message + std::string(curand_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~curand_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-class cuda_error : virtual public std::runtime_error {
-protected:
-    inline const char* cuda_error_map(CUresult result) {
-        switch (result) {
-            case CUDA_SUCCESS: return "CUDA_SUCCESS";
-
-            case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED";
-
-            case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
-
-            case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
-
-            case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
-
-            case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
-
-            case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-
-            default: return "<unknown>";
-        }
-    }
-    int error_number; ///< error number
-public:
-    /** Constructor (C++ STL string, CUresult).
-   *  @param msg The error message
-   *  @param err_num Error number
-   */
-    explicit cuda_error(std::string message, CUresult result)
-            : std::runtime_error((message + std::string(cuda_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~cuda_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-#define CUDA_ERROR_FUNC(name, err, ...)                                 \
-    err = name(__VA_ARGS__);                                            \
-    if (err != CUDA_SUCCESS) {                                          \
-        throw cuda_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define CURAND_CALL(func, status, ...)                                       \
-    status = func(__VA_ARGS__);                                              \
-    if (status != CURAND_STATUS_SUCCESS) {                                   \
-        throw curand_error(std::string(#func) + std::string(" : "), status); \
-    }
-
-// Static template functions oneapi::mkl::rng::curand::range_transform_fp for
-// Buffer and USM APIs
-//
-// cuRAND has no built-in functionality to specify a custom range for sampling
-// random numbers; `curandGenerateUniform' generates uniform random numbers on
-// [0, 1). This function is used to convert to range [a, b).
-//
-// Supported types:
-//      float
-//      double
-//
-// Input arguments:
-//      queue - the queue to submit the kernel to
-//      a     - range lower bound (inclusive)
-//      b     - range upper bound (exclusive)
-//      r     - buffer to store transformed random numbers
-template <typename T>
-static inline void range_transform_fp(sycl::queue& queue, T a, T b, std::int64_t n,
-                                      sycl::buffer<T, 1>& r) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-        cgh.parallel_for(n, [=](sycl::id<1> id) { acc[id] = acc[id] * (b - a) + a; });
-    });
-}
-
-template <typename T>
-static inline sycl::event range_transform_fp(sycl::queue& queue, T a, T b, std::int64_t n, T* r,
-                                             sycl::event dependency) {
-    return queue.parallel_for(n, dependency, [=](sycl::id<1> id) { r[id] = r[id] * (b - a) + a; });
-}
-
-template <typename T>
-static inline void range_transform_fp_accurate(sycl::queue& queue, T a, T b, std::int64_t n,
-                                               sycl::buffer<T, 1>& r) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-        cgh.parallel_for(n, [=](sycl::id<1> id) {
-            acc[id] = acc[id] * (b - a) + a;
-            if (acc[id] < a) {
-                acc[id] = a;
-            }
-            else if (acc[id] > b) {
-                acc[id] = b;
-            }
-        });
-    });
-}
-
-template <typename T>
-static inline sycl::event range_transform_fp_accurate(sycl::queue& queue, T a, T b, std::int64_t n,
-                                                      T* r, sycl::event dependency) {
-    return queue.parallel_for(n, dependency, [=](sycl::id<1> id) {
-        r[id] = r[id] * (b - a) + a;
-        if (r[id] < a) {
-            r[id] = a;
-        }
-        else if (r[id] > b) {
-            r[id] = b;
-        }
-    });
-}
-
-// Static template functions oneapi::mkl::rng::curand::range_transform_int for
-// Buffer and USM APIs
-//
-// cuRAND has no built-in functionality to specify a custom range for sampling
-// random numbers; `curandGenerateUniform' generates uniform random numbers on
-// [0, 1). This function is used to convert to range [a, b).
-//
-// Supported types:
-//      std::int32_t
-//      std::uint32_t
-//
-// Input arguments:
-//      queue - the queue to submit the kernel to
-//      a     - range lower bound (inclusive)
-//      b     - range upper bound (exclusive)
-//      r     - buffer to store transformed random numbers
-template <typename T>
-inline void range_transform_int(sycl::queue& queue, T a, T b, std::int64_t n,
-                                sycl::buffer<std::uint32_t, 1>& in, sycl::buffer<T, 1>& out) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc_in = in.template get_access<sycl::access::mode::read>(cgh);
-        auto acc_out = out.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(n, [=](sycl::id<1> id) { acc_out[id] = a + acc_in[id] % (b - a); });
-    });
-}
-
-template <typename T>
-inline sycl::event range_transform_int(sycl::queue& queue, T a, T b, std::int64_t n,
-                                       std::uint32_t* in, T* out, sycl::event dependency) {
-    return queue.parallel_for(n, dependency,
-                              [=](sycl::id<1> id) { out[id] = a + in[id] % (b - a); });
-}
-
-// Static template functions oneapi::mkl::rng::curand::sample_bernoulli for
-// Buffer and USM APIs
-//
-// cuRAND has no built-in functionality to sample from a Bernoulli distribution.
-// The implementation here uses uniformly-generated random numbers and returns
-// the corresponding Bernoulli distribution based on a probability.
-//
-// Supported types:
-//      std::int32_t
-//      std::uint32_t
-//
-// Input arguments:
-//      queue - the queue to submit the kernel to
-//      p     - success probablity of a trial
-//      in    - buffer containing uniformly-generated random numbers
-//      out   - buffer to store Bernoulli
-template <typename T>
-static inline void sample_bernoulli_from_uniform(sycl::queue& queue, float p, std::int64_t n,
-                                                 sycl::buffer<float, 1> in,
-                                                 sycl::buffer<T, 1>& out) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc_in = in.template get_access<sycl::access::mode::read>(cgh);
-        auto acc_out = out.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(n, [=](sycl::id<1> id) { acc_out[id] = acc_in[id] < p; });
-    });
-}
-
-template <typename T>
-static inline sycl::event sample_bernoulli_from_uniform(sycl::queue& queue, float p, std::int64_t n,
-                                                        float* in, T* out) {
-    return queue.parallel_for(n, [=](sycl::id<1> id) { out[id] = in[id] < p; });
-}
-
-} // namespace curand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _MKL_RNG_CURAND_HELPER_HPP_
diff --git a/src/rng/backends/curand/curand_task.hpp b/src/rng/backends/curand/curand_task.hpp
deleted file mode 100644
index adc08b840..000000000
--- a/src/rng/backends/curand/curand_task.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef _MKL_RNG_CURAND_TASK_HPP_
-#define _MKL_RNG_CURAND_TASK_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "curand_helper.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace curand {
-#ifdef __HIPSYCL__
-template <typename H, typename A, typename E, typename F>
-static inline void host_task_internal(H &cgh, A acc, E e, F f) {
-    cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) {
-        curandStatus_t status;
-        CURAND_CALL(curandSetStream, status, e, ih.get_native_queue<sycl::backend::cuda>());
-        auto r_ptr =
-            reinterpret_cast<typename A::value_type *>(ih.get_native_mem<sycl::backend::cuda>(acc));
-        f(r_ptr);
-    });
-}
-
-template <typename H, typename E, typename F>
-static inline void host_task_internal(H &cgh, E e, F f) {
-    cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) {
-        curandStatus_t status;
-        CURAND_CALL(curandSetStream, status, e, ih.get_native_queue<sycl::backend::cuda>());
-        f(ih);
-    });
-}
-#else
-template <typename H, typename A, typename E, typename F>
-static inline void host_task_internal(H &cgh, A acc, E e, F f) {
-    cgh.host_task([=](sycl::interop_handle ih) {
-        curandStatus_t status;
-        auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
-        CURAND_CALL(curandSetStream, status, e, stream);
-        auto r_ptr = reinterpret_cast<typename A::value_type *>(
-            ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc));
-        f(r_ptr);
-    });
-}
-
-template <typename H, typename E, typename F>
-static inline void host_task_internal(H &cgh, E e, F f) {
-    cgh.host_task([=](sycl::interop_handle ih) {
-        curandStatus_t status;
-        auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_cuda>();
-        CURAND_CALL(curandSetStream, status, e, stream);
-        f(ih);
-    });
-}
-#endif
-template <typename H, typename A, typename E, typename F>
-static inline void onemkl_curand_host_task(H &cgh, A acc, E e, F f) {
-    host_task_internal(cgh, acc, e, f);
-}
-
-template <typename H, typename Engine, typename F>
-static inline void onemkl_curand_host_task(H &cgh, Engine e, F f) {
-    host_task_internal(cgh, e, f);
-}
-
-} // namespace curand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif
diff --git a/src/rng/backends/curand/mkl_rng_curand_wrappers.cpp b/src/rng/backends/curand/mkl_rng_curand_wrappers.cpp
deleted file mode 100644
index 393433c81..000000000
--- a/src/rng/backends/curand/mkl_rng_curand_wrappers.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*******************************************************************************
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#include "oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp"
-#include "rng/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT rng_function_table_t mkl_rng_table = {
-    WRAPPER_VERSION, oneapi::mkl::rng::curand::create_philox4x32x10,
-    oneapi::mkl::rng::curand::create_philox4x32x10, oneapi::mkl::rng::curand::create_mrg32k3a,
-    oneapi::mkl::rng::curand::create_mrg32k3a
-};
diff --git a/src/rng/backends/curand/mrg32k3a.cpp b/src/rng/backends/curand/mrg32k3a.cpp
deleted file mode 100644
index dd44f4def..000000000
--- a/src/rng/backends/curand/mrg32k3a.cpp
+++ /dev/null
@@ -1,828 +0,0 @@
-/*******************************************************************************
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#ifndef __HIPSYCL__
-#if __has_include(<sycl/context.hpp>)
-#if __SYCL_COMPILER_VERSION <= 20220930
-#include <sycl/backend/cuda.hpp>
-#endif
-#else
-#include <CL/sycl/backend/cuda.hpp>
-#endif
-#endif
-#include <iostream>
-
-#include "curand_helper.hpp"
-#include "curand_task.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-#include "oneapi/mkl/rng/engines.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace curand {
-
-#if !defined(_WIN64)
-class mrg32k3a_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    mrg32k3a_impl(sycl::queue queue, std::uint32_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        curandStatus_t status;
-        CURAND_CALL(curandCreateGenerator, status, &engine_, CURAND_RNG_PSEUDO_MRG32K3A);
-        CURAND_CALL(curandSetPseudoRandomGeneratorSeed, status, engine_, (unsigned long long)seed);
-    }
-
-    mrg32k3a_impl(sycl::queue queue, std::initializer_list<std::uint32_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine",
-                                         "multi-seed unsupported by cuRAND backend");
-    }
-
-    mrg32k3a_impl(const mrg32k3a_impl* other) : oneapi::mkl::rng::detail::engine_impl(*other) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine",
-                                         "copy construction unsupported by cuRAND backend");
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<std::uint32_t, 1> ib(n);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = ib.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_int(queue_, distr.a(), distr.b(), n, ib, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp_accurate<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp_accurate<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormal, status, engine_, r_ptr, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormalDouble, status, engine_, r_ptr, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormal, status, engine_, r_ptr, n, distr.m(),
-                            distr.s());
-            });
-        });
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormalDouble, status, engine_, r_ptr, n, distr.m(),
-                            distr.s());
-            });
-        });
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, r_ptr, n);
-            });
-        });
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp(queue_, distr.a(), distr.b(), n, r, generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp(queue_, distr.a(), distr.b(), n, r, generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        auto usm_deleter = [this](std::uint32_t* ptr) {
-            sycl::free(ptr, this->queue_);
-        };
-        std::unique_ptr<std::uint32_t, decltype(usm_deleter)> usm_ib(
-            sycl::malloc_device<std::uint32_t>(n, queue_), usm_deleter);
-        std::uint32_t* ib = usm_ib.get();
-        sycl::event::wait_and_throw(dependencies);
-
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, ib, n);
-            });
-        });
-        range_transform_int(queue_, distr.a(), distr.b(), n, ib, r, generate_event)
-            .wait_and_throw();
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp_accurate<float>(queue_, distr.a(), distr.b(), n, r,
-                                                  generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp_accurate(queue_, distr.a(), distr.b(), n, r, generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormal, status, engine_, r, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormalDouble, status, engine_, r, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormal, status, engine_, r, n, distr.m(), distr.s());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormalDouble, status, engine_, r, n, distr.m(),
-                            distr.s());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, r, n);
-            });
-        });
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new mrg32k3a_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        curandStatus_t status;
-        CURAND_CALL(curandSetGeneratorOffset, status, engine_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "skip_ahead",
-                                         "initializer list unsupported by cuRAND backend");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog", "unsupported by cuRAND backend");
-    }
-
-    virtual ~mrg32k3a_impl() override {
-        curandDestroyGenerator(engine_);
-    }
-
-private:
-    curandGenerator_t engine_;
-    std::uint32_t seed_;
-};
-#else // cuRAND backend is currently not supported on Windows
-class mrg32k3a_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    mrg32k3a_impl(sycl::queue queue, std::uint32_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    mrg32k3a_impl(sycl::queue queue, std::initializer_list<std::uint32_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    mrg32k3a_impl(const mrg32k3a_impl* other) : oneapi::mkl::rng::detail::engine_impl(*other) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return nullptr;
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual ~mrg32k3a_impl() override {}
-};
-#endif
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue, std::uint32_t seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                       std::initializer_list<std::uint32_t> seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-} // namespace curand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/curand/philox4x32x10.cpp b/src/rng/backends/curand/philox4x32x10.cpp
deleted file mode 100644
index c3d4393d2..000000000
--- a/src/rng/backends/curand/philox4x32x10.cpp
+++ /dev/null
@@ -1,850 +0,0 @@
-/*******************************************************************************
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#ifndef __HIPSYCL__
-#if __has_include(<sycl/context.hpp>)
-#if __SYCL_COMPILER_VERSION <= 20220930
-#include <sycl/backend/cuda.hpp>
-#endif
-#else
-#include <CL/sycl/backend/cuda.hpp>
-#endif
-#endif
-#include <iostream>
-
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-// #include "oneapi/mkl/rng/engines.hpp"
-#include "curand_helper.hpp"
-#include "curand_task.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/curand/onemkl_rng_curand.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace curand {
-
-#if !defined(_WIN64)
-/*
- * Note that cuRAND consists of two pieces: a host (CPU) API and a device (GPU)
- * API. The host API acts like any standard library; the `curand.h' header is
- * included and the functions can be called as usual. The generator is
- * instantiated on the host and random numbers can be generated on either the
- * host CPU or device. For device-side generation, calls to the library happen
- * on the host, but the actual work of RNG is done on the device. In this case,
- * the resulting random numbers are stored in global memory on the device. These
- * random numbers can then be used in other kernels or be copied back to the
- * host for further processing. For host-side generation, everything is done on
- * the host, and the random numbers are stored in host memory.
- *
- * The second piece is the device header, `curand_kernel.h'. Using this file
- * permits setting up random number generator states and generating sequences of
- * random numbers. This allows random numbers to be generated and immediately
- * consumed in other kernels without requiring the random numbers to be written
- * to, and read from, global memory.
- *
- * Here we utilize the host API since this is most aligned with how oneMKL
- * generates random numbers.
- *
- */
-class philox4x32x10_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    philox4x32x10_impl(sycl::queue queue, std::uint64_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        curandStatus_t status;
-        CURAND_CALL(curandCreateGenerator, status, &engine_, CURAND_RNG_PSEUDO_PHILOX4_32_10);
-        CURAND_CALL(curandSetPseudoRandomGeneratorSeed, status, engine_, (unsigned long long)seed);
-    }
-
-    philox4x32x10_impl(sycl::queue queue, std::initializer_list<std::uint64_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine",
-                                         "multi-seed unsupported by cuRAND backend");
-    }
-
-    philox4x32x10_impl(const philox4x32x10_impl* other)
-            : oneapi::mkl::rng::detail::engine_impl(*other) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine",
-                                         "copy construction unsupported by cuRAND backend");
-    }
-
-    // Buffers API
-
-    virtual inline void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<std::uint32_t, 1> ib(n);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = ib.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_int(queue_, distr.a(), distr.b(), n, ib, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp_accurate(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r_ptr, n);
-            });
-        });
-        range_transform_fp_accurate(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormal, status, engine_, r_ptr, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormalDouble, status, engine_, r_ptr, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormal, status, engine_, r_ptr, n, distr.m(),
-                            distr.s());
-            });
-        });
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormalDouble, status, engine_, r_ptr, n, distr.m(),
-                            distr.s());
-            });
-        });
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-            onemkl_curand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, r_ptr, n);
-            });
-        });
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp(queue_, distr.a(), distr.b(), n, r, generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp(queue_, distr.a(), distr.b(), n, r, generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        auto usm_deleter = [this](std::uint32_t* ptr) {
-            sycl::free(ptr, this->queue_);
-        };
-        std::unique_ptr<std::uint32_t, decltype(usm_deleter)> usm_ib(
-            sycl::malloc_device<std::uint32_t>(n, queue_), usm_deleter);
-        std::uint32_t* ib = usm_ib.get();
-        sycl::event::wait_and_throw(dependencies);
-
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, ib, n);
-            });
-        });
-        range_transform_int(queue_, distr.a(), distr.b(), n, ib, r, generate_event)
-            .wait_and_throw();
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniform, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp_accurate(queue_, distr.a(), distr.b(), n, r, generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        sycl::event generate_event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateUniformDouble, status, engine_, r, n);
-            });
-        });
-        return range_transform_fp_accurate(queue_, distr.a(), distr.b(), n, r, generate_event);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormal, status, engine_, r, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateNormalDouble, status, engine_, r, n, distr.mean(),
-                            distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormal, status, engine_, r, n, distr.m(), distr.s());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerateLogNormalDouble, status, engine_, r, n, distr.m(),
-                            distr.s());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "ICDF method not used for pseudorandom generators in cuRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            onemkl_curand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                curandStatus_t status;
-                CURAND_CALL(curandGenerate, status, engine_, r, n);
-            });
-        });
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new philox4x32x10_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        curandStatus_t status;
-        CURAND_CALL(curandSetGeneratorOffset, status, engine_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "skip_ahead",
-                                         "initializer list unsupported by cuRAND backend");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog", "unsupported by cuRAND backend");
-    }
-
-    virtual ~philox4x32x10_impl() override {
-        curandDestroyGenerator(engine_);
-    }
-
-private:
-    curandGenerator_t engine_;
-};
-#else // cuRAND backend is currently not supported on Windows
-class philox4x32x10_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    philox4x32x10_impl(sycl::queue queue, std::uint64_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    philox4x32x10_impl(sycl::queue queue, std::initializer_list<std::uint64_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    philox4x32x10_impl(const philox4x32x10_impl* other)
-            : oneapi::mkl::rng::detail::engine_impl(*other) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return nullptr;
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual ~philox4x32x10_impl() override {}
-};
-#endif
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue, std::uint64_t seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-} // namespace curand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/mklcpu/CMakeLists.txt b/src/rng/backends/mklcpu/CMakeLists.txt
deleted file mode 100644
index e72ce048f..000000000
--- a/src/rng/backends/mklcpu/CMakeLists.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_rng_mklcpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-set(SOURCES cpu_common.hpp
-  philox4x32x10.cpp
-  mrg32k3a.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_rng_cpu_wrappers.cpp>
-)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_rng ${LIB_NAME})
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL)
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-# Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/rng/backends/mklcpu/cpu_common.hpp b/src/rng/backends/mklcpu/cpu_common.hpp
deleted file mode 100644
index cbd6cae59..000000000
--- a/src/rng/backends/mklcpu/cpu_common.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_CPU_COMMON_HPP_
-#define _RNG_CPU_COMMON_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace mklcpu {
-
-// host_task automatically uses run_on_host_intel if it is supported by the
-//  compiler. Otherwise, it falls back to single_task.
-template <typename K, typename H, typename F>
-static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.host_task(f)) {
-    return cgh.host_task(f);
-}
-
-template <typename K, typename H, typename F>
-static inline void host_task_internal(H &cgh, F f, long) {
-#ifndef __SYCL_DEVICE_ONLY__
-    cgh.template single_task<K>(f);
-#endif
-}
-
-template <typename K, typename H, typename F>
-static inline void host_task(H &cgh, F f) {
-    (void)host_task_internal<K>(cgh, f, 0);
-}
-
-template <typename Engine, typename Distr>
-class kernel_name {};
-
-template <typename Engine, typename Distr>
-class kernel_name_usm {};
-
-template <typename Acc>
-typename Acc::value_type *get_raw_ptr(Acc acc) {
-// Workaround for AdaptiveCPP, as they do not yet support the get_multi_ptr function
-#ifndef __HIPSYCL__
-    return acc.template get_multi_ptr<sycl::access::decorated::no>().get_raw();
-#else
-    return acc.get_pointer();
-#endif
-}
-
-} // namespace mklcpu
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_RNG_CPU_COMMON_HPP_
diff --git a/src/rng/backends/mklcpu/mkl_rng_cpu_wrappers.cpp b/src/rng/backends/mklcpu/mkl_rng_cpu_wrappers.cpp
deleted file mode 100644
index 840205db7..000000000
--- a/src/rng/backends/mklcpu/mkl_rng_cpu_wrappers.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "rng/function_table.hpp"
-#include "oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT rng_function_table_t mkl_rng_table = {
-    WRAPPER_VERSION, oneapi::mkl::rng::mklcpu::create_philox4x32x10,
-    oneapi::mkl::rng::mklcpu::create_philox4x32x10, oneapi::mkl::rng::mklcpu::create_mrg32k3a,
-    oneapi::mkl::rng::mklcpu::create_mrg32k3a
-};
diff --git a/src/rng/backends/mklcpu/mrg32k3a.cpp b/src/rng/backends/mklcpu/mrg32k3a.cpp
deleted file mode 100644
index cc234de45..000000000
--- a/src/rng/backends/mklcpu/mrg32k3a.cpp
+++ /dev/null
@@ -1,585 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <iostream>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "mkl_vsl.h"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-#include "oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp"
-
-#include "cpu_common.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace mklcpu {
-
-class mrg32k3a_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    mrg32k3a_impl(sycl::queue queue, std::uint32_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        vslNewStream(&stream_, VSL_BRNG_MRG32K3A, seed);
-        state_size_ = vslGetStreamSize(stream_);
-    }
-
-    mrg32k3a_impl(sycl::queue queue, std::initializer_list<std::uint32_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        vslNewStreamEx(&stream_, VSL_BRNG_MRG32K3A, 2 * seed.size(),
-                       reinterpret_cast<const std::uint32_t*>(seed.begin()));
-        state_size_ = vslGetStreamSize(stream_);
-    }
-
-    mrg32k3a_impl(const mrg32k3a_impl* other) : oneapi::mkl::rng::detail::engine_impl(*other) {
-        vslCopyStream(&stream_, other->stream_);
-        state_size_ = vslGetStreamSize(stream_);
-    }
-
-    // Buffers APIs
-
-    virtual void generate(const uniform<float, uniform_method::standard>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<double, uniform_method::standard>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<std::int32_t, uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngUniform(VSL_RNG_METHOD_UNIFORM_STD,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<float, uniform_method::accurate>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<double, uniform_method::accurate>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<float, gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<double, gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<float, gaussian_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<double, gaussian_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<float, lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<double, lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<float, lognormal_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<double, lognormal_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.p());
-            });
-        });
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                std::uint32_t* r_ptr = get_raw_ptr(acc_r);
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               reinterpret_cast<std::int32_t*>(r_ptr), distr.p());
-            });
-        });
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.lambda());
-            });
-        });
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                std::uint32_t* r_ptr = get_raw_ptr(acc_r);
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             reinterpret_cast<std::int32_t*>(r_ptr), distr.lambda());
-            });
-        });
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngUniformBits(VSL_RNG_METHOD_UNIFORMBITS_STD,
-                                 static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                                 get_raw_ptr(acc_r));
-            });
-        });
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(const uniform<float, uniform_method::standard>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<double, uniform_method::standard>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<std::int32_t, uniform_method::standard>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<float, uniform_method::accurate>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, n, r, distr.a(),
-                             distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<double, uniform_method::accurate>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, n, r, distr.a(),
-                             distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<float, gaussian_method::box_muller2>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<double, gaussian_method::box_muller2>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<float, gaussian_method::icdf>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<double, gaussian_method::icdf>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<float, lognormal_method::box_muller2>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2, stream, n, r, distr.m(),
-                               distr.s(), distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<double, lognormal_method::box_muller2>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2, stream, n, r, distr.m(),
-                               distr.s(), distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<float, lognormal_method::icdf>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF, stream, n, r, distr.m(), distr.s(),
-                               distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<double, lognormal_method::icdf>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF, stream, n, r, distr.m(), distr.s(),
-                               distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, n, r, distr.p());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, n,
-                               reinterpret_cast<int32_t*>(r), distr.p());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM, stream, n, r, distr.lambda());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(cgh, [=]() {
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM, stream, n,
-                             reinterpret_cast<int32_t*>(r), distr.lambda());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<mrg32k3a_impl, decltype(distr)>>(
-                cgh, [=]() { viRngUniformBits(VSL_RNG_METHOD_UNIFORMBITS_STD, stream, n, r); });
-        });
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new mrg32k3a_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        vslSkipAheadStream(stream_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        vslSkipAheadStreamEx(stream_, num_to_skip.size(), (unsigned long long*)num_to_skip.begin());
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog");
-    }
-
-    virtual ~mrg32k3a_impl() override {
-        vslDeleteStream(&stream_);
-    }
-
-private:
-    VSLStreamStatePtr stream_;
-    std::int32_t state_size_;
-};
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue, std::uint32_t seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                       std::initializer_list<std::uint32_t> seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-} // namespace mklcpu
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/mklcpu/philox4x32x10.cpp b/src/rng/backends/mklcpu/philox4x32x10.cpp
deleted file mode 100644
index 3f8e5e89b..000000000
--- a/src/rng/backends/mklcpu/philox4x32x10.cpp
+++ /dev/null
@@ -1,587 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <iostream>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "mkl_vsl.h"
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-#include "oneapi/mkl/rng/detail/mklcpu/onemkl_rng_mklcpu.hpp"
-
-#include "cpu_common.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace mklcpu {
-
-class philox4x32x10_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    philox4x32x10_impl(sycl::queue queue, std::uint64_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        vslNewStreamEx(&stream_, VSL_BRNG_PHILOX4X32X10, 2,
-                       reinterpret_cast<std::uint32_t*>(&seed));
-        state_size_ = vslGetStreamSize(stream_);
-    }
-
-    philox4x32x10_impl(sycl::queue queue, std::initializer_list<std::uint64_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        vslNewStreamEx(&stream_, VSL_BRNG_PHILOX4X32X10, 2 * seed.size(),
-                       reinterpret_cast<const std::uint32_t*>(seed.begin()));
-        state_size_ = vslGetStreamSize(stream_);
-    }
-
-    philox4x32x10_impl(const philox4x32x10_impl* other)
-            : oneapi::mkl::rng::detail::engine_impl(*other) {
-        vslCopyStream(&stream_, other->stream_);
-        state_size_ = vslGetStreamSize(stream_);
-    }
-
-    // Buffers APIs
-
-    virtual void generate(const uniform<float, uniform_method::standard>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<double, uniform_method::standard>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<std::int32_t, uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngUniform(VSL_RNG_METHOD_UNIFORM_STD,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<float, uniform_method::accurate>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const uniform<double, uniform_method::accurate>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<float, gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<double, gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<float, gaussian_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const gaussian<double, gaussian_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF,
-                              static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                              get_raw_ptr(acc_r), distr.mean(), distr.stddev());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<float, lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<double, lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<float, lognormal_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<float, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const lognormal<double, lognormal_method::icdf>& distr, std::int64_t n,
-                          sycl::buffer<double, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.m(), distr.s(), distr.displ(),
-                               distr.scale());
-            });
-        });
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               get_raw_ptr(acc_r), distr.p());
-            });
-        });
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                std::uint32_t* r_ptr = get_raw_ptr(acc_r);
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF,
-                               static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                               reinterpret_cast<std::int32_t*>(r_ptr), distr.p());
-            });
-        });
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             get_raw_ptr(acc_r), distr.lambda());
-            });
-        });
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                std::uint32_t* r_ptr = get_raw_ptr(acc_r);
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM,
-                             static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                             reinterpret_cast<std::int32_t*>(r_ptr), distr.lambda());
-            });
-        });
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        sycl::buffer<char, 1> stream_buf(static_cast<char*>(stream_), state_size_);
-        queue_.submit([&](sycl::handler& cgh) {
-            auto acc_stream = stream_buf.get_access<sycl::access::mode::read_write>(cgh);
-            auto acc_r = r.get_access<sycl::access::mode::read_write>(cgh);
-            host_task<kernel_name<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngUniformBits(VSL_RNG_METHOD_UNIFORMBITS_STD,
-                                 static_cast<VSLStreamStatePtr>(get_raw_ptr(acc_stream)), n,
-                                 get_raw_ptr(acc_r));
-            });
-        });
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(const uniform<float, uniform_method::standard>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<double, uniform_method::standard>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<std::int32_t, uniform_method::standard>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, distr.a(), distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<float, uniform_method::accurate>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, n, r, distr.a(),
-                             distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const uniform<double, uniform_method::accurate>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, stream, n, r, distr.a(),
-                             distr.b());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<float, gaussian_method::box_muller2>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<double, gaussian_method::box_muller2>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<float, gaussian_method::icdf>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const gaussian<double, gaussian_method::icdf>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, stream, n, r, distr.mean(),
-                              distr.stddev());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<float, lognormal_method::box_muller2>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2, stream, n, r, distr.m(),
-                               distr.s(), distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<double, lognormal_method::box_muller2>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_BOXMULLER2, stream, n, r, distr.m(),
-                               distr.s(), distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<float, lognormal_method::icdf>& distr,
-                                 std::int64_t n, float* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vsRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF, stream, n, r, distr.m(), distr.s(),
-                               distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const lognormal<double, lognormal_method::icdf>& distr,
-                                 std::int64_t n, double* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                vdRngLognormal(VSL_RNG_METHOD_LOGNORMAL_ICDF, stream, n, r, distr.m(), distr.s(),
-                               distr.displ(), distr.scale());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, n, r, distr.p());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, n,
-                               reinterpret_cast<int32_t*>(r), distr.p());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM, stream, n, r, distr.lambda());
-            });
-        });
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(cgh, [=]() {
-                viRngPoisson(VSL_RNG_METHOD_POISSON_POISNORM, stream, n,
-                             reinterpret_cast<int32_t*>(r), distr.lambda());
-            });
-        });
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        return queue_.submit([&](sycl::handler& cgh) {
-            VSLStreamStatePtr stream = stream_;
-            host_task<kernel_name_usm<philox4x32x10_impl, decltype(distr)>>(
-                cgh, [=]() { viRngUniformBits(VSL_RNG_METHOD_UNIFORMBITS_STD, stream, n, r); });
-        });
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new philox4x32x10_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        vslSkipAheadStream(stream_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        vslSkipAheadStreamEx(stream_, num_to_skip.size(), (unsigned long long*)num_to_skip.begin());
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog");
-    }
-
-    virtual ~philox4x32x10_impl() override {
-        vslDeleteStream(&stream_);
-    }
-
-private:
-    VSLStreamStatePtr stream_;
-    std::int32_t state_size_;
-};
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue, std::uint64_t seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-} // namespace mklcpu
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/mklgpu/CMakeLists.txt b/src/rng/backends/mklgpu/CMakeLists.txt
deleted file mode 100644
index 150f90136..000000000
--- a/src/rng/backends/mklgpu/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_rng_mklgpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  mkl_internal_rng_gpu.hpp
-  philox4x32x10.cpp
-  mrg32k3a.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mkl_rng_gpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_rng ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::RNG)
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_SYCL::RNG)
-else()
-  target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL MKL::MKL_DPCPP)
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-# Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/rng/backends/mklgpu/mkl_internal_rng_gpu.hpp b/src/rng/backends/mklgpu/mkl_internal_rng_gpu.hpp
deleted file mode 100755
index 5ca480ba5..000000000
--- a/src/rng/backends/mklgpu/mkl_internal_rng_gpu.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _MKL_INTERNAL_RNG_GPU_HPP_
-#define _MKL_INTERNAL_RNG_GPU_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace detail {
-
-template <typename EngineType>
-class engine_base_impl;
-
-namespace gpu {
-
-template <typename EngineType>
-engine_base_impl<EngineType>* create_engine(sycl::queue& queue, std::uint64_t seed);
-
-template <typename EngineType>
-engine_base_impl<EngineType>* create_engine(sycl::queue& queue, std::int64_t n,
-                                            const unsigned int* seed_ptr);
-
-template <typename EngineType>
-engine_base_impl<EngineType>* create_engine(sycl::queue& queue,
-                                            engine_base_impl<EngineType>* other_impl);
-
-template <typename EngineType>
-void skip_ahead(sycl::queue& queue, engine_base_impl<EngineType>* impl, std::uint64_t num_to_skip);
-
-template <typename EngineType>
-void skip_ahead(sycl::queue& queue, engine_base_impl<EngineType>* impl,
-                std::initializer_list<std::uint64_t> num_to_skip);
-
-template <typename EngineType>
-void leapfrog(sycl::queue& queue, engine_base_impl<EngineType>* impl, std::uint64_t idx,
-              std::uint64_t stride);
-
-template <typename EngineType>
-void delete_engine(sycl::queue& queue, engine_base_impl<EngineType>* impl);
-
-template <typename EngineType, typename DistrType>
-sycl::event generate(sycl::queue& queue, const DistrType& distr,
-                     engine_base_impl<EngineType>* engine, std::int64_t n,
-                     sycl::buffer<typename DistrType::result_type, 1>& r);
-
-template <typename EngineType, typename DistrType>
-sycl::event generate(sycl::queue& queue, const DistrType& distr,
-                     engine_base_impl<EngineType>* engine, std::int64_t n,
-                     typename DistrType::result_type* r,
-                     const std::vector<sycl::event>& dependencies = {});
-
-} // namespace gpu
-} // namespace detail
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif //_MKL_INTERNAL_RNG_GPU_HPP_
diff --git a/src/rng/backends/mklgpu/mkl_rng_gpu_wrappers.cpp b/src/rng/backends/mklgpu/mkl_rng_gpu_wrappers.cpp
deleted file mode 100644
index 6754b2749..000000000
--- a/src/rng/backends/mklgpu/mkl_rng_gpu_wrappers.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "rng/function_table.hpp"
-#include "oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT rng_function_table_t mkl_rng_table = {
-    WRAPPER_VERSION, oneapi::mkl::rng::mklgpu::create_philox4x32x10,
-    oneapi::mkl::rng::mklgpu::create_philox4x32x10, oneapi::mkl::rng::mklgpu::create_mrg32k3a,
-    oneapi::mkl::rng::mklgpu::create_mrg32k3a
-};
diff --git a/src/rng/backends/mklgpu/mrg32k3a.cpp b/src/rng/backends/mklgpu/mrg32k3a.cpp
deleted file mode 100644
index 05d24e1a4..000000000
--- a/src/rng/backends/mklgpu/mrg32k3a.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <iostream>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "mkl_version.h"
-
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-#include "oneapi/mkl/rng/engines.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp"
-
-#include "mkl_internal_rng_gpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace mklgpu {
-
-class mrg32k3a_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    mrg32k3a_impl(sycl::queue queue, std::uint32_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        engine_ =
-            oneapi::mkl::rng::detail::gpu::create_engine<oneapi::mkl::rng::mrg32k3a>(queue, seed);
-    }
-
-    mrg32k3a_impl(sycl::queue queue, std::initializer_list<std::uint32_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        engine_ = oneapi::mkl::rng::detail::gpu::create_engine<oneapi::mkl::rng::mrg32k3a>(
-            queue, (std::int64_t)(seed.size()), (const unsigned int*)seed.begin());
-    }
-
-    mrg32k3a_impl(const mrg32k3a_impl* other) : oneapi::mkl::rng::detail::engine_impl(*other) {
-        sycl::queue queue(other->queue_);
-        engine_ = oneapi::mkl::rng::detail::gpu::create_engine<oneapi::mkl::rng::mrg32k3a>(
-            queue, other->engine_);
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        ;
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new mrg32k3a_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        oneapi::mkl::rng::detail::gpu::skip_ahead(queue_, engine_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        oneapi::mkl::rng::detail::gpu::skip_ahead(queue_, engine_, num_to_skip);
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog");
-    }
-
-    virtual ~mrg32k3a_impl() override {
-        oneapi::mkl::rng::detail::gpu::delete_engine(queue_, engine_);
-    }
-
-private:
-    oneapi::mkl::rng::detail::engine_base_impl<oneapi::mkl::rng::mrg32k3a>* engine_;
-};
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue, std::uint32_t seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                       std::initializer_list<std::uint32_t> seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-} // namespace mklgpu
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/mklgpu/philox4x32x10.cpp b/src/rng/backends/mklgpu/philox4x32x10.cpp
deleted file mode 100644
index bcf869c61..000000000
--- a/src/rng/backends/mklgpu/philox4x32x10.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <iostream>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "mkl_version.h"
-
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-#include "oneapi/mkl/rng/engines.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/mklgpu/onemkl_rng_mklgpu.hpp"
-
-#include "mkl_internal_rng_gpu.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace mklgpu {
-
-class philox4x32x10_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    philox4x32x10_impl(sycl::queue queue, std::uint64_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        engine_ = oneapi::mkl::rng::detail::gpu::create_engine<oneapi::mkl::rng::philox4x32x10>(
-            queue, seed);
-    }
-
-    philox4x32x10_impl(sycl::queue queue, std::initializer_list<std::uint64_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        engine_ = oneapi::mkl::rng::detail::gpu::create_engine<oneapi::mkl::rng::philox4x32x10>(
-            queue, (std::int64_t)(seed.size() * 2), (const unsigned int*)seed.begin());
-    }
-
-    philox4x32x10_impl(const philox4x32x10_impl* other)
-            : oneapi::mkl::rng::detail::engine_impl(*other) {
-        sycl::queue queue(other->queue_);
-        engine_ = oneapi::mkl::rng::detail::gpu::create_engine<oneapi::mkl::rng::philox4x32x10>(
-            queue, other->engine_);
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r);
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        ;
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        return oneapi::mkl::rng::detail::gpu::generate(queue_, distr, engine_, n, r, dependencies);
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new philox4x32x10_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        oneapi::mkl::rng::detail::gpu::skip_ahead(queue_, engine_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        oneapi::mkl::rng::detail::gpu::skip_ahead(queue_, engine_, num_to_skip);
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog");
-    }
-
-    virtual ~philox4x32x10_impl() override {
-        oneapi::mkl::rng::detail::gpu::delete_engine(queue_, engine_);
-    }
-
-private:
-    oneapi::mkl::rng::detail::engine_base_impl<oneapi::mkl::rng::philox4x32x10>* engine_;
-};
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue, std::uint64_t seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-} // namespace mklgpu
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/rocrand/CMakeLists.txt b/src/rng/backends/rocrand/CMakeLists.txt
deleted file mode 100644
index 47929703b..000000000
--- a/src/rng/backends/rocrand/CMakeLists.txt
+++ /dev/null
@@ -1,94 +0,0 @@
-# =================================================================================
-# Copyright (C) 2022 Heidelberg University, Engineering Mathematics and
-# Computing Lab (EMCL) and Computing Centre (URZ) cuRAND back-end Copyright (c)
-# 2021, The Regents of the University of California, through Lawrence Berkeley
-# National Laboratory (subject to receipt of any required approvals from the
-# U.S. Dept. of Energy). All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# (1) Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-#
-# (2) Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# (3) Neither the name of the University of California, Lawrence Berkeley
-# National Laboratory, U.S. Dept. of Energy nor the names of its contributors
-# may be used to endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# You are under no obligation whatsoever to provide any bug fixes, patches, or
-# upgrades to the features, functionality or performance of the source code
-# ("Enhancements") to anyone; however, if you choose to make your Enhancements
-# available either publicly, or directly to Lawrence Berkeley National
-# Laboratory, without imposing a separate written license agreement for such
-# Enhancements, then you hereby grant the following license: a non-exclusive,
-# royalty-free perpetual license to install, use, modify, prepare derivative
-# works, incorporate into other computer software, distribute, and sublicense
-# such enhancements or derivative works thereof, in binary and source code form.
-#
-# If you have questions about your rights to use or distribute this software,
-# please contact Berkeley Lab's Intellectual Property Office at IPO@lbl.gov.
-#
-# NOTICE.  This Software was developed under funding from the U.S. Department of
-# Energy and the U.S. Government consequently retains certain rights.  As such,
-# the U.S. Government has been granted for itself and others acting on its
-# behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software
-# to reproduce, distribute copies to the public, prepare derivative works, and
-# perform publicly and display publicly, and to permit others to do so.
-# =================================================================================
-
-set(LIB_NAME onemkl_rng_rocrand)
-set(LIB_OBJ ${LIB_NAME}_obj)
-find_package(hip REQUIRED)
-find_package(rocrand REQUIRED)
-find_package(Threads REQUIRED)
-
-set(SOURCES philox4x32x10.cpp mrg32k3a.cpp $<$<BOOL:${BUILD_SHARED_LIBS}>:
-            mkl_rng_rocrand_wrappers.cpp>)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT ${SOURCES})
-add_dependencies(onemkl_backend_libs_rng ${LIB_NAME})
-
-target_include_directories(
-  ${LIB_OBJ} PRIVATE ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src
-                     ${CMAKE_BINARY_DIR}/bin ${MKL_INCLUDE} ${ONEMKL_GENERATED_INCLUDE_PATH})
-
-target_link_libraries(${LIB_OBJ} PRIVATE roc::rocrand hip::host Threads::Threads)
-target_link_libraries(${LIB_OBJ} PUBLIC ONEMKL::SYCL::SYCL)
-target_compile_features(${LIB_OBJ} PUBLIC cxx_std_11)
-set_target_properties(${LIB_OBJ} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-if(USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
-endif()
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES SOVERSION ${PROJECT_VERSION_MAJOR})
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(
-  TARGETS ${LIB_NAME}
-  EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib)
diff --git a/src/rng/backends/rocrand/mkl_rng_rocrand_wrappers.cpp b/src/rng/backends/rocrand/mkl_rng_rocrand_wrappers.cpp
deleted file mode 100644
index 5450f47b4..000000000
--- a/src/rng/backends/rocrand/mkl_rng_rocrand_wrappers.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) 
- * and Computing Centre (URZ)
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#include "oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp"
-#include "rng/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-
-extern "C" ONEMKL_EXPORT rng_function_table_t mkl_rng_table = {
-    WRAPPER_VERSION, oneapi::mkl::rng::rocrand::create_philox4x32x10,
-    oneapi::mkl::rng::rocrand::create_philox4x32x10, oneapi::mkl::rng::rocrand::create_mrg32k3a,
-    oneapi::mkl::rng::rocrand::create_mrg32k3a
-};
diff --git a/src/rng/backends/rocrand/mrg32k3a.cpp b/src/rng/backends/rocrand/mrg32k3a.cpp
deleted file mode 100644
index 424f14caf..000000000
--- a/src/rng/backends/rocrand/mrg32k3a.cpp
+++ /dev/null
@@ -1,1026 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) 
- * and Computing Centre (URZ)
- * rocRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#ifndef __HIPSYCL__
-#if __has_include(<sycl/backend/cuda.hpp>)
-#include <sycl/backend/cuda.hpp>
-#else
-#include <CL/sycl/backend/cuda.hpp>
-#endif
-#endif
-#include <iostream>
-
-#include "rocrand_helper.hpp"
-#include "rocrand_task.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-#include "oneapi/mkl/rng/engines.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace rocrand {
-
-#if !defined(_WIN64)
-class mrg32k3a_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    mrg32k3a_impl(sycl::queue queue, std::uint32_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue),
-              seed_(seed),
-              offset_(0) {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_create_generator, status, &engine_, ROCRAND_RNG_PSEUDO_MRG32K3A);
-        ROCRAND_CALL(rocrand_set_seed, status, engine_, (unsigned long long)seed);
-    }
-
-    mrg32k3a_impl(sycl::queue queue, std::initializer_list<std::uint32_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine",
-                                         "multi-seed unsupported by rocRAND backend");
-    }
-
-    mrg32k3a_impl(const mrg32k3a_impl* other)
-            : oneapi::mkl::rng::detail::engine_impl(*other),
-              seed_(other->seed_),
-              offset_(other->offset_) {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_create_generator, status, &engine_, ROCRAND_RNG_PSEUDO_MRG32K3A);
-        ROCRAND_CALL(rocrand_set_seed, status, engine_, (unsigned long long)seed_);
-
-        // Allign this->engine_'s offset state with other->engine_'s offset
-        skip_ahead(offset_);
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<std::uint32_t, 1> ib(n);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = ib.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_int<std::int32_t>(queue_, distr.a(), distr.b(), n, ib, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp_accurate<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp_accurate<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal, status, engine_, r_ptr, n, distr.mean(),
-                                 distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r_ptr, n,
-                                 distr.mean(), distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal, status, engine_, r_ptr, n, distr.mean(),
-                                 distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r_ptr, n,
-                                 distr.mean(), distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r_ptr, n, distr.m(),
-                                 distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r_ptr, n,
-                                 distr.m(), distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r_ptr, n, distr.m(),
-                                 distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r_ptr, n,
-                                 distr.m(), distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](std::int32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_poisson, status, engine_, (std::uint32_t*)r_ptr,
-                                 n, distr.lambda());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_poisson, status, engine_, r_ptr, n,
-                                 distr.lambda());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](uint32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        std::uint32_t* ib = (std::uint32_t*)malloc_device(
-            n * sizeof(std::uint32_t), queue_.get_device(), queue_.get_context());
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate, status, engine_, ib, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_int(queue_, distr.a(), distr.b(), n, ib, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp_accurate<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp_accurate<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "mrg32ka engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_poisson, status, engine_, (std::uint32_t*)r, n,
-                             distr.lambda());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_poisson, status, engine_, r, n, distr.lambda());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate, status, engine_, r, n);
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new mrg32k3a_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_set_offset, status, engine_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "skip_ahead",
-                                         "initializer list unsupported by rocRAND backend");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog", "unsupported by rocRAND backend");
-    }
-
-    virtual ~mrg32k3a_impl() override {
-        rocrand_destroy_generator(engine_);
-    }
-
-private:
-    rocrand_generator engine_;
-    std::uint32_t seed_;
-    std::uint64_t offset_;
-
-    void increment_internal_offset(std::uint64_t n) {
-        offset_ += n;
-    }
-};
-#else // rocRAND backend is currently not supported on Windows
-class mrg32k3a_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    mrg32k3a_impl(sycl::queue queue, std::uint32_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    mrg32k3a_impl(sycl::queue queue, std::initializer_list<std::uint32_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    mrg32k3a_impl(const mrg32k3a_impl* other) : oneapi::mkl::rng::detail::engine_impl(*other) {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return sycl::event{};
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-        return nullptr;
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "mrg32ka engine");
-    }
-
-    virtual ~mrg32k3a_impl() override {}
-};
-#endif
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue, std::uint32_t seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_mrg32k3a(sycl::queue queue,
-                                                       std::initializer_list<std::uint32_t> seed) {
-    return new mrg32k3a_impl(queue, seed);
-}
-
-} // namespace rocrand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/rocrand/philox4x32x10.cpp b/src/rng/backends/rocrand/philox4x32x10.cpp
deleted file mode 100644
index 5bc241360..000000000
--- a/src/rng/backends/rocrand/philox4x32x10.cpp
+++ /dev/null
@@ -1,1048 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) 
- * and Computing Centre (URZ)
- * rocRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#ifndef __HIPSYCL__
-#if __has_include(<sycl/backend/cuda.hpp>)
-#include <sycl/backend/cuda.hpp>
-#else
-#include <CL/sycl/backend/cuda.hpp>
-#endif
-#endif
-#include <iostream>
-
-#include "rocrand_helper.hpp"
-#include "rocrand_task.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/rng/detail/rocrand/onemkl_rng_rocrand.hpp"
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-// #include "oneapi/mkl/rng/engines.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace rocrand {
-
-#if !defined(_WIN64)
-/*
- * Note that rocRAND consists of two pieces: a host (CPU) API and a device (GPU)
- * API. The host API acts like any standard library; the `rocrand.h' header is
- * included and the functions can be called as usual. The generator is
- * instantiated on the host and random numbers can be generated on either the
- * host CPU or device. For device-side generation, calls to the library happen
- * on the host, but the actual work of RNG is done on the device. In this case,
- * the resulting random numbers are stored in global memory on the device. These
- * random numbers can then be used in other kernels or be copied back to the
- * host for further processing. For host-side generation, everything is done on
- * the host, and the random numbers are stored in host memory.
- *
- * The second piece is the device header, `rocrand_kernel.h'. Using this file
- * permits setting up random number generator states and generating sequences of
- * random numbers. This allows random numbers to be generated and immediately
- * consumed in other kernels without requiring the random numbers to be written
- * to, and read from, global memory.
- *
- * Here we utilize the host API since this is most aligned with how oneMKL
- * generates random numbers.
- *
- */
-class philox4x32x10_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    philox4x32x10_impl(sycl::queue queue, std::uint64_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue),
-              seed_(seed),
-              offset_(0) {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_create_generator, status, &engine_, ROCRAND_RNG_PSEUDO_PHILOX4_32_10);
-        ROCRAND_CALL(rocrand_set_seed, status, engine_, (unsigned long long)seed);
-    }
-
-    philox4x32x10_impl(sycl::queue queue, std::initializer_list<std::uint64_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine",
-                                         "multi-seed unsupported by rocRAND backend");
-    }
-
-    philox4x32x10_impl(const philox4x32x10_impl* other)
-            : oneapi::mkl::rng::detail::engine_impl(*other),
-              seed_(other->seed_),
-              offset_(other->offset_) {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_create_generator, status, &engine_, ROCRAND_RNG_PSEUDO_PHILOX4_32_10);
-        ROCRAND_CALL(rocrand_set_seed, status, engine_, (unsigned long long)seed_);
-
-        // Allign this->engine_'s offset state with other->engine_'s offset
-        skip_ahead(offset_);
-    }
-
-    // Buffers API
-
-    virtual inline void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        sycl::buffer<std::uint32_t, 1> ib(n);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = ib.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_int<std::int32_t>(queue_, distr.a(), distr.b(), n, ib, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp_accurate<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        range_transform_fp_accurate<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal, status, engine_, r_ptr, n, distr.mean(),
-                                 distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r_ptr, n,
-                                 distr.mean(), distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal, status, engine_, r_ptr, n, distr.mean(),
-                                 distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r_ptr, n,
-                                 distr.mean(), distr.stddev());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r_ptr, n, distr.m(),
-                                 distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r_ptr, n,
-                                 distr.m(), distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](float* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r_ptr, n, distr.m(),
-                                 distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](double* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r_ptr, n,
-                                 distr.m(), distr.s());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](std::int32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_poisson, status, engine_, (std::uint32_t*)r_ptr,
-                                 n, distr.lambda());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_poisson, status, engine_, r_ptr, n,
-                                 distr.lambda());
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-                onemkl_rocrand_host_task(cgh, acc, engine_, [=](std::uint32_t* r_ptr) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate, status, engine_, r_ptr, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        std::uint32_t* ib = (std::uint32_t*)malloc_device(
-            n * sizeof(std::uint32_t), queue_.get_device(), queue_.get_context());
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate, status, engine_, ib, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_int(queue_, distr.a(), distr.b(), n, ib, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp_accurate<float>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        queue_
-            .submit([&](sycl::handler& cgh) {
-                onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                    rocrand_status status;
-                    ROCRAND_CALL(rocrand_generate_uniform_double, status, engine_, r, n);
-                });
-            })
-            .wait_and_throw();
-
-        increment_internal_offset(n);
-
-        return range_transform_fp_accurate<double>(queue_, distr.a(), distr.b(), n, r);
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_normal_double, status, engine_, r, n, distr.mean(),
-                             distr.stddev());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_log_normal_double, status, engine_, r, n, distr.m(),
-                             distr.s());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented(
-            "rng", "philox4x32x10 engine",
-            "Bernoulli distribution method unsupported by rocRAND backend");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_poisson, status, engine_, (std::uint32_t*)r, n,
-                             distr.lambda());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate_poisson, status, engine_, r, n, distr.lambda());
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        sycl::event::wait_and_throw(dependencies);
-        auto event = queue_.submit([&](sycl::handler& cgh) {
-            onemkl_rocrand_host_task(cgh, engine_, [=](sycl::interop_handle ih) {
-                rocrand_status status;
-                ROCRAND_CALL(rocrand_generate, status, engine_, r, n);
-            });
-        });
-
-        increment_internal_offset(n);
-
-        return event;
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        return new philox4x32x10_impl(this);
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_set_offset, status, engine_, num_to_skip);
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "skip_ahead",
-                                         "initializer list unsupported by rocRAND backend");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "leapfrog", "unsupported by rocRAND backend");
-    }
-
-    virtual ~philox4x32x10_impl() override {
-        rocrand_destroy_generator(engine_);
-    }
-
-private:
-    rocrand_generator engine_;
-    std::uint64_t seed_;
-    std::uint64_t offset_;
-
-    void increment_internal_offset(std::uint64_t n) {
-        offset_ += n;
-    }
-};
-#else // rocRAND backend is currently not supported on Windows
-class philox4x32x10_impl : public oneapi::mkl::rng::detail::engine_impl {
-public:
-    philox4x32x10_impl(sycl::queue queue, std::uint64_t seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    philox4x32x10_impl(sycl::queue queue, std::initializer_list<std::uint64_t> seed)
-            : oneapi::mkl::rng::detail::engine_impl(queue) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    philox4x32x10_impl(const philox4x32x10_impl* other)
-            : oneapi::mkl::rng::detail::engine_impl(*other) {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    // Buffers API
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::uniform<
-                              std::int32_t, oneapi::mkl::rng::uniform_method::standard>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              float, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::gaussian<
-                              double, oneapi::mkl::rng::gaussian_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              float, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const oneapi::mkl::rng::lognormal<
-                              double, oneapi::mkl::rng::lognormal_method::box_muller2>& distr,
-                          std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<float, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, sycl::buffer<double, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::int32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr,
-                          std::int64_t n, sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void generate(const bits<std::uint32_t>& distr, std::int64_t n,
-                          sycl::buffer<std::uint32_t, 1>& r) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    // USM APIs
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>&
-            distr,
-        std::int64_t n, std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>&
-            distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, float* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>& distr,
-        std::int64_t n, double* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::int32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::int32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bernoulli<std::uint32_t, bernoulli_method::icdf>& distr,
-                                 std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::int32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::int32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(
-        const poisson<std::uint32_t, poisson_method::gaussian_icdf_based>& distr, std::int64_t n,
-        std::uint32_t* r, const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual sycl::event generate(const bits<std::uint32_t>& distr, std::int64_t n, std::uint32_t* r,
-                                 const std::vector<sycl::event>& dependencies) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return sycl::event{};
-    }
-
-    virtual oneapi::mkl::rng::detail::engine_impl* copy_state() override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-        return nullptr;
-    }
-
-    virtual void skip_ahead(std::uint64_t num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void skip_ahead(std::initializer_list<std::uint64_t> num_to_skip) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual void leapfrog(std::uint64_t idx, std::uint64_t stride) override {
-        throw oneapi::mkl::unimplemented("rng", "philox4x32x10 engine");
-    }
-
-    virtual ~philox4x32x10_impl() override {}
-};
-#endif
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(sycl::queue queue, std::uint64_t seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-oneapi::mkl::rng::detail::engine_impl* create_philox4x32x10(
-    sycl::queue queue, std::initializer_list<std::uint64_t> seed) {
-    return new philox4x32x10_impl(queue, seed);
-}
-
-} // namespace rocrand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/rng/backends/rocrand/rocrand_helper.hpp b/src/rng/backends/rocrand/rocrand_helper.hpp
deleted file mode 100644
index 205429ee8..000000000
--- a/src/rng/backends/rocrand/rocrand_helper.hpp
+++ /dev/null
@@ -1,335 +0,0 @@
-/*******************************************************************************
- * Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) 
- * and Computing Centre (URZ)
- * cuRAND back-end Copyright (c) 2021, The Regents of the University of
- * California, through Lawrence Berkeley National Laboratory (subject to receipt
- * of any required approvals from the U.S. Dept. of Energy). All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * (1) Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
- *
- * (2) Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * (3) Neither the name of the University of California, Lawrence Berkeley
- * National Laboratory, U.S. Dept. of Energy nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * You are under no obligation whatsoever to provide any bug fixes, patches,
- * or upgrades to the features, functionality or performance of the source
- * code ("Enhancements") to anyone; however, if you choose to make your
- * Enhancements available either publicly, or directly to Lawrence Berkeley
- * National Laboratory, without imposing a separate written license agreement
- * for such Enhancements, then you hereby grant the following license: a
- * non-exclusive, royalty-free perpetual license to install, use, modify,
- * prepare derivative works, incorporate into other computer software,
- * distribute, and sublicense such enhancements or derivative works thereof,
- * in binary and source code form.
- *
- * If you have questions about your rights to use or distribute this software,
- * please contact Berkeley Lab's Intellectual Property Office at
- * IPO@lbl.gov.
- *
- * NOTICE.  This Software was developed under funding from the U.S. Department
- * of Energy and the U.S. Government consequently retains certain rights.  As
- * such, the U.S. Government has been granted for itself and others acting on
- * its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
- * Software to reproduce, distribute copies to the public, prepare derivative
- * works, and perform publicly and display publicly, and to permit others to do
- * so.
- ******************************************************************************/
-
-/**
- * @file rocrand_helper.cpp : contains the implementation of all the routines
- * for HIP backend
- */
-#ifndef _MKL_RNG_ROCRAND_HELPER_HPP_
-#define _MKL_RNG_ROCRAND_HELPER_HPP_
-
-#include <rocrand.h>
-#include <complex>
-#include "oneapi/mkl/types.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace rocrand {
-
-// Static template functions oneapi::mkl::rng::rocrand::range_transform_fp for
-// Buffer and USM APIs
-//
-// rocRAND has no built-in functionality to specify a custom range for sampling
-// random numbers; `rocrand_generate_uniform' generates uniform random numbers on
-// [0, 1). This function is used to convert to range [a, b).
-//
-// Supported types:
-//      float
-//      double
-//
-// Input arguments:
-//      queue - the queue to submit the kernel to
-//      a     - range lower bound (inclusive)
-//      b     - range upper bound (exclusive)
-//      r     - buffer to store transformed random numbers
-template <typename T>
-static inline void range_transform_fp(sycl::queue& queue, T a, T b, std::int64_t n,
-                                      sycl::buffer<T, 1>& r) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-        cgh.parallel_for(sycl::range<1>(n),
-                         [=](sycl::id<1> id) { acc[id[0]] = acc[id[0]] * (b - a) + a; });
-    });
-}
-template <typename T>
-static inline sycl::event range_transform_fp(sycl::queue& queue, T a, T b, std::int64_t n, T* r) {
-    return queue.submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(sycl::range<1>(n),
-                         [=](sycl::id<1> id) { r[id[0]] = r[id[0]] * (b - a) + a; });
-    });
-}
-template <typename T>
-static inline void range_transform_fp_accurate(sycl::queue& queue, T a, T b, std::int64_t n,
-                                               sycl::buffer<T, 1>& r) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc = r.template get_access<sycl::access::mode::read_write>(cgh);
-        cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> id) {
-            acc[id[0]] = acc[id[0]] * (b - a) + a;
-            if (acc[id[0]] < a) {
-                acc[id[0]] = a;
-            }
-            else if (acc[id[0]] > b) {
-                acc[id[0]] = b;
-            }
-        });
-    });
-}
-template <typename T>
-static inline sycl::event range_transform_fp_accurate(sycl::queue& queue, T a, T b, std::int64_t n,
-                                                      T* r) {
-    return queue.submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> id) {
-            r[id[0]] = r[id[0]] * (b - a) + a;
-            if (r[id[0]] < a) {
-                r[id[0]] = a;
-            }
-            else if (r[id[0]] > b) {
-                r[id[0]] = b;
-            }
-        });
-    });
-}
-
-// Static template functions oneapi::mkl::rng::rocrand::range_transform_int for
-// Buffer and USM APIs
-//
-// rocRAND has no built-in functionality to specify a custom range for sampling
-// random numbers; `rocrand_generate_uniform' generates uniform random numbers on
-// [0, 1). This function is used to convert to range [a, b).
-//
-// Supported types:
-//      std::int32_t
-//      std::uint32_t
-//
-// Input arguments:
-//      queue - the queue to submit the kernel to
-//      a     - range lower bound (inclusive)
-//      b     - range upper bound (exclusive)
-//      r     - buffer to store transformed random numbers
-template <typename T>
-inline void range_transform_int(sycl::queue& queue, T a, T b, std::int64_t n,
-                                sycl::buffer<std::uint32_t, 1>& in, sycl::buffer<T, 1>& out) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc_in = in.template get_access<sycl::access::mode::read>(cgh);
-        auto acc_out = out.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(sycl::range<1>(n),
-                         [=](sycl::id<1> id) { acc_out[id[0]] = a + acc_in[id[0]] % (b - a); });
-    });
-}
-template <typename T>
-inline sycl::event range_transform_int(sycl::queue& queue, T a, T b, std::int64_t n,
-                                       std::uint32_t* in, T* out) {
-    return queue.submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(sycl::range<1>(n),
-                         [=](sycl::id<1> id) { out[id[0]] = a + in[id[0]] % (b - a); });
-    });
-}
-
-// Static template functions oneapi::mkl::rng::rocrand::sample_bernoulli for
-// Buffer and USM APIs
-//
-// rocRAND has no built-in functionality to sample from a Bernoulli distribution.
-// The implementation here uses uniformly-generated random numbers and returns
-// the corresponding Bernoulli distribution based on a probability.
-//
-// Supported types:
-//      std::int32_t
-//      std::uint32_t
-//
-// Input arguments:
-//      queue - the queue to submit the kernel to
-//      p     - success probablity of a trial
-//      in    - buffer containing uniformly-generated random numbers
-//      out   - buffer to store Bernoulli
-template <typename T>
-static inline void sample_bernoulli_from_uniform(sycl::queue& queue, float p, std::int64_t n,
-                                                 sycl::buffer<float, 1> in,
-                                                 sycl::buffer<T, 1>& out) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto acc_in = in.template get_access<sycl::access::mode::read>(cgh);
-        auto acc_out = out.template get_access<sycl::access::mode::write>(cgh);
-        cgh.parallel_for(sycl::range<1>(n),
-                         [=](sycl::id<1> id) { acc_out[id[0]] = acc_in[id[0]] < p; });
-    });
-}
-template <typename T>
-static inline sycl::event sample_bernoulli_from_uniform(sycl::queue& queue, float p, std::int64_t n,
-                                                        float* in, T* out) {
-    return queue.submit([&](sycl::handler& cgh) {
-        cgh.parallel_for(sycl::range<1>(n), [=](sycl::id<1> id) { out[id[0]] = in[id[0]] < p; });
-    });
-}
-
-class rocrand_error : virtual public std::runtime_error {
-protected:
-    inline const char* rocrand_error_map(rocrand_status error) {
-        switch (error) {
-            case ROCRAND_STATUS_SUCCESS: return "ROCRAND_STATUS_SUCCESS";
-
-            case ROCRAND_STATUS_NOT_CREATED: return "ROCRAND_STATUS_NOT_CREATED";
-
-            case ROCRAND_STATUS_ALLOCATION_FAILED: return "ROCRAND_STATUS_ALLOCATION_FAILED";
-
-            case ROCRAND_STATUS_TYPE_ERROR: return "ROCRAND_STATUS_TYPE_ERROR";
-
-            case ROCRAND_STATUS_OUT_OF_RANGE: return "ROCRAND_STATUS_OUT_OF_RANGE";
-
-            case ROCRAND_STATUS_LENGTH_NOT_MULTIPLE: return "ROCRAND_STATUS_LENGTH_NOT_MULTIPLE";
-
-            case ROCRAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-                return "ROCRAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-
-            case ROCRAND_STATUS_LAUNCH_FAILURE: return "ROCRAND_STATUS_LAUNCH_FAILURE";
-
-            case ROCRAND_STATUS_VERSION_MISMATCH: return "ROCRAND_STATUS_VERSION_MISMATCH";
-
-            case ROCRAND_STATUS_INTERNAL_ERROR: return "ROCRAND_STATUS_INTERNAL_ERROR";
-
-            default: return "<unknown>";
-        }
-    }
-
-    int error_number; ///< Error number
-public:
-    /** Constructor (C++ STL string, rocrand_status).
-   *  @param msg The error message
-   *  @param err_num error number
-   */
-    explicit rocrand_error(std::string message, rocrand_status result)
-            : std::runtime_error((message + std::string(rocrand_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~rocrand_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-class rocm_error : virtual public std::runtime_error {
-protected:
-    inline const char* rocm_error_map(hipError_t result) {
-        switch (result) {
-            case hipSuccess: return "hipSuccess";
-            case hipErrorInvalidContext: return "hipErrorInvalidContext";
-            case hipErrorInvalidKernelFile: return "hipErrorInvalidKernelFile";
-            case hipErrorMemoryAllocation: return "hipErrorMemoryAllocation";
-            case hipErrorInitializationError: return "hipErrorInitializationError";
-            case hipErrorLaunchFailure: return "hipErrorLaunchFailure";
-            case hipErrorLaunchOutOfResources: return "hipErrorLaunchOutOfResources";
-            case hipErrorInvalidDevice: return "hipErrorInvalidDevice";
-            case hipErrorInvalidValue: return "hipErrorInvalidValue";
-            case hipErrorInvalidDevicePointer: return "hipErrorInvalidDevicePointer";
-            case hipErrorInvalidMemcpyDirection: return "hipErrorInvalidMemcpyDirection";
-            case hipErrorUnknown: return "hipErrorUnknown";
-            case hipErrorInvalidResourceHandle: return "hipErrorInvalidResourceHandle";
-            case hipErrorNotReady: return "hipErrorNotReady";
-            case hipErrorNoDevice: return "hipErrorNoDevice";
-            case hipErrorPeerAccessAlreadyEnabled: return "hipErrorPeerAccessAlreadyEnabled";
-            case hipErrorPeerAccessNotEnabled: return "hipErrorPeerAccessNotEnabled";
-            case hipErrorRuntimeMemory: return "hipErrorRuntimeMemory";
-            case hipErrorRuntimeOther: return "hipErrorRuntimeOther";
-            case hipErrorHostMemoryAlreadyRegistered: return "hipErrorHostMemoryAlreadyRegistered";
-            case hipErrorHostMemoryNotRegistered: return "hipErrorHostMemoryNotRegistered";
-            case hipErrorMapBufferObjectFailed: return "hipErrorMapBufferObjectFailed";
-
-            default: return "<unknown>";
-        }
-    }
-    int error_number; ///< error number
-public:
-    /** Constructor (C++ STL string, hipError_t).
-   *  @param msg The error message
-   *  @param err_num Error number
-   */
-    explicit rocm_error(std::string message, hipError_t result)
-            : std::runtime_error((message + std::string(rocm_error_map(result)))) {
-        error_number = static_cast<int>(result);
-    }
-
-    /** Destructor.
-   *  Virtual to allow for subclassing.
-   */
-    virtual ~rocm_error() throw() {}
-
-    /** Returns error number.
-   *  @return #error_number
-   */
-    virtual int getErrorNumber() const throw() {
-        return error_number;
-    }
-};
-
-#define HIP_ERROR_FUNC(name, err, ...)                                  \
-    err = name(__VA_ARGS__);                                            \
-    if (err != HIP_SUCCESS) {                                           \
-        throw rocm_error(std::string(#name) + std::string(" : "), err); \
-    }
-
-#define ROCRAND_CALL(func, status, ...)                                       \
-    status = func(__VA_ARGS__);                                               \
-    if (status != ROCRAND_STATUS_SUCCESS) {                                   \
-        throw rocrand_error(std::string(#func) + std::string(" : "), status); \
-    }
-
-} // namespace rocrand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _MKL_RNG_ROCRAND_HELPER_HPP_
diff --git a/src/rng/backends/rocrand/rocrand_task.hpp b/src/rng/backends/rocrand/rocrand_task.hpp
deleted file mode 100644
index 2588dc901..000000000
--- a/src/rng/backends/rocrand/rocrand_task.hpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#ifndef _MKL_RNG_ROCRAND_TASK_HPP_
-#define _MKL_RNG_ROCRAND_TASK_HPP_
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "rocrand_helper.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace rocrand {
-#ifdef __HIPSYCL__
-template <typename H, typename A, typename E, typename F>
-static inline void host_task_internal(H &cgh, A acc, E e, F f) {
-    cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_set_stream, status, e, ih.get_native_queue<sycl::backend::hip>());
-        auto r_ptr =
-            reinterpret_cast<typename A::value_type *>(ih.get_native_mem<sycl::backend::hip>(acc));
-        f(r_ptr);
-    });
-}
-
-template <typename H, typename E, typename F>
-static inline void host_task_internal(H &cgh, E e, F f) {
-    cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) {
-        rocrand_status status;
-        ROCRAND_CALL(rocrand_set_stream, status, e, ih.get_native_queue<sycl::backend::hip>());
-        f(ih);
-    });
-}
-#else
-template <typename H, typename A, typename E, typename F>
-static inline void host_task_internal(H &cgh, A acc, E e, F f) {
-    cgh.host_task([=](sycl::interop_handle ih) {
-        rocrand_status status;
-        auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
-        ROCRAND_CALL(rocrand_set_stream, status, e, stream);
-        auto r_ptr = reinterpret_cast<typename A::value_type *>(
-            ih.get_native_mem<sycl::backend::ext_oneapi_hip>(acc));
-        f(r_ptr);
-
-        hipError_t err;
-        HIP_ERROR_FUNC(hipStreamSynchronize, err, stream);
-    });
-}
-
-template <typename H, typename E, typename F>
-static inline void host_task_internal(H &cgh, E e, F f) {
-    cgh.host_task([=](sycl::interop_handle ih) {
-        rocrand_status status;
-        auto stream = ih.get_native_queue<sycl::backend::ext_oneapi_hip>();
-        ROCRAND_CALL(rocrand_set_stream, status, e, stream);
-        f(ih);
-
-        hipError_t err;
-        HIP_ERROR_FUNC(hipStreamSynchronize, err, stream);
-    });
-}
-#endif
-template <typename H, typename A, typename E, typename F>
-static inline void onemkl_rocrand_host_task(H &cgh, A acc, E e, F f) {
-    host_task_internal(cgh, acc, e, f);
-}
-
-template <typename H, typename Engine, typename F>
-static inline void onemkl_rocrand_host_task(H &cgh, Engine e, F f) {
-    host_task_internal(cgh, e, f);
-}
-
-} // namespace rocrand
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
-
-#endif
diff --git a/src/rng/function_table.hpp b/src/rng/function_table.hpp
deleted file mode 100644
index c94757250..000000000
--- a/src/rng/function_table.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_FUNCTION_TABLE_HPP_
-#define _RNG_FUNCTION_TABLE_HPP_
-
-#include <cstdint>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/rng/detail/engine_impl.hpp"
-
-typedef struct {
-    int version;
-
-    oneapi::mkl::rng::detail::engine_impl* (*create_philox4x32x10_sycl)(sycl::queue queue,
-                                                                        std::uint64_t seed);
-    oneapi::mkl::rng::detail::engine_impl* (*create_philox4x32x10_ex_sycl)(
-        sycl::queue queue, std::initializer_list<std::uint64_t> seed);
-
-    oneapi::mkl::rng::detail::engine_impl* (*create_mrg32k3a_sycl)(sycl::queue queue,
-                                                                   std::uint32_t seed);
-    oneapi::mkl::rng::detail::engine_impl* (*create_mrg32k3a_ex_sycl)(
-        sycl::queue queue, std::initializer_list<std::uint32_t> seed);
-} rng_function_table_t;
-
-#endif //_RNG_FUNCTION_TABLE_HPP_
diff --git a/src/rng/rng_loader.cpp b/src/rng/rng_loader.cpp
deleted file mode 100644
index 1734287ee..000000000
--- a/src/rng/rng_loader.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/rng/detail/rng_loader.hpp"
-
-#include "function_table_initializer.hpp"
-#include "rng/function_table.hpp"
-
-namespace oneapi {
-namespace mkl {
-namespace rng {
-namespace detail {
-
-static oneapi::mkl::detail::table_initializer<domain::rng, rng_function_table_t> function_tables;
-
-engine_impl* create_philox4x32x10(oneapi::mkl::device libkey, sycl::queue queue,
-                                  std::uint64_t seed) {
-    return function_tables[libkey].create_philox4x32x10_sycl(queue, seed);
-}
-
-engine_impl* create_philox4x32x10(oneapi::mkl::device libkey, sycl::queue queue,
-                                  std::initializer_list<std::uint64_t> seed) {
-    return function_tables[libkey].create_philox4x32x10_ex_sycl(queue, seed);
-}
-
-engine_impl* create_mrg32k3a(oneapi::mkl::device libkey, sycl::queue queue, std::uint32_t seed) {
-    return function_tables[libkey].create_mrg32k3a_sycl(queue, seed);
-}
-
-engine_impl* create_mrg32k3a(oneapi::mkl::device libkey, sycl::queue queue,
-                             std::initializer_list<std::uint32_t> seed) {
-    return function_tables[libkey].create_mrg32k3a_ex_sycl(queue, seed);
-}
-
-} // namespace detail
-} // namespace rng
-} // namespace mkl
-} // namespace oneapi
diff --git a/src/sparse_blas/CMakeLists.txt b/src/sparse_blas/CMakeLists.txt
deleted file mode 100644
index b93902f49..000000000
--- a/src/sparse_blas/CMakeLists.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(backends)
-
-if(BUILD_SHARED_LIBS)
-  add_library(onemkl_sparse_blas OBJECT)
-  target_sources(onemkl_sparse_blas PRIVATE sparse_blas_loader.cpp)
-  target_include_directories(onemkl_sparse_blas
-    PRIVATE ${PROJECT_SOURCE_DIR}/include
-            ${PROJECT_SOURCE_DIR}/src
-            ${PROJECT_SOURCE_DIR}/src/include
-            ${CMAKE_BINARY_DIR}/bin
-            ${ONEMKL_GENERATED_INCLUDE_PATH}
-            $<TARGET_FILE_DIR:onemkl>
-  )
-
-  target_compile_options(onemkl_sparse_blas PRIVATE ${ONEMKL_BUILD_COPT})
-
-  set_target_properties(onemkl_sparse_blas PROPERTIES
-    POSITION_INDEPENDENT_CODE ON
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET onemkl_sparse_blas SOURCES sparse_blas_loader.cpp)
-  else()
-    target_link_libraries(onemkl_sparse_blas PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-
-  include(WarningsUtils)
-  target_link_libraries(onemkl_sparse_blas PRIVATE onemkl_warnings)
-
-endif()
diff --git a/src/sparse_blas/backends/CMakeLists.txt b/src/sparse_blas/backends/CMakeLists.txt
deleted file mode 100644
index ef606c6e1..000000000
--- a/src/sparse_blas/backends/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_custom_target(onemkl_backend_libs_sparse_blas)
-add_dependencies(onemkl_backend_libs onemkl_backend_libs_sparse_blas)
-
-if(ENABLE_MKLCPU_BACKEND)
-  add_subdirectory(mklcpu)
-endif()
-
-if(ENABLE_MKLGPU_BACKEND)
-  add_subdirectory(mklgpu)
-endif()
diff --git a/src/sparse_blas/backends/backend_wrappers.cxx b/src/sparse_blas/backends/backend_wrappers.cxx
deleted file mode 100644
index 2c8161249..000000000
--- a/src/sparse_blas/backends/backend_wrappers.cxx
+++ /dev/null
@@ -1,85 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-This file lists functions matching those required by sparse_blas_function_table_t in
-src/sparse_blas/function_table.hpp.
-
-To use this:
-
-#define WRAPPER_VERSION <Wrapper version number>
-#define BACKEND         <Backend name eg. mklgpu>
-
-extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = {
-    WRAPPER_VERSION,
-#include "sparse_blas/backends/backend_wrappers.cxx"
-};
-
-Changes to this file should be matched to changes in sparse_blas/function_table.hpp. The required
-function template instantiations must be added to backend_sparse_blas_instantiations.cxx.
-*/
-
-// clang-format off
-oneapi::mkl::sparse::BACKEND::init_matrix_handle,
-oneapi::mkl::sparse::BACKEND::release_matrix_handle,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::set_csr_data,
-oneapi::mkl::sparse::BACKEND::optimize_gemm,
-oneapi::mkl::sparse::BACKEND::optimize_gemm,
-oneapi::mkl::sparse::BACKEND::optimize_gemv,
-oneapi::mkl::sparse::BACKEND::optimize_trsv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::gemv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::trsv,
-oneapi::mkl::sparse::BACKEND::gemm,
-oneapi::mkl::sparse::BACKEND::gemm,
-oneapi::mkl::sparse::BACKEND::gemm,
-oneapi::mkl::sparse::BACKEND::gemm,
-oneapi::mkl::sparse::BACKEND::gemm,
-oneapi::mkl::sparse::BACKEND::gemm,
-oneapi::mkl::sparse::BACKEND::gemm,
-oneapi::mkl::sparse::BACKEND::gemm,
-    // clang-format on
diff --git a/src/sparse_blas/backends/mkl_common/mkl_basic.cxx b/src/sparse_blas/backends/mkl_common/mkl_basic.cxx
deleted file mode 100644
index fd3b1563a..000000000
--- a/src/sparse_blas/backends/mkl_common/mkl_basic.cxx
+++ /dev/null
@@ -1,62 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-void init_matrix_handle(sycl::queue & /*queue*/, detail::matrix_handle **p_handle) {
-    oneapi::mkl::sparse::init_matrix_handle(detail::get_handle(p_handle));
-}
-
-sycl::event release_matrix_handle(sycl::queue &queue, detail::matrix_handle **p_handle,
-                                  const std::vector<sycl::event> &dependencies) {
-    return oneapi::mkl::sparse::release_matrix_handle(queue, detail::get_handle(p_handle),
-                                                      dependencies);
-}
-
-template <typename fpType, typename intType>
-std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
-    sycl::queue &queue, detail::matrix_handle *handle, intType num_rows, intType num_cols,
-    intType /*nnz*/, index_base index, sycl::buffer<intType, 1> &row_ptr,
-    sycl::buffer<intType, 1> &col_ind, sycl::buffer<fpType, 1> &val) {
-    oneapi::mkl::sparse::set_csr_data(queue, detail::get_handle(handle), num_rows, num_cols, index,
-                                      row_ptr, col_ind, val);
-}
-
-template <typename fpType, typename intType>
-std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event> set_csr_data(
-    sycl::queue &queue, detail::matrix_handle *handle, intType num_rows, intType num_cols,
-    intType /*nnz*/, index_base index, intType *row_ptr, intType *col_ind, fpType *val,
-    const std::vector<sycl::event> &dependencies) {
-    return oneapi::mkl::sparse::set_csr_data(queue, detail::get_handle(handle), num_rows, num_cols,
-                                             index, row_ptr, col_ind, val, dependencies);
-}
-
-#define INSTANTIATE_SET_CSR_DATA(FP_TYPE, INT_TYPE)                                                \
-    template std::enable_if_t<detail::are_fp_int_supported_v<FP_TYPE, INT_TYPE>>                   \
-    set_csr_data<FP_TYPE, INT_TYPE>(                                                               \
-        sycl::queue & queue, detail::matrix_handle * handle, INT_TYPE num_rows, INT_TYPE num_cols, \
-        INT_TYPE nnz, index_base index, sycl::buffer<INT_TYPE, 1> & row_ptr,                       \
-        sycl::buffer<INT_TYPE, 1> & col_ind, sycl::buffer<FP_TYPE, 1> & val);                      \
-    template std::enable_if_t<detail::are_fp_int_supported_v<FP_TYPE, INT_TYPE>, sycl::event>      \
-    set_csr_data<FP_TYPE, INT_TYPE>(sycl::queue & queue, detail::matrix_handle * handle,           \
-                                    INT_TYPE num_rows, INT_TYPE num_cols, INT_TYPE nnz,            \
-                                    index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind,      \
-                                    FP_TYPE * val, const std::vector<sycl::event> &dependencies)
-
-FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_SET_CSR_DATA);
-
-#undef INSTANTIATE_SET_CSR_DATA
diff --git a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp b/src/sparse_blas/backends/mkl_common/mkl_helper.hpp
deleted file mode 100644
index da5235ee0..000000000
--- a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// MKLCPU and MKLGPU backends include
-// This include defines its own oneapi::mkl::sparse namespace with some of the types that are used here: matrix_handle_t, index_base, transpose, uolo, diag.
-#include <oneapi/mkl/spblas.hpp>
-
-// Includes are set up so that oneapi::mkl::sparse namespace refers to the MKLCPU and MKLGPU backends namespace (oneMKL product)
-// in this file.
-// oneapi::mkl::sparse::detail namespace refers to the oneMKL interface namespace.
-
-#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
-
-namespace oneapi::mkl::sparse::detail {
-
-inline auto get_handle(detail::matrix_handle **handle) {
-    return reinterpret_cast<oneapi::mkl::sparse::matrix_handle_t *>(handle);
-}
-
-inline auto get_handle(detail::matrix_handle *handle) {
-    return reinterpret_cast<oneapi::mkl::sparse::matrix_handle_t>(handle);
-}
-
-} // namespace oneapi::mkl::sparse::detail
-
-#define FOR_EACH_FP_TYPE(INSTANTIATE_MACRO) \
-    INSTANTIATE_MACRO(float);               \
-    INSTANTIATE_MACRO(double);              \
-    INSTANTIATE_MACRO(std::complex<float>); \
-    INSTANTIATE_MACRO(std::complex<double>)
-
-#define FOR_EACH_FP_AND_INT_TYPE_HELPER(INSTANTIATE_MACRO, INT_TYPE) \
-    INSTANTIATE_MACRO(float, INT_TYPE);                              \
-    INSTANTIATE_MACRO(double, INT_TYPE);                             \
-    INSTANTIATE_MACRO(std::complex<float>, INT_TYPE);                \
-    INSTANTIATE_MACRO(std::complex<double>, INT_TYPE)
-
-#define FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_MACRO)                   \
-    FOR_EACH_FP_AND_INT_TYPE_HELPER(INSTANTIATE_MACRO, std::int32_t); \
-    FOR_EACH_FP_AND_INT_TYPE_HELPER(INSTANTIATE_MACRO, std::int64_t)
diff --git a/src/sparse_blas/backends/mkl_common/mkl_operations.cxx b/src/sparse_blas/backends/mkl_common/mkl_operations.cxx
deleted file mode 100644
index ba6960341..000000000
--- a/src/sparse_blas/backends/mkl_common/mkl_operations.cxx
+++ /dev/null
@@ -1,170 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-sycl::event optimize_gemm(sycl::queue& queue, transpose /*transpose_A*/,
-                          detail::matrix_handle* /*handle*/,
-                          const std::vector<sycl::event>& dependencies) {
-    // TODO: Call to optimize_gemm with 2024.1 oneMKL release
-    // Return an event depending on the dependencies
-    return queue.submit([=](sycl::handler& cgh) {
-        cgh.depends_on(dependencies);
-        cgh.host_task([=]() { /* Empty kernel */ });
-    });
-}
-
-sycl::event optimize_gemm(sycl::queue& queue, transpose /*transpose_A*/, transpose /*transpose_B*/,
-                          layout /*dense_matrix_layout*/, const std::int64_t /*columns*/,
-                          detail::matrix_handle* /*handle*/,
-                          const std::vector<sycl::event>& dependencies) {
-    // TODO: Call to optimize_gemm with 2024.1 oneMKL release
-    // Return an event depending on the dependencies
-    return queue.submit([=](sycl::handler& cgh) {
-        cgh.depends_on(dependencies);
-        cgh.host_task([=]() { /* Empty kernel */ });
-    });
-}
-
-sycl::event optimize_gemv(sycl::queue& queue, transpose transpose_val,
-                          detail::matrix_handle* handle,
-                          const std::vector<sycl::event>& dependencies) {
-    return oneapi::mkl::sparse::optimize_gemv(queue, transpose_val, detail::get_handle(handle),
-                                              dependencies);
-}
-
-sycl::event optimize_trsv(sycl::queue& queue, uplo uplo_val, transpose transpose_val, diag diag_val,
-                          detail::matrix_handle* handle,
-                          const std::vector<sycl::event>& dependencies) {
-    // TODO: Remove this if condition once Intel oneMKL adds support for trans/conjtrans to optimize_trsv
-    if (transpose_val != transpose::nontrans) {
-        throw mkl::unimplemented("sparse_blas/backends/mkl", __FUNCTION__,
-                                 "Transposed or conjugate trsv is not supported");
-    }
-    return oneapi::mkl::sparse::optimize_trsv(queue, uplo_val, transpose_val, diag_val,
-                                              detail::get_handle(handle), dependencies);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
-    sycl::queue& queue, transpose transpose_val, const fpType alpha,
-    detail::matrix_handle* A_handle, sycl::buffer<fpType, 1>& x, const fpType beta,
-    sycl::buffer<fpType, 1>& y) {
-    oneapi::mkl::sparse::gemv(queue, transpose_val, alpha, detail::get_handle(A_handle), x, beta, y);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
-    sycl::queue& queue, transpose transpose_val, const fpType alpha,
-    detail::matrix_handle* A_handle, const fpType* x, const fpType beta, fpType* y,
-    const std::vector<sycl::event>& dependencies) {
-    return oneapi::mkl::sparse::gemv(queue, transpose_val, alpha, detail::get_handle(A_handle), x, beta, y,
-                                     dependencies);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(sycl::queue& queue, uplo uplo_val,
-                                                         transpose transpose_val, diag diag_val,
-                                                         detail::matrix_handle* A_handle,
-                                                         sycl::buffer<fpType, 1>& x,
-                                                         sycl::buffer<fpType, 1>& y) {
-    // TODO: Remove this if condition once Intel oneMKL adds support for trans/conjtrans to trsv
-    if (transpose_val != transpose::nontrans) {
-        throw mkl::unimplemented("sparse_blas/backends/mkl", __FUNCTION__,
-                                 "Transposed or conjugate trsv is not supported");
-    }
-    oneapi::mkl::sparse::trsv(queue, uplo_val, transpose_val, diag_val,
-                              detail::get_handle(A_handle), x, y);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
-    sycl::queue& queue, uplo uplo_val, transpose transpose_val, diag diag_val,
-    detail::matrix_handle* A_handle, const fpType* x, fpType* y,
-    const std::vector<sycl::event>& dependencies) {
-    // TODO: Remove this if condition once Intel oneMKL adds support for trans/conjtrans to trsv
-    if (transpose_val != transpose::nontrans) {
-        throw mkl::unimplemented("sparse_blas/backends/mkl", __FUNCTION__,
-                                 "Transposed or conjugate trsv is not supported");
-    }
-    // TODO: Remove const_cast in future oneMKL release
-    return oneapi::mkl::sparse::trsv(queue, uplo_val, transpose_val, diag_val,
-                                     detail::get_handle(A_handle), const_cast<fpType*>(x), y,
-                                     dependencies);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
-    sycl::queue& queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
-    const fpType alpha, detail::matrix_handle* A_handle, sycl::buffer<fpType, 1>& B,
-    const std::int64_t columns, const std::int64_t ldb, const fpType beta,
-    sycl::buffer<fpType, 1>& C, const std::int64_t ldc) {
-    oneapi::mkl::sparse::gemm(queue, dense_matrix_layout, transpose_A, transpose_B, alpha,
-                              detail::get_handle(A_handle), B, columns, ldb, beta, C, ldc);
-}
-
-template <typename fpType>
-std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
-    sycl::queue& queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
-    const fpType alpha, detail::matrix_handle* A_handle, const fpType* B,
-    const std::int64_t columns, const std::int64_t ldb, const fpType beta, fpType* C,
-    const std::int64_t ldc, const std::vector<sycl::event>& dependencies) {
-    // TODO: Remove const_cast in future oneMKL release
-    return oneapi::mkl::sparse::gemm(queue, dense_matrix_layout, transpose_A, transpose_B, alpha,
-                                     detail::get_handle(A_handle), const_cast<fpType*>(B), columns,
-                                     ldb, beta, C, ldc, dependencies);
-}
-
-#define INSTANTIATE_GEMV(FP_TYPE)                                                          \
-    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>> gemv(                    \
-        sycl::queue& queue, transpose transpose_val, const FP_TYPE alpha,                  \
-        detail::matrix_handle* A_handle, sycl::buffer<FP_TYPE, 1>& x, const FP_TYPE beta,  \
-        sycl::buffer<FP_TYPE, 1>& y);                                                      \
-    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>, sycl::event> gemv(       \
-        sycl::queue& queue, transpose transpose_val, const FP_TYPE alpha,                  \
-        detail::matrix_handle* A_handle, const FP_TYPE* x, const FP_TYPE beta, FP_TYPE* y, \
-        const std::vector<sycl::event>& dependencies)
-
-#define INSTANTIATE_TRSV(FP_TYPE)                                                    \
-    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>> trsv(              \
-        sycl::queue& queue, uplo uplo_val, transpose transpose_val, diag diag_val,   \
-        detail::matrix_handle* A_handle, sycl::buffer<FP_TYPE, 1>& x,                \
-        sycl::buffer<FP_TYPE, 1>& y);                                                \
-    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>, sycl::event> trsv( \
-        sycl::queue& queue, uplo uplo_val, transpose transpose_val, diag diag_val,   \
-        detail::matrix_handle* A_handle, const FP_TYPE* x, FP_TYPE* y,               \
-        const std::vector<sycl::event>& dependencies)
-
-#define INSTANTIATE_GEMM(FP_TYPE)                                                                 \
-    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>> gemm(                           \
-        sycl::queue& queue, layout dense_matrix_layout, transpose transpose_A,                    \
-        transpose transpose_B, const FP_TYPE alpha, detail::matrix_handle* A_handle,              \
-        sycl::buffer<FP_TYPE, 1>& B, const std::int64_t columns, const std::int64_t ldb,          \
-        const FP_TYPE beta, sycl::buffer<FP_TYPE, 1>& C, const std::int64_t ldc);                 \
-    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>, sycl::event> gemm(              \
-        sycl::queue& queue, layout dense_matrix_layout, transpose transpose_A,                    \
-        transpose transpose_B, const FP_TYPE alpha, detail::matrix_handle* A_handle,              \
-        const FP_TYPE* B, const std::int64_t columns, const std::int64_t ldb, const FP_TYPE beta, \
-        FP_TYPE* C, const std::int64_t ldc, const std::vector<sycl::event>& dependencies)
-
-FOR_EACH_FP_TYPE(INSTANTIATE_GEMV);
-FOR_EACH_FP_TYPE(INSTANTIATE_TRSV);
-FOR_EACH_FP_TYPE(INSTANTIATE_GEMM);
-
-#undef INSTANTIATE_GEMV
-#undef INSTANTIATE_TRSV
-#undef INSTANTIATE_GEMM
diff --git a/src/sparse_blas/backends/mklcpu/CMakeLists.txt b/src/sparse_blas/backends/mklcpu/CMakeLists.txt
deleted file mode 100644
index cfcf9cf3d..000000000
--- a/src/sparse_blas/backends/mklcpu/CMakeLists.txt
+++ /dev/null
@@ -1,82 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_sparse_blas_mklcpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-include(WarningsUtils)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  mklcpu_basic.cpp
-  mklcpu_operations.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mklcpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_sparse_blas ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::SPARSE)
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_SYCL::SPARSE
-    PRIVATE onemkl_warnings
-  )
-else()
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_DPCPP
-    PRIVATE onemkl_warnings
-  )
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp
deleted file mode 100644
index 9ab29ee92..000000000
--- a/src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "../mkl_common/mkl_helper.hpp"
-
-#include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
-
-namespace oneapi::mkl::sparse::mklcpu {
-
-#include "../mkl_common/mkl_basic.cxx"
-
-} // namespace oneapi::mkl::sparse::mklcpu
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
deleted file mode 100644
index e636b1816..000000000
--- a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "../mkl_common/mkl_helper.hpp"
-
-#include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
-
-namespace oneapi::mkl::sparse::mklcpu {
-
-#include "../mkl_common/mkl_operations.cxx"
-
-} // namespace oneapi::mkl::sparse::mklcpu
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp
deleted file mode 100644
index 40f75c60c..000000000
--- a/src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/sparse_blas/types.hpp"
-
-#include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
-
-#include "sparse_blas/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-#define BACKEND         mklcpu
-
-extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = {
-    WRAPPER_VERSION,
-#include "sparse_blas/backends/backend_wrappers.cxx"
-};
diff --git a/src/sparse_blas/backends/mklgpu/CMakeLists.txt b/src/sparse_blas/backends/mklgpu/CMakeLists.txt
deleted file mode 100644
index a31794547..000000000
--- a/src/sparse_blas/backends/mklgpu/CMakeLists.txt
+++ /dev/null
@@ -1,82 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(LIB_NAME onemkl_sparse_blas_mklgpu)
-set(LIB_OBJ ${LIB_NAME}_obj)
-
-include(WarningsUtils)
-
-add_library(${LIB_NAME})
-add_library(${LIB_OBJ} OBJECT
-  mklgpu_basic.cpp
-  mklgpu_operations.cpp
-  $<$<BOOL:${BUILD_SHARED_LIBS}>: mklgpu_wrappers.cpp>
-)
-add_dependencies(onemkl_backend_libs_sparse_blas ${LIB_NAME})
-
-target_include_directories(${LIB_OBJ}
-  PRIVATE ${PROJECT_SOURCE_DIR}/include
-          ${PROJECT_SOURCE_DIR}/src
-          ${CMAKE_BINARY_DIR}/bin
-          ${ONEMKL_GENERATED_INCLUDE_PATH}
-)
-
-target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
-
-if(TARGET MKL::MKL_SYCL::SPARSE)
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_SYCL::SPARSE
-    PRIVATE onemkl_warnings
-  )
-else()
-  target_link_libraries(${LIB_OBJ}
-    PUBLIC ONEMKL::SYCL::SYCL
-    PUBLIC MKL::MKL_DPCPP
-    PRIVATE onemkl_warnings
-  )
-endif()
-
-set_target_properties(${LIB_OBJ} PROPERTIES
-  POSITION_INDEPENDENT_CODE ON
-)
-target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
-
-#Set oneMKL libraries as not transitive for dynamic
-if(BUILD_SHARED_LIBS)
-  set_target_properties(${LIB_NAME} PROPERTIES
-    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
-  )
-endif()
-
-# Add major version to the library
-set_target_properties(${LIB_NAME} PROPERTIES
-  SOVERSION ${PROJECT_VERSION_MAJOR}
-)
-
-# Add dependencies rpath to the library
-list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
-
-# Add the library to install package
-install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
-install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
-  RUNTIME DESTINATION bin
-  ARCHIVE DESTINATION lib
-  LIBRARY DESTINATION lib
-)
diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_basic.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_basic.cpp
deleted file mode 100644
index 8df24f8da..000000000
--- a/src/sparse_blas/backends/mklgpu/mklgpu_basic.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "../mkl_common/mkl_helper.hpp"
-
-#include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp"
-
-namespace oneapi::mkl::sparse::mklgpu {
-
-#include "../mkl_common/mkl_basic.cxx"
-
-} // namespace oneapi::mkl::sparse::mklgpu
diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
deleted file mode 100644
index 439dc4eea..000000000
--- a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "../mkl_common/mkl_helper.hpp"
-
-#include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp"
-
-namespace oneapi::mkl::sparse::mklgpu {
-
-#include "../mkl_common/mkl_operations.cxx"
-
-} // namespace oneapi::mkl::sparse::mklgpu
diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_wrappers.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_wrappers.cpp
deleted file mode 100644
index 346b13540..000000000
--- a/src/sparse_blas/backends/mklgpu/mklgpu_wrappers.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/sparse_blas/types.hpp"
-
-#include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp"
-
-#include "sparse_blas/function_table.hpp"
-
-#define WRAPPER_VERSION 1
-#define BACKEND         mklgpu
-
-extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = {
-    WRAPPER_VERSION,
-#include "sparse_blas/backends/backend_wrappers.cxx"
-};
diff --git a/src/sparse_blas/function_table.hpp b/src/sparse_blas/function_table.hpp
deleted file mode 100644
index 57279fb3f..000000000
--- a/src/sparse_blas/function_table.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* (*Licensed under the Apache License, Version 2.0 )(the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_FUNCTION_TABLE_HPP_
-#define _ONEMKL_SPARSE_BLAS_FUNCTION_TABLE_HPP_
-
-#include "oneapi/mkl/sparse_blas/types.hpp"
-#include "sparse_blas/macros.hpp"
-
-#define DEFINE_SET_CSR_DATA(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX)                        \
-    void (*set_csr_data_buffer##FP_SUFFIX##INT_SUFFIX)(                                      \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t handle, INT_TYPE num_rows, \
-        INT_TYPE num_cols, INT_TYPE nnz, oneapi::mkl::index_base index,                      \
-        sycl::buffer<INT_TYPE, 1> & row_ptr, sycl::buffer<INT_TYPE, 1> & col_ind,            \
-        sycl::buffer<FP_TYPE, 1> & val);                                                     \
-    sycl::event (*set_csr_data_usm##FP_SUFFIX##INT_SUFFIX)(                                  \
-        sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t handle, INT_TYPE num_rows, \
-        INT_TYPE num_cols, INT_TYPE nnz, oneapi::mkl::index_base index, INT_TYPE * row_ptr,  \
-        INT_TYPE * col_ind, FP_TYPE * val, const std::vector<sycl::event> &dependencies)
-
-#define DEFINE_GEMV(FP_TYPE, FP_SUFFIX)                                                      \
-    void (*gemv_buffer##FP_SUFFIX)(                                                          \
-        sycl::queue & queue, oneapi::mkl::transpose transpose_val, const FP_TYPE alpha,      \
-        oneapi::mkl::sparse::matrix_handle_t A_handle, sycl::buffer<FP_TYPE, 1> &x,          \
-        const FP_TYPE beta, sycl::buffer<FP_TYPE, 1> &y);                                    \
-    sycl::event (*gemv_usm##FP_SUFFIX)(                                                      \
-        sycl::queue & queue, oneapi::mkl::transpose transpose_val, const FP_TYPE alpha,      \
-        oneapi::mkl::sparse::matrix_handle_t A_handle, const FP_TYPE *x, const FP_TYPE beta, \
-        FP_TYPE *y, const std::vector<sycl::event> &dependencies)
-
-#define DEFINE_TRSV(FP_TYPE, FP_SUFFIX)                                                        \
-    void (*trsv_buffer##FP_SUFFIX)(                                                            \
-        sycl::queue & queue, oneapi::mkl::uplo uplo_val, oneapi::mkl::transpose transpose_val, \
-        oneapi::mkl::diag diag_val, oneapi::mkl::sparse::matrix_handle_t A_handle,             \
-        sycl::buffer<FP_TYPE, 1> & x, sycl::buffer<FP_TYPE, 1> & y);                           \
-    sycl::event (*trsv_usm##FP_SUFFIX)(                                                        \
-        sycl::queue & queue, oneapi::mkl::uplo uplo_val, oneapi::mkl::transpose transpose_val, \
-        oneapi::mkl::diag diag_val, oneapi::mkl::sparse::matrix_handle_t A_handle,             \
-        const FP_TYPE *x, FP_TYPE *y, const std::vector<sycl::event> &dependencies)
-
-#define DEFINE_GEMM(FP_TYPE, FP_SUFFIX)                                                       \
-    void (*gemm_buffer##FP_SUFFIX)(                                                           \
-        sycl::queue & queue, oneapi::mkl::layout dense_matrix_layout,                         \
-        oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B,               \
-        const FP_TYPE alpha, oneapi::mkl::sparse::matrix_handle_t A_handle,                   \
-        sycl::buffer<FP_TYPE, 1> &B, const std::int64_t columns, const std::int64_t ldb,      \
-        const FP_TYPE beta, sycl::buffer<FP_TYPE, 1> &C, const std::int64_t ldc);             \
-    sycl::event (*gemm_usm##FP_SUFFIX)(                                                       \
-        sycl::queue & queue, oneapi::mkl::layout dense_matrix_layout,                         \
-        oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B,               \
-        const FP_TYPE alpha, oneapi::mkl::sparse::matrix_handle_t A_handle, const FP_TYPE *B, \
-        const std::int64_t columns, const std::int64_t ldb, const FP_TYPE beta, FP_TYPE *C,   \
-        const std::int64_t ldc, const std::vector<sycl::event> &dependencies)
-
-typedef struct {
-    int version;
-    void (*init_matrix_handle)(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_handle);
-
-    sycl::event (*release_matrix_handle)(sycl::queue &queue,
-                                         oneapi::mkl::sparse::matrix_handle_t *p_handle,
-                                         const std::vector<sycl::event> &dependencies);
-
-    FOR_EACH_FP_AND_INT_TYPE(DEFINE_SET_CSR_DATA);
-
-    // optimize_*
-    sycl::event (*optimize_gemm_v1)(sycl::queue &queue, oneapi::mkl::transpose transpose_A,
-                                    oneapi::mkl::sparse::matrix_handle_t handle,
-                                    const std::vector<sycl::event> &dependencies);
-    sycl::event (*optimize_gemm_v2)(sycl::queue &queue, oneapi::mkl::transpose transpose_A,
-                                    oneapi::mkl::transpose transpose_B,
-                                    oneapi::mkl::layout dense_matrix_layout,
-                                    const std::int64_t columns,
-                                    oneapi::mkl::sparse::matrix_handle_t handle,
-                                    const std::vector<sycl::event> &dependencies);
-    sycl::event (*optimize_gemv)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
-                                 oneapi::mkl::sparse::matrix_handle_t handle,
-                                 const std::vector<sycl::event> &dependencies);
-    sycl::event (*optimize_trsv)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
-                                 oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
-                                 oneapi::mkl::sparse::matrix_handle_t handle,
-                                 const std::vector<sycl::event> &dependencies);
-
-    FOR_EACH_FP_TYPE(DEFINE_GEMV);
-    FOR_EACH_FP_TYPE(DEFINE_TRSV);
-    FOR_EACH_FP_TYPE(DEFINE_GEMM);
-} sparse_blas_function_table_t;
-
-#undef DEFINE_SET_CSR_DATA
-#undef DEFINE_GEMV
-#undef DEFINE_TRSV
-#undef DEFINE_GEMM
-
-#endif // _ONEMKL_SPARSE_BLAS_FUNCTION_TABLE_HPP_
diff --git a/src/sparse_blas/macros.hpp b/src/sparse_blas/macros.hpp
deleted file mode 100644
index a4ef88e35..000000000
--- a/src/sparse_blas/macros.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* (*Licensed under the Apache License, Version 2.0 )(the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _ONEMKL_SPARSE_BLAS_MACROS_HPP_
-#define _ONEMKL_SPARSE_BLAS_MACROS_HPP_
-
-#define FOR_EACH_FP_TYPE(DEFINE_MACRO)      \
-    DEFINE_MACRO(float, _rf);               \
-    DEFINE_MACRO(double, _rd);              \
-    DEFINE_MACRO(std::complex<float>, _cf); \
-    DEFINE_MACRO(std::complex<double>, _cd)
-
-#define FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, INT_TYPE, INT_SUFFIX) \
-    DEFINE_MACRO(float, _rf, INT_TYPE, INT_SUFFIX);                         \
-    DEFINE_MACRO(double, _rd, INT_TYPE, INT_SUFFIX);                        \
-    DEFINE_MACRO(std::complex<float>, _cf, INT_TYPE, INT_SUFFIX);           \
-    DEFINE_MACRO(std::complex<double>, _cd, INT_TYPE, INT_SUFFIX)
-
-#define FOR_EACH_FP_AND_INT_TYPE(DEFINE_MACRO)                         \
-    FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int32_t, _i32); \
-    FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int64_t, _i64)
-
-#endif // _ONEMKL_SPARSE_BLAS_MACROS_HPP_
diff --git a/src/sparse_blas/sparse_blas_loader.cpp b/src/sparse_blas/sparse_blas_loader.cpp
deleted file mode 100644
index 95da6df9c..000000000
--- a/src/sparse_blas/sparse_blas_loader.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Codeplay Software Ltd.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp"
-
-#include "function_table_initializer.hpp"
-#include "sparse_blas/function_table.hpp"
-#include "sparse_blas/macros.hpp"
-#include "oneapi/mkl/detail/get_device_id.hpp"
-
-namespace oneapi::mkl::sparse {
-
-static oneapi::mkl::detail::table_initializer<mkl::domain::sparse_blas,
-                                              sparse_blas_function_table_t>
-    function_tables;
-
-void init_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle) {
-    auto libkey = get_device_id(queue);
-    function_tables[libkey].init_matrix_handle(queue, p_handle);
-}
-
-sycl::event release_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle,
-                                  const std::vector<sycl::event> &dependencies) {
-    auto libkey = get_device_id(queue);
-    return function_tables[libkey].release_matrix_handle(queue, p_handle, dependencies);
-}
-
-#define DEFINE_SET_CSR_DATA(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX)                              \
-    template <>                                                                                    \
-    void set_csr_data(sycl::queue &queue, matrix_handle_t handle, INT_TYPE num_rows,               \
-                      INT_TYPE num_cols, INT_TYPE nnz, index_base index,                           \
-                      sycl::buffer<INT_TYPE, 1> &row_ptr, sycl::buffer<INT_TYPE, 1> &col_ind,      \
-                      sycl::buffer<FP_TYPE, 1> &val) {                                             \
-        auto libkey = get_device_id(queue);                                                        \
-        function_tables[libkey].set_csr_data_buffer##FP_SUFFIX##INT_SUFFIX(                        \
-            queue, handle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val);                 \
-    }                                                                                              \
-    template <>                                                                                    \
-    sycl::event set_csr_data(sycl::queue &queue, matrix_handle_t handle, INT_TYPE num_rows,        \
-                             INT_TYPE num_cols, INT_TYPE nnz, index_base index, INT_TYPE *row_ptr, \
-                             INT_TYPE *col_ind, FP_TYPE *val,                                      \
-                             const std::vector<sycl::event> &dependencies) {                       \
-        auto libkey = get_device_id(queue);                                                        \
-        return function_tables[libkey].set_csr_data_usm##FP_SUFFIX##INT_SUFFIX(                    \
-            queue, handle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val, dependencies);   \
-    }
-
-FOR_EACH_FP_AND_INT_TYPE(DEFINE_SET_CSR_DATA)
-#undef DEFINE_SET_CSR_DATA
-
-sycl::event optimize_gemm(sycl::queue &queue, transpose transpose_A, matrix_handle_t handle,
-                          const std::vector<sycl::event> &dependencies) {
-    auto libkey = get_device_id(queue);
-    return function_tables[libkey].optimize_gemm_v1(queue, transpose_A, handle, dependencies);
-}
-
-sycl::event optimize_gemm(sycl::queue &queue, transpose transpose_A, transpose transpose_B,
-                          layout dense_matrix_layout, const std::int64_t columns,
-                          matrix_handle_t handle, const std::vector<sycl::event> &dependencies) {
-    auto libkey = get_device_id(queue);
-    return function_tables[libkey].optimize_gemm_v2(
-        queue, transpose_A, transpose_B, dense_matrix_layout, columns, handle, dependencies);
-}
-
-sycl::event optimize_gemv(sycl::queue &queue, transpose transpose_val, matrix_handle_t handle,
-                          const std::vector<sycl::event> &dependencies) {
-    auto libkey = get_device_id(queue);
-    return function_tables[libkey].optimize_gemv(queue, transpose_val, handle, dependencies);
-}
-
-sycl::event optimize_trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
-                          matrix_handle_t handle, const std::vector<sycl::event> &dependencies) {
-    auto libkey = get_device_id(queue);
-    return function_tables[libkey].optimize_trsv(queue, uplo_val, transpose_val, diag_val, handle,
-                                                 dependencies);
-}
-
-#define DEFINE_GEMV(FP_TYPE, FP_SUFFIX)                                                           \
-    template <>                                                                                   \
-    void gemv(sycl::queue &queue, transpose transpose_val, const FP_TYPE alpha,                   \
-              matrix_handle_t A_handle, sycl::buffer<FP_TYPE, 1> &x, const FP_TYPE beta,          \
-              sycl::buffer<FP_TYPE, 1> &y) {                                                      \
-        auto libkey = get_device_id(queue);                                                       \
-        function_tables[libkey].gemv_buffer##FP_SUFFIX(queue, transpose_val, alpha, A_handle, x,  \
-                                                       beta, y);                                  \
-    }                                                                                             \
-    template <>                                                                                   \
-    sycl::event gemv(sycl::queue &queue, transpose transpose_val, const FP_TYPE alpha,            \
-                     matrix_handle_t A_handle, const FP_TYPE *x, const FP_TYPE beta, FP_TYPE *y,  \
-                     const std::vector<sycl::event> &dependencies) {                              \
-        auto libkey = get_device_id(queue);                                                       \
-        return function_tables[libkey].gemv_usm##FP_SUFFIX(queue, transpose_val, alpha, A_handle, \
-                                                           x, beta, y, dependencies);             \
-    }
-
-FOR_EACH_FP_TYPE(DEFINE_GEMV)
-#undef DEFINE_GEMV
-
-#define DEFINE_TRSV(FP_TYPE, FP_SUFFIX)                                                          \
-    template <>                                                                                  \
-    void trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,         \
-              matrix_handle_t A_handle, sycl::buffer<FP_TYPE, 1> &x,                             \
-              sycl::buffer<FP_TYPE, 1> &y) {                                                     \
-        auto libkey = get_device_id(queue);                                                      \
-        function_tables[libkey].trsv_buffer##FP_SUFFIX(queue, uplo_val, transpose_val, diag_val, \
-                                                       A_handle, x, y);                          \
-    }                                                                                            \
-    template <>                                                                                  \
-    sycl::event trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,  \
-                     matrix_handle_t A_handle, const FP_TYPE *x, FP_TYPE *y,                     \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        auto libkey = get_device_id(queue);                                                      \
-        return function_tables[libkey].trsv_usm##FP_SUFFIX(                                      \
-            queue, uplo_val, transpose_val, diag_val, A_handle, x, y, dependencies);             \
-    }
-
-FOR_EACH_FP_TYPE(DEFINE_TRSV)
-#undef DEFINE_TRSV
-
-#define DEFINE_GEMM(FP_TYPE, FP_SUFFIX)                                                          \
-    template <>                                                                                  \
-    void gemm(sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A,             \
-              transpose transpose_B, const FP_TYPE alpha, matrix_handle_t A_handle,              \
-              sycl::buffer<FP_TYPE, 1> &B, const std::int64_t columns, const std::int64_t ldb,   \
-              const FP_TYPE beta, sycl::buffer<FP_TYPE, 1> &C, const std::int64_t ldc) {         \
-        auto libkey = get_device_id(queue);                                                      \
-        function_tables[libkey].gemm_buffer##FP_SUFFIX(queue, dense_matrix_layout, transpose_A,  \
-                                                       transpose_B, alpha, A_handle, B, columns, \
-                                                       ldb, beta, C, ldc);                       \
-    }                                                                                            \
-    template <>                                                                                  \
-    sycl::event gemm(sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A,      \
-                     transpose transpose_B, const FP_TYPE alpha, matrix_handle_t A_handle,       \
-                     const FP_TYPE *B, const std::int64_t columns, const std::int64_t ldb,       \
-                     const FP_TYPE beta, FP_TYPE *C, const std::int64_t ldc,                     \
-                     const std::vector<sycl::event> &dependencies) {                             \
-        auto libkey = get_device_id(queue);                                                      \
-        return function_tables[libkey].gemm_usm##FP_SUFFIX(                                      \
-            queue, dense_matrix_layout, transpose_A, transpose_B, alpha, A_handle, B, columns,   \
-            ldb, beta, C, ldc, dependencies);                                                    \
-    }
-
-FOR_EACH_FP_TYPE(DEFINE_GEMM)
-#undef DEFINE_GEMM
-
-} // namespace oneapi::mkl::sparse
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
deleted file mode 100644
index ecdafe8c0..000000000
--- a/tests/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build GoogleTest first
-add_subdirectory(${PROJECT_SOURCE_DIR}/deps/googletest ${CMAKE_BINARY_DIR}/deps/googletest)
-
-# Build Unit Tests
-add_subdirectory(unit_tests)
diff --git a/tests/README.md b/tests/README.md
deleted file mode 100644
index 3a8346057..000000000
--- a/tests/README.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# oneMKL Interfaces Testing
-
-## Overview
-Inside the `unit_tests` directory, there are domain-level directories which contain domain-specific tests, usually per function or per configuration.
-
-See [Building and Running Tests](https://oneapi-src.github.io/oneMKL/building_and_running_tests.html) documentation for more information about how to build and run the tests.
-
-[GoogleTest](https://github.com/google/googletest) is used as the unit-testing framework.
-
-
-*Refer to `<path to onemkl>/deps/googletest/LICENSE` for GoogleTest license.*
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt
deleted file mode 100644
index e7fe8e110..000000000
--- a/tests/unit_tests/CMakeLists.txt
+++ /dev/null
@@ -1,228 +0,0 @@
-#===============================================================================
-# Copyright 2020-2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-if("blas" IN_LIST TARGET_DOMAINS)
-  find_package(CBLAS REQUIRED)
-endif()
-
-if("lapack" IN_LIST TARGET_DOMAINS)
-    find_package(LAPACKE REQUIRED)
-endif()
-
-foreach(domain ${TARGET_DOMAINS})
-  # Build tests first
-  add_subdirectory(${domain})
-endforeach()
-
-include(GoogleTest)
-
-get_target_property(GTEST_INCLUDE_DIR gtest INTERFACE_INCLUDE_DIRECTORIES)
-
-# Build final test binaries: test_main_rt is for testing RunTime API (RT), test_main_ct is for testing CompileTime API (CT)
-
-# BLAS config
-set(blas_TEST_LIST
-        blas_level1
-        blas_level2
-        blas_level3
-        blas_batch
-        blas_extensions)
-
-set(blas_TEST_LINK "")
-
-# LAPACK config
-set(lapack_TEST_LIST
-        lapack_source)
-
-set(lapack_TEST_LINK ${LAPACKE_LINK})
-
-# RNG config
-set(rng_TEST_LIST
-      rng_statistics
-      rng_service)
-set(rng_DEVICE_TEST_LIST
-      rng_device_moments
-      rng_device_service
-)
-
-set(rng_TEST_LINK "")
-
-# DFT config
-set(dft_TEST_LIST
-      dft_source)
-
-set(dft_TEST_LINK "")
-
-# Sparse BLAS config
-set(sparse_blas_TEST_LIST
-      spblas_source)
-
-set(sparse_blas_TEST_LINK "")
-
-foreach(domain ${TARGET_DOMAINS})
-  # Generate RT and CT test lists
-  set(${domain}_TEST_LIST_RT ${${domain}_TEST_LIST})
-  set(${domain}_TEST_LIST_CT ${${domain}_TEST_LIST})
-  set(${domain}_DEVICE_TEST_LIST_CT ${${domain}_DEVICE_TEST_LIST})
-  list(TRANSFORM ${domain}_TEST_LIST_RT APPEND _rt)
-  list(TRANSFORM ${domain}_TEST_LIST_CT APPEND _ct)
-  list(TRANSFORM ${domain}_DEVICE_TEST_LIST_CT APPEND _ct)
-
-  add_executable(test_main_${domain}_ct main_test.cpp)
-  target_include_directories(test_main_${domain}_ct PUBLIC ${GTEST_INCLUDE_DIR})
-  
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET test_main_${domain}_ct SOURCES main_test.cpp)
-  else()
-    target_compile_options(test_main_${domain}_ct PRIVATE -fsycl)
-  endif()
-
-  if(BUILD_SHARED_LIBS)
-    add_executable(test_main_${domain}_rt main_test.cpp)
-    target_include_directories(test_main_${domain}_rt PUBLIC ${GTEST_INCLUDE_DIR})
-    if(NOT ${ONEMKL_SYCL_IMPLEMENTATION} STREQUAL "hipsycl")
-      target_compile_options(test_main_${domain}_rt PRIVATE -fsycl)
-    endif()
-    target_link_libraries(test_main_${domain}_rt PUBLIC
-      gtest
-      gtest_main
-      ${CMAKE_DL_LIBS}
-      ${${domain}_TEST_LINK}
-      ONEMKL::SYCL::SYCL
-      onemkl
-      ${${domain}_TEST_LIST_RT}
-    )
-    if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-      add_sycl_to_target(TARGET test_main_${domain}_rt SOURCES main_test.cpp)
-    endif()
-  endif()
-
-  if(ENABLE_MKLCPU_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_mklcpu)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_mklcpu)
-  endif()
-
-  if(ENABLE_MKLGPU_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_mklgpu)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_mklgpu)
-  endif()
-
-  if(domain STREQUAL "blas" AND ENABLE_CUBLAS_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_cublas)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_cublas)
-  endif()
-
-  if(domain STREQUAL "blas" AND ENABLE_ROCBLAS_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_rocblas)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_rocblas)
-  endif()
-
-  if(domain STREQUAL "blas" AND ENABLE_NETLIB_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_netlib)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_netlib)
-  endif()
-
-  if(domain STREQUAL "blas" AND ENABLE_PORTBLAS_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_portblas)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_portblas)
-  endif()
-
-  if(domain STREQUAL "lapack" AND ENABLE_CUSOLVER_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_cusolver)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_cusolver)
-  endif()
-
-  if(domain STREQUAL "lapack" AND ENABLE_ROCSOLVER_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_rocsolver)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_rocsolver)
-  endif()
-  
-  if(domain STREQUAL "rng" AND ENABLE_CURAND_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_curand)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_curand)
-  endif()
-
-  if(domain STREQUAL "rng" AND ENABLE_ROCRAND_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_rocrand)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_rocrand)
-  endif()
-
-  if(domain STREQUAL "dft" AND ENABLE_CUFFT_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_${domain}_cufft)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_cufft)
-  endif()
-
-  if(domain STREQUAL "dft" AND ENABLE_ROCFFT_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_dft_rocfft)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_dft_rocfft)
-  endif()
-
-  if(domain STREQUAL "dft" AND ENABLE_PORTFFT_BACKEND)
-    add_dependencies(test_main_${domain}_ct onemkl_dft_portfft)
-    list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_dft_portfft)
-  endif()
-
-  target_link_libraries(test_main_${domain}_ct PUBLIC
-      gtest
-      gtest_main
-      ${CMAKE_DL_LIBS}
-      ${${domain}_TEST_LINK}
-      ${ONEMKL_LIBRARIES_${domain}}
-      ONEMKL::SYCL::SYCL
-      ${${domain}_TEST_LIST_CT}
-      ${${domain}_DEVICE_TEST_LIST_CT}
-  )
-  
-  if(NOT ${ONEMKL_SYCL_IMPLEMENTATION} STREQUAL "hipsycl")
-    target_link_options(test_main_${domain}_ct PUBLIC -fsycl-device-code-split=per_kernel)
-  endif()
-
-  string(TOUPPER ${domain} DOMAIN_PREFIX)
-
-  if(domain STREQUAL "blas")
-    set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:${CBLAS_LIB_DIR}:$ENV{LD_LIBRARY_PATH})
-  else()
-    set(TEST_LD_LIBRARY_PATH ${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH})
-  endif()
-
-  if(BUILD_SHARED_LIBS)
-    set_target_properties(test_main_${domain}_rt
-      PROPERTIES BUILD_RPATH $<TARGET_FILE_DIR:onemkl>)
-  # Find individual tests within executable
-    gtest_discover_tests(test_main_${domain}_rt
-      PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib
-      PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${TEST_LD_LIBRARY_PATH}
-      PROPERTIES TEST_PREFIX ${DOMAIN_PREFIX}/RT/
-      DISCOVERY_TIMEOUT 30
-    )
-  endif()
-
-  gtest_discover_tests(test_main_${domain}_ct
-    PROPERTIES BUILD_RPATH ${CMAKE_BINARY_DIR}/lib
-    PROPERTIES ENVIRONMENT LD_LIBRARY_PATH=${TEST_LD_LIBRARY_PATH}
-    PROPERTIES TEST_PREFIX ${DOMAIN_PREFIX}/CT/
-    DISCOVERY_TIMEOUT 30
-  )
-
-  if(BUILD_SHARED_LIBS)
-    if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-      add_sycl_to_target(TARGET test_main_${domain}_rt)
-    endif()
-  endif()
-endforeach()
diff --git a/tests/unit_tests/blas/CMakeLists.txt b/tests/unit_tests/blas/CMakeLists.txt
deleted file mode 100644
index 74034decb..000000000
--- a/tests/unit_tests/blas/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(level1)
-add_subdirectory(level2)
-add_subdirectory(level3)
-add_subdirectory(batch)
-add_subdirectory(extensions)
diff --git a/tests/unit_tests/blas/batch/CMakeLists.txt b/tests/unit_tests/blas/batch/CMakeLists.txt
deleted file mode 100644
index 0f47f37af..000000000
--- a/tests/unit_tests/blas/batch/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(BATCH_SOURCES "copy_batch_stride.cpp" "axpy_batch_stride.cpp" "dgmm_batch_stride.cpp" "gemm_batch_stride.cpp" "gemv_batch_stride.cpp" "trsm_batch_stride.cpp" "syrk_batch_stride.cpp" "copy_batch_usm.cpp" "copy_batch_stride_usm.cpp" "axpy_batch_usm.cpp" "axpy_batch_stride_usm.cpp" "dgmm_batch_usm.cpp" "dgmm_batch_stride_usm.cpp" "gemm_batch_usm.cpp" "gemm_batch_stride_usm.cpp" "gemv_batch_usm.cpp" "gemv_batch_stride_usm.cpp" "trsm_batch_usm.cpp" "trsm_batch_stride_usm.cpp" "syrk_batch_usm.cpp" "syrk_batch_stride_usm.cpp" "omatcopy_batch_stride.cpp" "omatcopy_batch_stride_usm.cpp" "imatcopy_batch_stride.cpp" "imatcopy_batch_stride_usm.cpp" "omatadd_batch_stride.cpp" "omatadd_batch_stride_usm.cpp" "omatcopy_batch_usm.cpp" "imatcopy_batch_usm.cpp")
-
-if(BUILD_SHARED_LIBS)
-  add_library(blas_batch_rt OBJECT ${BATCH_SOURCES})
-  target_compile_options(blas_batch_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(blas_batch_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-      PUBLIC ${CBLAS_INCLUDE}
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET blas_batch_rt SOURCES ${BATCH_SOURCES})
-  else()
-    target_link_libraries(blas_batch_rt PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-endif()
-
-add_library(blas_batch_ct OBJECT ${BATCH_SOURCES})
-target_compile_options(blas_batch_ct PRIVATE  -DNOMINMAX)
-target_include_directories(blas_batch_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-    PUBLIC ${CBLAS_INCLUDE}
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET blas_batch_ct SOURCES ${BATCH_SOURCES})
-else()
-  target_link_libraries(blas_batch_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
diff --git a/tests/unit_tests/blas/batch/axpy_batch_stride.cpp b/tests/unit_tests/blas/batch/axpy_batch_stride.cpp
deleted file mode 100644
index 9bb1406ef..000000000
--- a/tests/unit_tests/blas/batch/axpy_batch_stride.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp alpha,
-         int64_t batch_size) {
-    // Prepare data.
-    int64_t n, i;
-
-    n = 1357;
-
-    int64_t stride_x, stride_y;
-    stride_x = n * std::abs(incx);
-    stride_y = n * std::abs(incy);
-
-    vector<fp, allocator_helper<fp, 64>> x(stride_x * batch_size);
-    vector<fp, allocator_helper<fp, 64>> y(stride_y * batch_size), y_ref(stride_y * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(x.data() + stride_x * i, n, incx);
-        rand_vector(y.data() + stride_y * i, n, incy);
-    }
-
-    y_ref = y;
-
-    // Call reference AXPY_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int incy_ref = (int)incy;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::axpy(&n_ref, (fp_ref *)&alpha, (fp_ref *)x.data() + i * stride_x, &incx_ref,
-               (fp_ref *)y_ref.data() + i * stride_y, &incy_ref);
-    }
-
-    // Call DPC++ AXPY_BATCH_STRIDE.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer(x.data(), range<1>(x.size()));
-    buffer<fp, 1> y_buffer(y.data(), range<1>(y.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::axpy_batch(main_queue, n, alpha, x_buffer, incx,
-                                                            stride_x, y_buffer, incy, stride_y,
-                                                            batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::axpy_batch(main_queue, n, alpha, x_buffer, incx,
-                                                         stride_x, y_buffer, incy, stride_y,
-                                                         batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpy_batch, n,
-                                        alpha, x_buffer, incx, stride_x, y_buffer, incy, stride_y,
-                                        batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpy_batch, n,
-                                        alpha, x_buffer, incx, stride_x, y_buffer, incy, stride_y,
-                                        batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during AXPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of AXPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good && check_equal_vector(y_accessor.get_pointer() + i * stride_y,
-                                          y_ref.data() + i * stride_y, n, incy, n, std::cout);
-    }
-    return (int)good;
-}
-
-class AxpyBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(AxpyBatchStrideTests, RealSinglePrecision) {
-    float alpha = 2.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, alpha, 15));
-}
-
-TEST_P(AxpyBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha = 2.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, alpha, 15));
-}
-
-TEST_P(AxpyBatchStrideTests, ComplexSinglePrecision) {
-    std::complex<float> alpha = std::complex<float>(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2,
-                                                3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1,
-                                                1, alpha, 15));
-}
-
-TEST_P(AxpyBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha = std::complex<double>(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 2, 3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1, 1, alpha, 15));
-}
-
-INSTANTIATE_TEST_SUITE_P(AxpyBatchStrideTestSuite, AxpyBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp
deleted file mode 100644
index 9ebc82abe..000000000
--- a/tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp alpha,
-         int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t n, i;
-
-    n = 1357;
-
-    int64_t stride_x, stride_y;
-    stride_x = n * std::abs(incx);
-    stride_y = n * std::abs(incy);
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), y_ref(ua);
-
-    x.resize(stride_x * batch_size);
-    y.resize(stride_y * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(&x[stride_x * i], n, incx);
-        rand_vector(&y[stride_y * i], n, incy);
-    }
-
-    y_ref.resize(y.size());
-    for (int i = 0; i < y.size(); i++)
-        y_ref[i] = y[i];
-
-    // Call reference AXPY_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int incy_ref = (int)incy;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::axpy(&n_ref, (fp_ref *)&alpha, (fp_ref *)x.data() + i * stride_x, &incx_ref,
-               (fp_ref *)y_ref.data() + i * stride_y, &incy_ref);
-    }
-
-    // Call DPC++ AXPY_BATCH_STRIDE.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::axpy_batch(
-                    main_queue, n, alpha, &x[0], incx, stride_x, &y[0], incy, stride_y, batch_size,
-                    dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::axpy_batch(main_queue, n, alpha, &x[0], incx,
-                                                                stride_x, &y[0], incy, stride_y,
-                                                                batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpy_batch, n,
-                                        alpha, &x[0], incx, stride_x, &y[0], incy, stride_y,
-                                        batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpy_batch, n,
-                                        alpha, &x[0], incx, stride_x, &y[0], incy, stride_y,
-                                        batch_size, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during AXPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of AXPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good &&
-               check_equal_vector(&y[i * stride_y], &y_ref[i * stride_y], n, incy, n, std::cout);
-    }
-    return (int)good;
-}
-
-class AxpyBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(AxpyBatchStrideUsmTests, RealSinglePrecision) {
-    float alpha = 2.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, alpha, 15));
-}
-
-TEST_P(AxpyBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha = 2.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, alpha, 15));
-}
-
-TEST_P(AxpyBatchStrideUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha = std::complex<float>(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2,
-                                                3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1,
-                                                1, alpha, 15));
-}
-
-TEST_P(AxpyBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha = std::complex<double>(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 2, 3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 -2, -3, alpha, 15));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1, 1, alpha, 15));
-}
-
-INSTANTIATE_TEST_SUITE_P(AxpyBatchStrideUsmTestSuite, AxpyBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/axpy_batch_usm.cpp b/tests/unit_tests/blas/batch/axpy_batch_usm.cpp
deleted file mode 100644
index 4dacf8ddb..000000000
--- a/tests/unit_tests/blas/batch/axpy_batch_usm.cpp
+++ /dev/null
@@ -1,285 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t *n =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-    int64_t *incx =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-    int64_t *incy =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-    fp *alpha = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * group_count, *dev, cxt);
-    int64_t *group_size =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-
-    if ((n == NULL) || (incx == NULL) || (incy == NULL) || (alpha == NULL) ||
-        (group_size == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        oneapi::mkl::free_shared(n, cxt);
-        oneapi::mkl::free_shared(incx, cxt);
-        oneapi::mkl::free_shared(incy, cxt);
-        oneapi::mkl::free_shared(alpha, cxt);
-        oneapi::mkl::free_shared(group_size, cxt);
-        return false;
-    }
-
-    int64_t i;
-    int64_t j, idx = 0;
-    int64_t total_size_x, total_size_y;
-    int64_t total_batch_count = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 100;
-        n[i] = 1 + std::rand() % 500;
-        incx[i] = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
-        incy[i] = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
-        alpha[i] = rand_scalar<fp>();
-        total_batch_count += group_size[i];
-    }
-
-    fp **x_array =
-        (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt);
-    fp **y_array =
-        (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt);
-    fp **y_ref_array =
-        (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt);
-
-    if ((x_array == NULL) || (y_array == NULL) || (y_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        oneapi::mkl::free_shared(x_array, cxt);
-        oneapi::mkl::free_shared(y_array, cxt);
-        oneapi::mkl::free_shared(y_ref_array, cxt);
-        return false;
-    }
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            total_size_x = (1 + (n[i] - 1) * std::abs(incx[i]));
-            total_size_y = (1 + (n[i] - 1) * std::abs(incy[i]));
-            x_array[idx] =
-                (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_x, *dev, cxt);
-            y_array[idx] =
-                (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt);
-            y_ref_array[idx] =
-                (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt);
-            rand_vector(x_array[idx], n[i], incx[i]);
-            rand_vector(y_array[idx], n[i], incy[i]);
-            copy_vector(y_array[idx], n[i], incy[i], y_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference AXPY_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref, incx_ref, incy_ref;
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            n_ref = (int)n[i];
-            incx_ref = (int)incx[i];
-            incy_ref = (int)incy[i];
-            ::axpy((const int *)&n_ref, (const fp_ref *)&alpha[i], (const fp_ref *)x_array[idx],
-                   (const int *)&incx_ref, (fp_ref *)y_ref_array[idx], (const int *)&incy_ref);
-            idx++;
-        }
-    }
-
-    // Call DPC++ AXPY_BATCH.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::axpy_batch(
-                    main_queue, n, alpha, (const fp **)x_array, incx, y_array, incy, group_count,
-                    group_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::axpy_batch(
-                    main_queue, n, alpha, (const fp **)x_array, incx, y_array, incy, group_count,
-                    group_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpy_batch, n,
-                                        alpha, (const fp **)x_array, incx, y_array, incy,
-                                        group_count, group_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpy_batch, n,
-                                        alpha, (const fp **)x_array, incx, y_array, incy,
-                                        group_count, group_size, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during AXPY_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(x_array[idx], cxt);
-                oneapi::mkl::free_shared(y_array[idx], cxt);
-                oneapi::mkl::free_shared(y_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        oneapi::mkl::free_shared(n, cxt);
-        oneapi::mkl::free_shared(incx, cxt);
-        oneapi::mkl::free_shared(incy, cxt);
-        oneapi::mkl::free_shared(alpha, cxt);
-        oneapi::mkl::free_shared(group_size, cxt);
-        oneapi::mkl::free_shared(x_array, cxt);
-        oneapi::mkl::free_shared(y_array, cxt);
-        oneapi::mkl::free_shared(y_ref_array, cxt);
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of AXPY_BATCH:\n" << error.what() << std::endl;
-    }
-
-    bool good = true;
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            good = good && check_equal_vector(y_array[idx], y_ref_array[idx], n[i], incy[i], n[i],
-                                              std::cout);
-            idx++;
-        }
-    }
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(x_array[idx], cxt);
-            oneapi::mkl::free_shared(y_array[idx], cxt);
-            oneapi::mkl::free_shared(y_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-    oneapi::mkl::free_shared(n, cxt);
-    oneapi::mkl::free_shared(incx, cxt);
-    oneapi::mkl::free_shared(incy, cxt);
-    oneapi::mkl::free_shared(alpha, cxt);
-    oneapi::mkl::free_shared(group_size, cxt);
-    oneapi::mkl::free_shared(x_array, cxt);
-    oneapi::mkl::free_shared(y_array, cxt);
-    oneapi::mkl::free_shared(y_ref_array, cxt);
-
-    return (int)good;
-}
-
-class AxpyBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(AxpyBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(AxpyBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(AxpyBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(AxpyBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(AxpyBatchUsmTestSuite, AxpyBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/copy_batch_stride.cpp b/tests/unit_tests/blas/batch/copy_batch_stride.cpp
deleted file mode 100644
index a1da595f6..000000000
--- a/tests/unit_tests/blas/batch/copy_batch_stride.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) {
-    // Prepare data.
-    int64_t n, i;
-
-    n = 1357;
-
-    int64_t stride_x, stride_y;
-    stride_x = n * std::abs(incx);
-    stride_y = n * std::abs(incy);
-
-    vector<fp, allocator_helper<fp, 64>> x(stride_x * batch_size);
-    vector<fp, allocator_helper<fp, 64>> y(stride_y * batch_size), y_ref(stride_y * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(x.data() + stride_x * i, n, incx);
-        rand_vector(y.data() + stride_y * i, n, incy);
-    }
-
-    y_ref = y;
-
-    // Call reference COPY_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int incy_ref = (int)incy;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::copy(&n_ref, (fp_ref *)x.data() + i * stride_x, &incx_ref,
-               (fp_ref *)y_ref.data() + i * stride_y, &incy_ref);
-    }
-
-    // Call DPC++ COPY_BATCH_STRIDE.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during COPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer(x.data(), range<1>(x.size()));
-    buffer<fp, 1> y_buffer(y.data(), range<1>(y.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::copy_batch(main_queue, n, x_buffer, incx, stride_x,
-                                                            y_buffer, incy, stride_y, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::copy_batch(main_queue, n, x_buffer, incx, stride_x,
-                                                         y_buffer, incy, stride_y, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::copy_batch, n,
-                                        x_buffer, incx, stride_x, y_buffer, incy, stride_y,
-                                        batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::copy_batch, n,
-                                        x_buffer, incx, stride_x, y_buffer, incy, stride_y,
-                                        batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during COPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of COPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good && check_equal_vector(y_accessor.get_pointer() + i * stride_y,
-                                          y_ref.data() + i * stride_y, n, incy, n, std::cout);
-    }
-    return (int)good;
-}
-
-class CopyBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(CopyBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-TEST_P(CopyBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-TEST_P(CopyBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-TEST_P(CopyBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-INSTANTIATE_TEST_SUITE_P(CopyBatchStrideTestSuite, CopyBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp
deleted file mode 100644
index 569293be1..000000000
--- a/tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during COPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t n, i;
-
-    n = 1357;
-
-    int64_t stride_x, stride_y;
-    stride_x = n * std::abs(incx);
-    stride_y = n * std::abs(incy);
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), y_ref(ua);
-
-    x.resize(stride_x * batch_size);
-    y.resize(stride_y * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(&x[stride_x * i], n, incx);
-        rand_vector(&y[stride_y * i], n, incy);
-    }
-
-    y_ref.resize(y.size());
-    for (int i = 0; i < y.size(); i++)
-        y_ref[i] = y[i];
-
-    // Call reference COPY_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int incy_ref = (int)incy;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::copy(&n_ref, (fp_ref *)x.data() + i * stride_x, &incx_ref,
-               (fp_ref *)y_ref.data() + i * stride_y, &incy_ref);
-    }
-
-    // Call DPC++ COPY_BATCH_STRIDE.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::copy_batch(main_queue, n, &x[0], incx,
-                                                                   stride_x, &y[0], incy, stride_y,
-                                                                   batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::copy_batch(main_queue, n, &x[0], incx,
-                                                                stride_x, &y[0], incy, stride_y,
-                                                                batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::copy_batch, n,
-                                        &x[0], incx, stride_x, &y[0], incy, stride_y, batch_size,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::copy_batch, n,
-                                        &x[0], incx, stride_x, &y[0], incy, stride_y, batch_size,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during COPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of COPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good &&
-               check_equal_vector(&y[i * stride_y], &y_ref[i * stride_y], n, incy, n, std::cout);
-    }
-    return (int)good;
-}
-
-class CopyBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(CopyBatchStrideUsmTests, RealSinglePrecision) {
-    float alpha = 2.0;
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-TEST_P(CopyBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha = 2.0;
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-TEST_P(CopyBatchStrideUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha = std::complex<float>(2.0, -0.5);
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-TEST_P(CopyBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha = std::complex<double>(2.0, -0.5);
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 15));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 15));
-}
-
-INSTANTIATE_TEST_SUITE_P(CopyBatchStrideUsmTestSuite, CopyBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/copy_batch_usm.cpp b/tests/unit_tests/blas/batch/copy_batch_usm.cpp
deleted file mode 100644
index 8cac23704..000000000
--- a/tests/unit_tests/blas/batch/copy_batch_usm.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during COPY_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t *n =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-    int64_t *incx =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-    int64_t *incy =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-    int64_t *group_size =
-        (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt);
-
-    if ((n == NULL) || (incx == NULL) || (incy == NULL) || (group_size == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        oneapi::mkl::free_shared(n, cxt);
-        oneapi::mkl::free_shared(incx, cxt);
-        oneapi::mkl::free_shared(incy, cxt);
-        oneapi::mkl::free_shared(group_size, cxt);
-        return false;
-    }
-
-    int64_t i;
-    int64_t j, idx = 0;
-    int64_t total_size_x, total_size_y;
-    int64_t total_batch_count = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 100;
-        n[i] = 1 + std::rand() % 500;
-        incx[i] = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
-        incy[i] = ((std::rand() % 2) == 0) ? 1 + std::rand() % 2 : -1 - std::rand() % 2;
-        total_batch_count += group_size[i];
-    }
-
-    fp **x_array =
-        (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt);
-    fp **y_array =
-        (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt);
-    fp **y_ref_array =
-        (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt);
-
-    if ((x_array == NULL) || (y_array == NULL) || (y_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        oneapi::mkl::free_shared(x_array, cxt);
-        oneapi::mkl::free_shared(y_array, cxt);
-        oneapi::mkl::free_shared(y_ref_array, cxt);
-        return false;
-    }
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            total_size_x = (1 + (n[i] - 1) * std::abs(incx[i]));
-            total_size_y = (1 + (n[i] - 1) * std::abs(incy[i]));
-            x_array[idx] =
-                (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_x, *dev, cxt);
-            y_array[idx] =
-                (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt);
-            y_ref_array[idx] =
-                (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt);
-            rand_vector(x_array[idx], n[i], incx[i]);
-            rand_vector(y_array[idx], n[i], incy[i]);
-            copy_vector(y_array[idx], n[i], incy[i], y_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference COPY_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref, incx_ref, incy_ref;
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            n_ref = (int)n[i];
-            incx_ref = (int)incx[i];
-            incy_ref = (int)incy[i];
-            ::copy((const int *)&n_ref, (const fp_ref *)x_array[idx], (const int *)&incx_ref,
-                   (fp_ref *)y_ref_array[idx], (const int *)&incy_ref);
-            idx++;
-        }
-    }
-
-    // Call DPC++ COPY_BATCH.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::copy_batch(
-                    main_queue, n, (const fp **)x_array, incx, y_array, incy, group_count,
-                    group_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::copy_batch(main_queue, n, (const fp **)x_array,
-                                                                incx, y_array, incy, group_count,
-                                                                group_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::copy_batch, n,
-                                        (const fp **)x_array, incx, y_array, incy, group_count,
-                                        group_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::copy_batch, n,
-                                        (const fp **)x_array, incx, y_array, incy, group_count,
-                                        group_size, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during COPY_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(x_array[idx], cxt);
-                oneapi::mkl::free_shared(y_array[idx], cxt);
-                oneapi::mkl::free_shared(y_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        oneapi::mkl::free_shared(n, cxt);
-        oneapi::mkl::free_shared(incx, cxt);
-        oneapi::mkl::free_shared(incy, cxt);
-        oneapi::mkl::free_shared(group_size, cxt);
-        oneapi::mkl::free_shared(x_array, cxt);
-        oneapi::mkl::free_shared(y_array, cxt);
-        oneapi::mkl::free_shared(y_ref_array, cxt);
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of COPY_BATCH:\n" << error.what() << std::endl;
-    }
-
-    bool good = true;
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            good = good && check_equal_vector(y_array[idx], y_ref_array[idx], n[i], incy[i], n[i],
-                                              std::cout);
-            idx++;
-        }
-    }
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(x_array[idx], cxt);
-            oneapi::mkl::free_shared(y_array[idx], cxt);
-            oneapi::mkl::free_shared(y_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-    oneapi::mkl::free_shared(n, cxt);
-    oneapi::mkl::free_shared(incx, cxt);
-    oneapi::mkl::free_shared(incy, cxt);
-    oneapi::mkl::free_shared(group_size, cxt);
-    oneapi::mkl::free_shared(x_array, cxt);
-    oneapi::mkl::free_shared(y_array, cxt);
-    oneapi::mkl::free_shared(y_ref_array, cxt);
-
-    return (int)good;
-}
-
-class CopyBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(CopyBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(CopyBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(CopyBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(CopyBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(CopyBatchUsmTestSuite, CopyBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/dgmm_batch_stride.cpp b/tests/unit_tests/blas/batch/dgmm_batch_stride.cpp
deleted file mode 100644
index bb642c3ee..000000000
--- a/tests/unit_tests/blas/batch/dgmm_batch_stride.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, int64_t incx,
-         int64_t batch_size) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldc;
-    int64_t i, tmp;
-
-    batch_size = 15;
-    m = 25;
-    n = 30;
-    lda = 38;
-    ldc = 42;
-
-    int x_len = (left_right == oneapi::mkl::side::right) ? n : m;
-
-    int64_t stride_a, stride_x, stride_c;
-    stride_x = x_len * std::abs(incx);
-    stride_a = lda * std::max(m, n);
-    stride_c = ldc * std::max(m, n);
-
-    vector<fp, allocator_helper<fp, 64>> x(stride_x * batch_size), A(stride_a * batch_size);
-    vector<fp, allocator_helper<fp, 64>> C(stride_c * batch_size), C_ref(stride_c * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(x.data() + stride_x * i, x_len, incx);
-        rand_matrix(A.data() + stride_a * i, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-        rand_matrix(C.data() + stride_c * i, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    }
-
-    C_ref = C;
-
-    // Call reference DGMM_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int lda_ref = (int)lda;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::dgmm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-               (const int *)&m_ref, (const int *)&n_ref, (const fp_ref *)(A.data() + stride_a * i),
-               (const int *)&lda_ref, (const fp_ref *)(x.data() + stride_x * i),
-               (const int *)&incx_ref, (fp_ref *)(C_ref.data() + stride_c * i),
-               (const int *)&ldc_ref);
-    }
-
-    // Call DPC++ DGMM_BATCH_STRIDE.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during DGMM_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer(x.data(), range<1>(x.size()));
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::dgmm_batch(main_queue, left_right, m, n, A_buffer,
-                                                            lda, stride_a, x_buffer, incx, stride_x,
-                                                            C_buffer, ldc, stride_c, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::dgmm_batch(main_queue, left_right, m, n, A_buffer,
-                                                         lda, stride_a, x_buffer, incx, stride_x,
-                                                         C_buffer, ldc, stride_c, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dgmm_batch,
-                                        left_right, m, n, A_buffer, lda, stride_a, x_buffer, incx,
-                                        stride_x, C_buffer, ldc, stride_c, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dgmm_batch,
-                                        left_right, m, n, A_buffer, lda, stride_a, x_buffer, incx,
-                                        stride_x, C_buffer, ldc, stride_c, batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during DGMM_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of DGMM_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good &&
-               check_equal_matrix(C_accessor.get_pointer() + i * stride_c,
-                                  C_ref.data() + i * stride_c, layout, m, n, ldc, 1, std::cout);
-    }
-    return (int)good;
-}
-
-class DgmmBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(DgmmBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, 1, 5));
-}
-
-TEST_P(DgmmBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, 1, 5));
-}
-
-TEST_P(DgmmBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, 1, 5));
-}
-
-TEST_P(DgmmBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, 1, 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(DgmmBatchStrideTestSuite, DgmmBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp
deleted file mode 100644
index bb9cf0df3..000000000
--- a/tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, int64_t incx,
-         int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during DGMM_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldc;
-    int64_t i, tmp;
-
-    batch_size = 15;
-    m = 25;
-    n = 30;
-    lda = 38;
-    ldc = 42;
-
-    int x_len = (left_right == oneapi::mkl::side::right) ? n : m;
-
-    int64_t stride_a, stride_x, stride_c;
-    stride_x = x_len * std::abs(incx);
-    stride_a = lda * std::max(m, n);
-    stride_c = ldc * std::max(m, n);
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua), C(ua), C_ref(ua);
-
-    x.resize(stride_x * batch_size);
-    A.resize(stride_a * batch_size);
-    C.resize(stride_c * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(&x[stride_x * i], x_len, incx);
-        rand_matrix(&A[stride_a * i], layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-        rand_matrix(&C[stride_c * i], layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    }
-
-    C_ref.resize(C.size());
-    for (int i = 0; i < C.size(); i++)
-        C_ref[i] = C[i];
-
-    // Call reference DGMM_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int lda_ref = (int)lda;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::dgmm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-               (const int *)&m_ref, (const int *)&n_ref, (const fp_ref *)(A.data() + stride_a * i),
-               (const int *)&lda_ref, (const fp_ref *)(x.data() + stride_x * i),
-               (const int *)&incx_ref, (fp_ref *)(C_ref.data() + stride_c * i),
-               (const int *)&ldc_ref);
-    }
-
-    // Call DPC++ DGMM_BATCH_STRIDE.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::dgmm_batch(
-                    main_queue, left_right, m, n, &A[0], lda, stride_a, &x[0], incx, stride_x,
-                    &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::dgmm_batch(
-                    main_queue, left_right, m, n, &A[0], lda, stride_a, &x[0], incx, stride_x,
-                    &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dgmm_batch,
-                                        left_right, m, n, &A[0], lda, stride_a, &x[0], incx,
-                                        stride_x, &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dgmm_batch,
-                                        left_right, m, n, &A[0], lda, stride_a, &x[0], incx,
-                                        stride_x, &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during DGMM_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of DGMM_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good && check_equal_matrix(&C[i * stride_c], &C_ref[i * stride_c], layout, m, n, ldc,
-                                          1, std::cout);
-    }
-    return (int)good;
-}
-
-class DgmmBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(DgmmBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, 1, 5));
-}
-
-TEST_P(DgmmBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, 1, 5));
-}
-
-TEST_P(DgmmBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, 1, 5));
-}
-
-TEST_P(DgmmBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, 1, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, 2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, -2, 5));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, 1, 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(DgmmBatchStrideUsmTestSuite, DgmmBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/dgmm_batch_usm.cpp b/tests/unit_tests/blas/batch/dgmm_batch_usm.cpp
deleted file mode 100644
index 1f568580f..000000000
--- a/tests/unit_tests/blas/batch/dgmm_batch_usm.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "allocator_helper.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during DGMM_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto uaint = usm_allocator<int64_t, usm::alloc::shared, 64>(cxt, *dev);
-    vector<int64_t, decltype(uaint)> m(uaint), n(uaint), lda(uaint), incx(uaint), ldc(uaint),
-        group_size(uaint);
-
-    auto uaside = usm_allocator<oneapi::mkl::side, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::side, decltype(uaside)> left_right(uaside);
-
-    m.resize(group_count);
-    n.resize(group_count);
-    lda.resize(group_count);
-    incx.resize(group_count);
-    ldc.resize(group_count);
-    group_size.resize(group_count);
-    left_right.resize(group_count);
-
-    int64_t i, tmp;
-    int64_t j, idx = 0;
-    int64_t total_batch_count = 0;
-    int64_t x_len = 0;
-    int64_t size_a = 0, size_x = 0, size_c = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i] = 1 + std::rand() % 500;
-        n[i] = 1 + std::rand() % 500;
-        lda[i] = std::max(m[i], n[i]);
-        incx[i] = -3 + std::rand() % 6;
-        incx[i] = (incx[i] == 0) ? 3 : incx[i];
-        ldc[i] = std::max(m[i], n[i]);
-        left_right[i] = (oneapi::mkl::side)(std::rand() % 2);
-        total_batch_count += group_size[i];
-    }
-
-    auto uafpp = usm_allocator<fp *, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp *, decltype(uafpp)> a_array(uafpp), x_array(uafpp), c_array(uafpp),
-        c_ref_array(uafpp);
-    a_array.resize(total_batch_count);
-    x_array.resize(total_batch_count);
-    c_array.resize(total_batch_count);
-    c_ref_array.resize(total_batch_count);
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        size_a = (layout == oneapi::mkl::layout::col_major) ? lda[i] * n[i] : lda[i] * m[i];
-        x_len = (left_right[i] == oneapi::mkl::side::R) ? n[i] : m[i];
-        size_x = 1 + (x_len - 1) * std::abs(incx[i]);
-        size_c = (layout == oneapi::mkl::layout::col_major) ? ldc[i] * n[i] : ldc[i] * m[i];
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt);
-            x_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_x, *dev, cxt);
-            c_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt);
-            c_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt);
-            rand_matrix(a_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], lda[i]);
-            rand_vector(x_array[idx], x_len, incx[i]);
-            rand_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldc[i]);
-            copy_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldc[i],
-                        c_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference DGMM_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *incx_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *ldc_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-
-    CBLAS_SIDE *left_right_ref =
-        (CBLAS_SIDE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_SIDE) * group_count);
-
-    if ((m_ref == NULL) || (n_ref == NULL) || (lda_ref == NULL) || (incx_ref == NULL) ||
-        (ldc_ref == NULL) || (left_right_ref == NULL) || (group_size_ref == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(incx_ref);
-        oneapi::mkl::aligned_free(ldc_ref);
-        oneapi::mkl::aligned_free(left_right_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(x_array[idx], cxt);
-                oneapi::mkl::free_shared(c_array[idx], cxt);
-                oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return false;
-    }
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        left_right_ref[i] = convert_to_cblas_side(left_right[i]);
-        m_ref[i] = (int)m[i];
-        n_ref[i] = (int)n[i];
-        lda_ref[i] = (int)lda[i];
-        incx_ref[i] = (int)incx[i];
-        ldc_ref[i] = (int)ldc[i];
-        group_size_ref[i] = (int)group_size[i];
-        for (j = 0; j < group_size_ref[i]; j++) {
-            ::dgmm(convert_to_cblas_layout(layout), left_right_ref[i], (const int *)&m_ref[i],
-                   (const int *)&n_ref[i], (const fp_ref *)a_array[idx], (const int *)&lda_ref[i],
-                   (const fp_ref *)x_array[idx], (const int *)&incx_ref[i],
-                   (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref[i]);
-            idx++;
-        }
-    }
-
-    // Call DPC++ DGMM_BATCH.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::dgmm_batch(
-                    main_queue, &left_right[0], &m[0], &n[0], (const fp **)&a_array[0], &lda[0],
-                    (const fp **)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count,
-                    &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::dgmm_batch(
-                    main_queue, &left_right[0], &m[0], &n[0], (const fp **)&a_array[0], &lda[0],
-                    (const fp **)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count,
-                    &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dgmm_batch,
-                                        &left_right[0], &m[0], &n[0], (const fp **)&a_array[0],
-                                        &lda[0], (const fp **)&x_array[0], &incx[0], &c_array[0],
-                                        &ldc[0], group_count, &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dgmm_batch,
-                                        &left_right[0], &m[0], &n[0], (const fp **)&a_array[0],
-                                        &lda[0], (const fp **)&x_array[0], &incx[0], &c_array[0],
-                                        &ldc[0], group_count, &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during DGMM_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(incx_ref);
-        oneapi::mkl::aligned_free(ldc_ref);
-        oneapi::mkl::aligned_free(left_right_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(x_array[idx], cxt);
-                oneapi::mkl::free_shared(c_array[idx], cxt);
-                oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of DGMM_BATCH:\n" << error.what() << std::endl;
-    }
-
-    bool good = true;
-    // Compare the results of reference implementation and DPC++ implementation.
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            good = good && check_equal_matrix(c_array[idx], c_ref_array[idx], layout, m[i], n[i],
-                                              ldc[i], 1, std::cout);
-            idx++;
-        }
-    }
-
-    oneapi::mkl::aligned_free(m_ref);
-    oneapi::mkl::aligned_free(n_ref);
-    oneapi::mkl::aligned_free(lda_ref);
-    oneapi::mkl::aligned_free(incx_ref);
-    oneapi::mkl::aligned_free(ldc_ref);
-    oneapi::mkl::aligned_free(left_right_ref);
-    oneapi::mkl::aligned_free(group_size_ref);
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(a_array[idx], cxt);
-            oneapi::mkl::free_shared(x_array[idx], cxt);
-            oneapi::mkl::free_shared(c_array[idx], cxt);
-            oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-
-    return (int)good;
-}
-
-class DgmmBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(DgmmBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(DgmmBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(DgmmBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(DgmmBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(DgmmBatchUsmTestSuite, DgmmBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride.cpp
deleted file mode 100644
index 5241cb822..000000000
--- a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Prepare data.
-    int64_t m, n, k;
-    int64_t lda, ldb, ldc;
-    oneapi::mkl::transpose transa, transb;
-    Ts alpha, beta;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 500;
-    n = 1 + std::rand() % 500;
-    k = 1 + std::rand() % 500;
-    lda = std::max(m, k);
-    ldb = std::max(n, k);
-    ldc = std::max(m, n);
-    alpha = rand_scalar<Ts>();
-    beta = rand_scalar<Ts>();
-
-    if ((std::is_same<Ts, std::complex<float>>::value) ||
-        (std::is_same<Ts, std::complex<double>>::value)) {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            transa = oneapi::mkl::transpose::conjtrans;
-        else
-            transa = (oneapi::mkl::transpose)tmp;
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            transb = oneapi::mkl::transpose::conjtrans;
-        else
-            transb = (oneapi::mkl::transpose)tmp;
-    }
-    else {
-        transa = (oneapi::mkl::transpose)(std::rand() % 2);
-        transb = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-
-    int64_t stride_a, stride_b, stride_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * k : lda * m;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * k;
-            stride_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * m : lda * k;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * k : ldb * n;
-            stride_c = ldc * m;
-            break;
-        default: break;
-    }
-
-    vector<Ta, allocator_helper<Ta, 64>> A(stride_a * batch_size);
-    vector<Ta, allocator_helper<Tb, 64>> B(stride_b * batch_size);
-    vector<Tc, allocator_helper<Tc, 64>> C(stride_c * batch_size),
-        C_cast_ref(stride_c * batch_size);
-    vector<Ts, allocator_helper<Ts, 64>> A_ref(stride_a * batch_size), B_ref(stride_b * batch_size),
-        C_ref(stride_c * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_matrix(A.data() + stride_a * i, layout, transa, m, k, lda);
-        rand_matrix(B.data() + stride_b * i, layout, transb, k, n, ldb);
-        rand_matrix(C.data() + stride_c * i, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    }
-
-    for (size_t i = 0; i < A.size(); ++i) {
-        A_ref[i] = A[i];
-    }
-    for (size_t i = 0; i < B.size(); ++i) {
-        B_ref[i] = B[i];
-    }
-    for (size_t i = 0; i < C.size(); ++i) {
-        C_ref[i] = C[i];
-    }
-
-    // Call reference GEMM_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<Ts>::type;
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int k_ref = (int)k;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::gemm(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa),
-               convert_to_cblas_trans(transb), (const int *)&m_ref, (const int *)&n_ref,
-               (const int *)&k_ref, (const fp_ref *)&alpha,
-               (const fp_ref *)(A_ref.data() + stride_a * i), (const int *)&lda_ref,
-               (const fp_ref *)(B_ref.data() + stride_b * i), (const int *)&ldb_ref,
-               (const fp_ref *)&beta, (fp_ref *)(C_ref.data() + stride_c * i),
-               (const int *)&ldc_ref);
-    }
-
-    // Call DPC++ GEMM_BATCH_STRIDE.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<Ta, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<Tb, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<Tc, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gemm_batch(
-                    main_queue, transa, transb, m, n, k, alpha, A_buffer, lda, stride_a, B_buffer,
-                    ldb, stride_b, beta, C_buffer, ldc, stride_c, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gemm_batch(
-                    main_queue, transa, transb, m, n, k, alpha, A_buffer, lda, stride_a, B_buffer,
-                    ldb, stride_b, beta, C_buffer, ldc, stride_c, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm_batch,
-                                        transa, transb, m, n, k, alpha, A_buffer, lda, stride_a,
-                                        B_buffer, ldb, stride_b, beta, C_buffer, ldc, stride_c,
-                                        batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm_batch,
-                                        transa, transb, m, n, k, alpha, A_buffer, lda, stride_a,
-                                        B_buffer, ldb, stride_b, beta, C_buffer, ldc, stride_c,
-                                        batch_size);
-                break;
-            default: break;
-        }
-#endif
-        main_queue.wait_and_throw();
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    int tol_scalar = 10;
-    int error_mag = tol_scalar * k;
-    if (std::is_same_v<Tc, int32_t>)
-        error_mag = 1;
-
-    for (size_t i = 0; i < C_ref.size(); ++i) {
-        C_cast_ref[i] = C_ref[i];
-    }
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good = check_almost_equal_matrix(C_accessor, C_cast_ref, oneapi::mkl::layout::col_major,
-                                          stride_c * batch_size, 1, stride_c * batch_size,
-                                          error_mag, std::cout);
-
-    return (int)good;
-}
-
-class GemmBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GemmBatchStrideTests, RealHalfPrecision) {
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideTests, HalfHalfFloatPrecision) {
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, float, float>(std::get<0>(GetParam()),
-                                                                  std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideTests, Int8Int8SinglePrecision) {
-    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, float, float>(std::get<0>(GetParam()),
-                                                                    std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideTests, Int8Int8Int32Precision) {
-    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, std::int32_t, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float, float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((
-        test<double, double, double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, std::complex<float>, std::complex<float>, std::complex<float>>(
-            std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, std::complex<double>, std::complex<double>,
-              std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmBatchStrideTestSuite, GemmBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp
deleted file mode 100644
index 97f2dd086..000000000
--- a/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp
+++ /dev/null
@@ -1,324 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n, k;
-    int64_t lda, ldb, ldc;
-    oneapi::mkl::transpose transa, transb;
-    Ts alpha, beta;
-
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 500;
-    n = 1 + std::rand() % 500;
-    k = 1 + std::rand() % 500;
-    lda = std::max(m, k);
-    ldb = std::max(n, k);
-    ldc = std::max(m, n);
-    alpha = rand_scalar<Ts>();
-    beta = rand_scalar<Ts>();
-    if ((std::is_same<Ts, std::complex<float>>::value) ||
-        (std::is_same<Ts, std::complex<double>>::value)) {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            transa = oneapi::mkl::transpose::conjtrans;
-        else
-            transa = (oneapi::mkl::transpose)tmp;
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            transb = oneapi::mkl::transpose::conjtrans;
-        else
-            transb = (oneapi::mkl::transpose)tmp;
-    }
-    else {
-        transa = (oneapi::mkl::transpose)(std::rand() % 2);
-        transb = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-
-    int64_t stride_a, stride_b, stride_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * k : lda * m;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * k;
-            stride_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * m : lda * k;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * k : ldb * n;
-            stride_c = ldc * m;
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<Ta, usm::alloc::shared, 64>(cxt, *dev);
-    auto ub = usm_allocator<Tb, usm::alloc::shared, 64>(cxt, *dev);
-    auto uc = usm_allocator<Tc, usm::alloc::shared, 64>(cxt, *dev);
-    auto us = usm_allocator<Ts, usm::alloc::shared, 64>(cxt, *dev);
-    vector<Ta, decltype(ua)> A(ua);
-    vector<Tb, decltype(ub)> B(ub);
-    vector<Tc, decltype(uc)> C(uc), C_cast_ref(uc);
-    vector<Ts, decltype(us)> A_ref(us), B_ref(us), C_ref(us);
-
-    A.resize(stride_a * batch_size);
-    B.resize(stride_b * batch_size);
-    C.resize(stride_c * batch_size);
-    A_ref.resize(stride_c * batch_size);
-    B_ref.resize(stride_c * batch_size);
-    C_ref.resize(stride_c * batch_size);
-    C_cast_ref.resize(stride_c * batch_size);
-
-    Ta **a_array = (Ta **)oneapi::mkl::malloc_shared(64, sizeof(Ta *) * batch_size, *dev, cxt);
-    Tb **b_array = (Tb **)oneapi::mkl::malloc_shared(64, sizeof(Tb *) * batch_size, *dev, cxt);
-    Tc **c_array = (Tc **)oneapi::mkl::malloc_shared(64, sizeof(Tc *) * batch_size, *dev, cxt);
-    Ts **c_ref_array = (Ts **)oneapi::mkl::malloc_shared(64, sizeof(Ts *) * batch_size, *dev, cxt);
-
-    if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(b_array, cxt);
-        oneapi::mkl::free_shared(c_array, cxt);
-        oneapi::mkl::free_shared(c_ref_array, cxt);
-        return false;
-    }
-
-    for (i = 0; i < batch_size; i++) {
-        a_array[i] = &A[i * stride_a];
-        b_array[i] = &B[i * stride_b];
-        c_array[i] = &C[i * stride_c];
-        c_ref_array[i] = &C_ref[i * stride_c];
-    }
-
-    rand_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_a * batch_size, 1, stride_a * batch_size);
-    rand_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_b * batch_size, 1, stride_b * batch_size);
-    rand_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size);
-    copy_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_a * batch_size, 1, stride_a * batch_size, A_ref);
-    copy_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_b * batch_size, 1, stride_b * batch_size, B_ref);
-    copy_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size, C_ref);
-
-    // Call reference GEMM_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<Ts>::type;
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int k_ref = (int)k;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        ::gemm(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa),
-               convert_to_cblas_trans(transb), (const int *)&m_ref, (const int *)&n_ref,
-               (const int *)&k_ref, (const fp_ref *)&alpha,
-               (const fp_ref *)(A_ref.data() + stride_a * i), (const int *)&lda_ref,
-               (const fp_ref *)(B_ref.data() + stride_b * i), (const int *)&ldb_ref,
-               (const fp_ref *)&beta, (fp_ref *)(C_ref.data() + stride_c * i),
-               (const int *)&ldc_ref);
-    }
-
-    // Call DPC++ GEMM_BATCH_STRIDE.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemm_batch(
-                    main_queue, transa, transb, m, n, k, alpha, &A[0], lda, stride_a, &B[0], ldb,
-                    stride_b, beta, &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemm_batch(
-                    main_queue, transa, transb, m, n, k, alpha, &A[0], lda, stride_a, &B[0], ldb,
-                    stride_b, beta, &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait_and_throw();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm_batch,
-                                        transa, transb, m, n, k, alpha, &A[0], lda, stride_a, &B[0],
-                                        ldb, stride_b, beta, &C[0], ldc, stride_c, batch_size,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm_batch,
-                                        transa, transb, m, n, k, alpha, &A[0], lda, stride_a, &B[0],
-                                        ldb, stride_b, beta, &C[0], ldc, stride_c, batch_size,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait_and_throw();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(b_array, cxt);
-        oneapi::mkl::free_shared(c_array, cxt);
-        oneapi::mkl::free_shared(c_ref_array, cxt);
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    int tol_scalar = 10;
-    int error_mag = tol_scalar * k;
-    if (std::is_same_v<Tc, int32_t>)
-        error_mag = 1;
-
-    for (size_t i = 0; i < C_ref.size(); ++i) {
-        C_cast_ref[i] = C_ref[i];
-    }
-    bool good = check_almost_equal_matrix(C, C_cast_ref, oneapi::mkl::layout::col_major,
-                                          stride_c * batch_size, 1, stride_c * batch_size,
-                                          error_mag, std::cout);
-
-    oneapi::mkl::free_shared(a_array, cxt);
-    oneapi::mkl::free_shared(b_array, cxt);
-    oneapi::mkl::free_shared(c_array, cxt);
-    oneapi::mkl::free_shared(c_ref_array, cxt);
-
-    return (int)good;
-}
-
-class GemmBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GemmBatchStrideUsmTests, RealHalfPrecision) {
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideUsmTests, HalfHalfFloatPrecision) {
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, float, float>(std::get<0>(GetParam()),
-                                                                  std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideUsmTests, Int8Int8SinglePrecision) {
-    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, float, float>(std::get<0>(GetParam()),
-                                                                    std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideUsmTests, Int8Int8Int32Precision) {
-    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, std::int32_t, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float, float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((
-        test<double, double, double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, std::complex<float>, std::complex<float>, std::complex<float>>(
-            std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, std::complex<double>, std::complex<double>,
-              std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmBatchStrideUsmTestSuite, GemmBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp
deleted file mode 100644
index a651f9ae3..000000000
--- a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp
+++ /dev/null
@@ -1,421 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "allocator_helper.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename Ta, typename Tb, typename Tc, typename Ts>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto uaint = usm_allocator<int64_t, usm::alloc::shared, 64>(cxt, *dev);
-    vector<int64_t, decltype(uaint)> m(uaint), n(uaint), k(uaint), lda(uaint), ldb(uaint),
-        ldc(uaint), group_size(uaint);
-
-    auto uatranspose = usm_allocator<oneapi::mkl::transpose, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::transpose, decltype(uatranspose)> transa(uatranspose), transb(uatranspose);
-
-    auto uaTs = usm_allocator<Ts, usm::alloc::shared, 64>(cxt, *dev);
-    vector<Ts, decltype(uaTs)> alpha(uaTs), beta(uaTs);
-
-    m.resize(group_count);
-    n.resize(group_count);
-    k.resize(group_count);
-    lda.resize(group_count);
-    ldb.resize(group_count);
-    ldc.resize(group_count);
-    group_size.resize(group_count);
-    transa.resize(group_count);
-    transb.resize(group_count);
-    alpha.resize(group_count);
-    beta.resize(group_count);
-
-    int64_t i, tmp;
-    int64_t j, idx = 0;
-    int64_t total_batch_count = 0;
-    int64_t size_a = 0, size_b = 0, size_c = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i] = 1 + std::rand() % 500;
-        n[i] = 1 + std::rand() % 500;
-        k[i] = 1 + std::rand() % 500;
-        lda[i] = std::max(m[i], k[i]);
-        ldb[i] = std::max(n[i], k[i]);
-        ldc[i] = std::max(m[i], n[i]);
-        alpha[i] = rand_scalar<Ts>();
-        beta[i] = rand_scalar<Ts>();
-        if ((std::is_same<Ts, std::complex<float>>::value) ||
-            (std::is_same<Ts, std::complex<double>>::value)) {
-            tmp = std::rand() % 3;
-            if (tmp == 2)
-                transa[i] = oneapi::mkl::transpose::conjtrans;
-            else
-                transa[i] = (oneapi::mkl::transpose)tmp;
-            tmp = std::rand() % 3;
-            if (tmp == 2)
-                transb[i] = oneapi::mkl::transpose::conjtrans;
-            else
-                transb[i] = (oneapi::mkl::transpose)tmp;
-        }
-        else {
-            transa[i] = (oneapi::mkl::transpose)(std::rand() % 2);
-            transb[i] = (oneapi::mkl::transpose)(std::rand() % 2);
-        }
-        total_batch_count += group_size[i];
-    }
-
-    auto uaTap = usm_allocator<Ta *, usm::alloc::shared, 64>(cxt, *dev);
-    auto uaTbp = usm_allocator<Tb *, usm::alloc::shared, 64>(cxt, *dev);
-    auto uaTcp = usm_allocator<Tc *, usm::alloc::shared, 64>(cxt, *dev);
-    auto uaTsp = usm_allocator<Ts *, usm::alloc::shared, 64>(cxt, *dev);
-    vector<Ta *, decltype(uaTap)> a_array(uaTap);
-    vector<Tb *, decltype(uaTbp)> b_array(uaTbp);
-    vector<Tc *, decltype(uaTcp)> c_array(uaTcp), c_cast_ref_array(uaTcp);
-    vector<Ts *, decltype(uaTsp)> a_ref_array(uaTsp), b_ref_array(uaTsp), c_ref_array(uaTsp);
-    a_array.resize(total_batch_count);
-    b_array.resize(total_batch_count);
-    c_array.resize(total_batch_count);
-    a_ref_array.resize(total_batch_count);
-    b_ref_array.resize(total_batch_count);
-    c_cast_ref_array.resize(total_batch_count);
-    c_ref_array.resize(total_batch_count);
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                size_a = lda[i] * ((transa[i] == oneapi::mkl::transpose::nontrans) ? k[i] : m[i]);
-                size_b = ldb[i] * ((transb[i] == oneapi::mkl::transpose::nontrans) ? n[i] : k[i]);
-                size_c = ldc[i] * n[i];
-                break;
-            case oneapi::mkl::layout::row_major:
-                size_a = lda[i] * ((transa[i] == oneapi::mkl::transpose::nontrans) ? m[i] : k[i]);
-                size_b = ldb[i] * ((transb[i] == oneapi::mkl::transpose::nontrans) ? k[i] : n[i]);
-                size_c = ldc[i] * m[i];
-                break;
-            default: break;
-        }
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx] = (Ta *)oneapi::mkl::malloc_shared(64, sizeof(Ta) * size_a, *dev, cxt);
-            b_array[idx] = (Tb *)oneapi::mkl::malloc_shared(64, sizeof(Tb) * size_b, *dev, cxt);
-            c_array[idx] = (Tc *)oneapi::mkl::malloc_shared(64, sizeof(Tc) * size_c, *dev, cxt);
-            a_ref_array[idx] = (Ts *)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_a, *dev, cxt);
-            b_ref_array[idx] = (Ts *)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_b, *dev, cxt);
-            c_cast_ref_array[idx] =
-                (Tc *)oneapi::mkl::malloc_shared(64, sizeof(Tc) * size_c, *dev, cxt);
-            c_ref_array[idx] = (Ts *)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_c, *dev, cxt);
-            rand_matrix(a_array[idx], layout, transa[i], m[i], k[i], lda[i]);
-            rand_matrix(b_array[idx], layout, transb[i], k[i], n[i], ldb[i]);
-            rand_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldc[i]);
-            copy_matrix(a_array[idx], layout, transa[i], m[i], k[i], lda[i], a_ref_array[idx]);
-            copy_matrix(b_array[idx], layout, transb[i], k[i], n[i], ldb[i], b_ref_array[idx]);
-            copy_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldc[i],
-                        c_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference GEMM_BATCH.
-    using fp_ref = typename ref_type_info<Ts>::type;
-    int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *k_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *ldb_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *ldc_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-
-    CBLAS_TRANSPOSE *transa_ref =
-        (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count);
-    CBLAS_TRANSPOSE *transb_ref =
-        (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count);
-
-    if ((m_ref == NULL) || (n_ref == NULL) || (k_ref == NULL) || (lda_ref == NULL) ||
-        (ldb_ref == NULL) || (ldc_ref == NULL) || (transa_ref == NULL) || (transb_ref == NULL) ||
-        (group_size_ref == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(k_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(ldb_ref);
-        oneapi::mkl::aligned_free(ldc_ref);
-        oneapi::mkl::aligned_free(transa_ref);
-        oneapi::mkl::aligned_free(transb_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(b_array[idx], cxt);
-                oneapi::mkl::free_shared(c_array[idx], cxt);
-                oneapi::mkl::free_shared(a_ref_array[idx], cxt);
-                oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-                oneapi::mkl::free_shared(c_cast_ref_array[idx], cxt);
-                oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return false;
-    }
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        transa_ref[i] = convert_to_cblas_trans(transa[i]);
-        transb_ref[i] = convert_to_cblas_trans(transb[i]);
-        m_ref[i] = (int)m[i];
-        n_ref[i] = (int)n[i];
-        k_ref[i] = (int)k[i];
-        lda_ref[i] = (int)lda[i];
-        ldb_ref[i] = (int)ldb[i];
-        ldc_ref[i] = (int)ldc[i];
-        group_size_ref[i] = (int)group_size[i];
-        for (j = 0; j < group_size_ref[i]; j++) {
-            ::gemm(convert_to_cblas_layout(layout), transa_ref[i], transb_ref[i],
-                   (const int *)&m_ref[i], (const int *)&n_ref[i], (const int *)&k_ref[i],
-                   (const fp_ref *)&alpha[i], (const fp_ref *)a_ref_array[idx],
-                   (const int *)&lda_ref[i], (const fp_ref *)b_ref_array[idx],
-                   (const int *)&ldb_ref[i], (const fp_ref *)&beta[i], (fp_ref *)c_ref_array[idx],
-                   (const int *)&ldc_ref[i]);
-            idx++;
-        }
-    }
-
-    // Call DPC++ GEMM_BATCH.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemm_batch(
-                    main_queue, &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0],
-                    (const Ta **)&a_array[0], &lda[0], (const Tb **)&b_array[0], &ldb[0], &beta[0],
-                    &c_array[0], &ldc[0], group_count, &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemm_batch(
-                    main_queue, &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0],
-                    (const Ta **)&a_array[0], &lda[0], (const Tb **)&b_array[0], &ldb[0], &beta[0],
-                    &c_array[0], &ldc[0], group_count, &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        done.wait_and_throw();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm_batch,
-                                        &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0],
-                                        (const Ta **)&a_array[0], &lda[0], (const Tb **)&b_array[0],
-                                        &ldb[0], &beta[0], &c_array[0], &ldc[0], group_count,
-                                        &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm_batch,
-                                        &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0],
-                                        (const Ta **)&a_array[0], &lda[0], (const Ta **)&b_array[0],
-                                        &ldb[0], &beta[0], &c_array[0], &ldc[0], group_count,
-                                        &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait_and_throw();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(k_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(ldb_ref);
-        oneapi::mkl::aligned_free(ldc_ref);
-        oneapi::mkl::aligned_free(transa_ref);
-        oneapi::mkl::aligned_free(transb_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(b_array[idx], cxt);
-                oneapi::mkl::free_shared(c_array[idx], cxt);
-                oneapi::mkl::free_shared(a_ref_array[idx], cxt);
-                oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-                oneapi::mkl::free_shared(c_cast_ref_array[idx], cxt);
-                oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMM_BATCH:\n" << error.what() << std::endl;
-    }
-
-    bool good = true;
-    // Compare the results of reference implementation and DPC++ implementation.
-    int tol_scalar = 10;
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            int error_mag = tol_scalar * k[i];
-            if (std::is_same_v<Tc, int32_t>)
-                error_mag = 1;
-
-            copy_matrix(c_ref_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i],
-                        ldc[i], c_cast_ref_array[idx]);
-            good = good && check_almost_equal_matrix(c_array[idx], c_cast_ref_array[idx], layout,
-                                                     m[i], n[i], ldc[i], error_mag, std::cout);
-            idx++;
-        }
-    }
-    oneapi::mkl::aligned_free(m_ref);
-    oneapi::mkl::aligned_free(n_ref);
-    oneapi::mkl::aligned_free(k_ref);
-    oneapi::mkl::aligned_free(lda_ref);
-    oneapi::mkl::aligned_free(ldb_ref);
-    oneapi::mkl::aligned_free(ldc_ref);
-    oneapi::mkl::aligned_free(transa_ref);
-    oneapi::mkl::aligned_free(transb_ref);
-    oneapi::mkl::aligned_free(group_size_ref);
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(a_array[idx], cxt);
-            oneapi::mkl::free_shared(b_array[idx], cxt);
-            oneapi::mkl::free_shared(c_array[idx], cxt);
-            oneapi::mkl::free_shared(a_ref_array[idx], cxt);
-            oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-            oneapi::mkl::free_shared(c_cast_ref_array[idx], cxt);
-            oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-
-    return (int)good;
-}
-
-class GemmBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GemmBatchUsmTests, RealHalfPrecision) {
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchUsmTests, HalfHalfFloatPrecision) {
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half, float, float>(std::get<0>(GetParam()),
-                                                                  std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchUsmTests, Int8Int8SinglePrecision) {
-    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, float, float>(std::get<0>(GetParam()),
-                                                                    std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchUsmTests, Int8Int8Int32Precision) {
-    EXPECT_TRUEORSKIP((test<std::int8_t, std::int8_t, std::int32_t, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float, float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((
-        test<double, double, double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, std::complex<float>, std::complex<float>, std::complex<float>>(
-            std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-TEST_P(GemmBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, std::complex<double>, std::complex<double>,
-              std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmBatchUsmTestSuite, GemmBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemv_batch_stride.cpp b/tests/unit_tests/blas/batch/gemv_batch_stride.cpp
deleted file mode 100644
index bd92f70ca..000000000
--- a/tests/unit_tests/blas/batch/gemv_batch_stride.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda;
-    oneapi::mkl::transpose transa;
-    fp alpha, beta;
-    int64_t i, tmp;
-
-    batch_size = 15;
-    m = 25;
-    n = 30;
-    lda = 42;
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        transa = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-    else {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            transa = oneapi::mkl::transpose::conjtrans;
-        else
-            transa = (oneapi::mkl::transpose)tmp;
-    }
-
-    int x_len = outer_dimension(transa, m, n);
-    int y_len = inner_dimension(transa, m, n);
-
-    int64_t stride_x, stride_y, stride_a;
-    stride_x = x_len * std::abs(incx);
-    stride_y = y_len * std::abs(incy);
-    stride_a = lda * std::max(m, n);
-
-    vector<fp, allocator_helper<fp, 64>> x(stride_x * batch_size), A(stride_a * batch_size);
-    vector<fp, allocator_helper<fp, 64>> y(stride_y * batch_size), y_ref(stride_y * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(x.data() + stride_x * i, x_len, incx);
-        rand_vector(y.data() + stride_y * i, y_len, incy);
-        rand_matrix(A.data() + stride_a * i, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-    }
-
-    y_ref = y;
-
-    // Call reference GEMV_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int incy_ref = (int)incy;
-    int lda_ref = (int)lda;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), (const int *)&m_ref,
-               (const int *)&n_ref, (const fp_ref *)&alpha,
-               (const fp_ref *)(A.data() + stride_a * i), (const int *)&lda_ref,
-               (const fp_ref *)(x.data() + stride_x * i), (const int *)&incx_ref,
-               (const fp_ref *)&beta, (fp_ref *)(y_ref.data() + stride_y * i),
-               (const int *)&incy_ref);
-    }
-
-    // Call DPC++ GEMV_BATCH_STRIDE.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMV_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer(x.data(), range<1>(x.size()));
-    buffer<fp, 1> y_buffer(y.data(), range<1>(y.size()));
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gemv_batch(
-                    main_queue, transa, m, n, alpha, A_buffer, lda, stride_a, x_buffer, incx,
-                    stride_x, beta, y_buffer, incy, stride_y, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gemv_batch(
-                    main_queue, transa, m, n, alpha, A_buffer, lda, stride_a, x_buffer, incx,
-                    stride_x, beta, y_buffer, incy, stride_y, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemv_batch,
-                                        transa, m, n, alpha, A_buffer, lda, stride_a, x_buffer,
-                                        incx, stride_x, beta, y_buffer, incy, stride_y, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemv_batch,
-                                        transa, m, n, alpha, A_buffer, lda, stride_a, x_buffer,
-                                        incx, stride_x, beta, y_buffer, incy, stride_y, batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMV_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMV_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good && check_equal_vector(y_accessor.get_pointer() + i * stride_y,
-                                          y_ref.data() + i * stride_y, y_len, incy,
-                                          std::max<int>(m, n), std::cout);
-    }
-    return (int)good;
-}
-
-class GemvBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GemvBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-TEST_P(GemvBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-TEST_P(GemvBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5);
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5);
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-TEST_P(GemvBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5);
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5);
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemvBatchStrideTestSuite, GemvBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp
deleted file mode 100644
index d6eb47887..000000000
--- a/tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda;
-    oneapi::mkl::transpose transa;
-    fp alpha, beta;
-    int64_t i, tmp;
-
-    batch_size = 15;
-    m = 25;
-    n = 30;
-    lda = 42;
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        transa = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-    else {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            transa = oneapi::mkl::transpose::conjtrans;
-        else
-            transa = (oneapi::mkl::transpose)tmp;
-    }
-
-    int x_len = outer_dimension(transa, m, n);
-    int y_len = inner_dimension(transa, m, n);
-
-    int64_t stride_x, stride_y, stride_a;
-    stride_x = x_len * std::abs(incx);
-    stride_y = y_len * std::abs(incy);
-    stride_a = lda * std::max(m, n);
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua), y_ref(ua);
-
-    x.resize(stride_x * batch_size);
-    y.resize(stride_y * batch_size);
-    A.resize(stride_a * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_vector(&x[stride_x * i], x_len, incx);
-        rand_vector(&y[stride_y * i], y_len, incy);
-        rand_matrix(&A[stride_a * i], layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-    }
-
-    y_ref.resize(y.size());
-    for (int i = 0; i < y.size(); i++)
-        y_ref[i] = y[i];
-
-    // Call reference GEMV_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int incx_ref = (int)incx;
-    int incy_ref = (int)incy;
-    int lda_ref = (int)lda;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), (const int *)&m_ref,
-               (const int *)&n_ref, (const fp_ref *)&alpha, (const fp_ref *)&A[stride_a * i],
-               (const int *)&lda_ref, (const fp_ref *)&x[stride_x * i], (const int *)&incx_ref,
-               (const fp_ref *)&beta, (fp_ref *)&y_ref[stride_y * i], (const int *)&incy_ref);
-    }
-
-    // Call DPC++ GEMV_BATCH_STRIDE.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemv_batch(
-                    main_queue, transa, m, n, alpha, &A[0], lda, stride_a, &x[0], incx, stride_x,
-                    beta, &y[0], incy, stride_y, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemv_batch(
-                    main_queue, transa, m, n, alpha, &A[0], lda, stride_a, &x[0], incx, stride_x,
-                    beta, &y[0], incy, stride_y, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemv_batch,
-                                        transa, m, n, alpha, &A[0], lda, stride_a, &x[0], incx,
-                                        stride_x, beta, &y[0], incy, stride_y, batch_size,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemv_batch,
-                                        transa, m, n, alpha, &A[0], lda, stride_a, &x[0], incx,
-                                        stride_x, beta, &y[0], incy, stride_y, batch_size,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMV_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMV_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = true;
-    for (i = 0; i < batch_size; i++) {
-        good = good && check_equal_vector(&y[i * stride_y], &y_ref[i * stride_y], y_len, incy,
-                                          std::max<int>(m, n), std::cout);
-    }
-    return (int)good;
-}
-
-class GemvBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GemvBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-TEST_P(GemvBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-TEST_P(GemvBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5);
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5);
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-TEST_P(GemvBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5);
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), -2, -3, 5);
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1, 1, 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemvBatchStrideUsmTestSuite, GemvBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/gemv_batch_usm.cpp b/tests/unit_tests/blas/batch/gemv_batch_usm.cpp
deleted file mode 100644
index 4ad661f5b..000000000
--- a/tests/unit_tests/blas/batch/gemv_batch_usm.cpp
+++ /dev/null
@@ -1,339 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "allocator_helper.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMV_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto uaint = usm_allocator<int64_t, usm::alloc::shared, 64>(cxt, *dev);
-    vector<int64_t, decltype(uaint)> m(uaint), n(uaint), lda(uaint), incx(uaint), incy(uaint),
-        group_size(uaint);
-
-    auto uatranspose = usm_allocator<oneapi::mkl::transpose, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::transpose, decltype(uatranspose)> transa(uatranspose);
-
-    auto uafp = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(uafp)> alpha(uafp), beta(uafp);
-
-    m.resize(group_count);
-    n.resize(group_count);
-    lda.resize(group_count);
-    incx.resize(group_count);
-    incy.resize(group_count);
-    group_size.resize(group_count);
-    transa.resize(group_count);
-    alpha.resize(group_count);
-    beta.resize(group_count);
-
-    int64_t i, tmp;
-    int64_t j, idx = 0;
-    int64_t total_batch_count = 0;
-    int64_t x_len = 0, y_len = 0;
-    int64_t size_a = 0, size_x = 0, size_y = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i] = 1 + std::rand() % 500;
-        n[i] = 1 + std::rand() % 500;
-        lda[i] = std::max(m[i], n[i]);
-        incx[i] = -3 + std::rand() % 6;
-        incx[i] = (incx[i] == 0) ? 3 : incx[i];
-        incy[i] = -3 + std::rand() % 6;
-        incy[i] = (incy[i] == 0) ? 3 : incy[i];
-        alpha[i] = rand_scalar<fp>();
-        beta[i] = rand_scalar<fp>();
-        if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-            transa[i] = (oneapi::mkl::transpose)(std::rand() % 2);
-        }
-        else {
-            tmp = std::rand() % 3;
-            if (tmp == 2)
-                transa[i] = oneapi::mkl::transpose::conjtrans;
-            else
-                transa[i] = (oneapi::mkl::transpose)tmp;
-        }
-        total_batch_count += group_size[i];
-    }
-
-    auto uafpp = usm_allocator<fp *, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp *, decltype(uafpp)> a_array(uafpp), x_array(uafpp), y_array(uafpp),
-        y_ref_array(uafpp);
-    a_array.resize(total_batch_count);
-    x_array.resize(total_batch_count);
-    y_array.resize(total_batch_count);
-    y_ref_array.resize(total_batch_count);
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        size_a = (layout == oneapi::mkl::layout::col_major) ? lda[i] * n[i] : lda[i] * m[i];
-        x_len = (transa[i] == oneapi::mkl::transpose::nontrans) ? n[i] : m[i];
-        y_len = (transa[i] == oneapi::mkl::transpose::nontrans) ? m[i] : n[i];
-        size_x = 1 + (x_len - 1) * std::abs(incx[i]);
-        size_y = 1 + (y_len - 1) * std::abs(incy[i]);
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt);
-            x_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_x, *dev, cxt);
-            y_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_y, *dev, cxt);
-            y_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_y, *dev, cxt);
-            rand_matrix(a_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], lda[i]);
-            rand_vector(x_array[idx], x_len, incx[i]);
-            rand_vector(y_array[idx], y_len, incy[i]);
-            copy_vector(y_array[idx], y_len, incy[i], y_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference GEMV_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *incx_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *incy_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-
-    CBLAS_TRANSPOSE *transa_ref =
-        (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count);
-
-    if ((m_ref == NULL) || (n_ref == NULL) || (lda_ref == NULL) || (incx_ref == NULL) ||
-        (incy_ref == NULL) || (transa_ref == NULL) || (group_size_ref == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(incx_ref);
-        oneapi::mkl::aligned_free(incy_ref);
-        oneapi::mkl::aligned_free(transa_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(x_array[idx], cxt);
-                oneapi::mkl::free_shared(y_array[idx], cxt);
-                oneapi::mkl::free_shared(y_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return false;
-    }
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        transa_ref[i] = convert_to_cblas_trans(transa[i]);
-        m_ref[i] = (int)m[i];
-        n_ref[i] = (int)n[i];
-        lda_ref[i] = (int)lda[i];
-        incx_ref[i] = (int)incx[i];
-        incy_ref[i] = (int)incy[i];
-        group_size_ref[i] = (int)group_size[i];
-        for (j = 0; j < group_size_ref[i]; j++) {
-            ::gemv(convert_to_cblas_layout(layout), transa_ref[i], (const int *)&m_ref[i],
-                   (const int *)&n_ref[i], (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx],
-                   (const int *)&lda_ref[i], (const fp_ref *)x_array[idx],
-                   (const int *)&incx_ref[i], (const fp_ref *)&beta[i], (fp_ref *)y_ref_array[idx],
-                   (const int *)&incy_ref[i]);
-            idx++;
-        }
-    }
-
-    // Call DPC++ GEMV_BATCH.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemv_batch(
-                    main_queue, &transa[0], &m[0], &n[0], &alpha[0], (const fp **)&a_array[0],
-                    &lda[0], (const fp **)&x_array[0], &incx[0], &beta[0], &y_array[0], &incy[0],
-                    group_count, &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemv_batch(
-                    main_queue, &transa[0], &m[0], &n[0], &alpha[0], (const fp **)&a_array[0],
-                    &lda[0], (const fp **)&x_array[0], &incx[0], &beta[0], &y_array[0], &incy[0],
-                    group_count, &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemv_batch,
-                                        &transa[0], &m[0], &n[0], &alpha[0],
-                                        (const fp **)&a_array[0], &lda[0], (const fp **)&x_array[0],
-                                        &incx[0], &beta[0], &y_array[0], &incy[0], group_count,
-                                        &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemv_batch,
-                                        &transa[0], &m[0], &n[0], &alpha[0],
-                                        (const fp **)&a_array[0], &lda[0], (const fp **)&x_array[0],
-                                        &incx[0], &beta[0], &y_array[0], &incy[0], group_count,
-                                        &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMV_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(incx_ref);
-        oneapi::mkl::aligned_free(incy_ref);
-        oneapi::mkl::aligned_free(transa_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(x_array[idx], cxt);
-                oneapi::mkl::free_shared(y_array[idx], cxt);
-                oneapi::mkl::free_shared(y_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMV_BATCH:\n" << error.what() << std::endl;
-    }
-
-    bool good = true;
-    // Compare the results of reference implementation and DPC++ implementation.
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        y_len = (transa[i] == oneapi::mkl::transpose::nontrans) ? m[i] : n[i];
-        for (j = 0; j < group_size[i]; j++) {
-            good = good && check_equal_vector(y_array[idx], y_ref_array[idx], y_len, incy[i],
-                                              std::max<int>(m[i], n[i]), std::cout);
-            idx++;
-        }
-    }
-
-    oneapi::mkl::aligned_free(m_ref);
-    oneapi::mkl::aligned_free(n_ref);
-    oneapi::mkl::aligned_free(lda_ref);
-    oneapi::mkl::aligned_free(incx_ref);
-    oneapi::mkl::aligned_free(incy_ref);
-    oneapi::mkl::aligned_free(transa_ref);
-    oneapi::mkl::aligned_free(group_size_ref);
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(a_array[idx], cxt);
-            oneapi::mkl::free_shared(x_array[idx], cxt);
-            oneapi::mkl::free_shared(y_array[idx], cxt);
-            oneapi::mkl::free_shared(y_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-
-    return (int)good;
-}
-
-class GemvBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GemvBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(GemvBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(GemvBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(GemvBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemvBatchUsmTestSuite, GemvBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp b/tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp
deleted file mode 100644
index ac8bbb2b4..000000000
--- a/tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t stride_a, stride_b, stride;
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = lda * n;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            stride = std::max(stride_a, stride_b);
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = lda * m;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            stride = std::max(stride_a, stride_b);
-            break;
-        default: break;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> AB(stride * batch_size), AB_ref(stride * batch_size);
-
-    rand_matrix(AB.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride * batch_size, 1, stride * batch_size);
-    copy_matrix(AB.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride * batch_size, 1, stride * batch_size, AB_ref.data());
-
-    // Call reference IMATCOPY_BATCH_STRIDE.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        imatcopy_ref(layout, trans, m_ref, n_ref, alpha, AB_ref.data() + stride * i, lda_ref,
-                     ldb_ref);
-    }
-
-    // Call DPC++ IMATCOPY_BATCH_STRIDE
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during IMATCOPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> AB_buffer(AB.data(), range<1>(AB.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::imatcopy_batch(
-                    main_queue, trans, m, n, alpha, AB_buffer, lda, ldb, stride, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::imatcopy_batch(
-                    main_queue, trans, m, n, alpha, AB_buffer, lda, ldb, stride, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::imatcopy_batch,
-                                        trans, m, n, alpha, AB_buffer, lda, ldb, stride,
-                                        batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::imatcopy_batch,
-                                        trans, m, n, alpha, AB_buffer, lda, ldb, stride,
-                                        batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during IMATCOPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of IMATCOPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto AB_accessor = AB_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(AB_accessor, AB_ref, oneapi::mkl::layout::col_major,
-                                   stride * batch_size, 1, stride * batch_size, 10, std::cout);
-
-    return (int)good;
-}
-
-class ImatcopyBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(ImatcopyBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(ImatcopyBatchStrideTestSuite, ImatcopyBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp
deleted file mode 100644
index b3099d309..000000000
--- a/tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t stride_a, stride_b, stride;
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = lda * n;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            stride = std::max(stride_a, stride_b);
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = lda * m;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            stride = std::max(stride_a, stride_b);
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> AB(ua), AB_ref(ua);
-
-    AB.resize(stride * batch_size);
-    AB_ref.resize(stride * batch_size);
-    fp **ab_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **ab_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    if ((ab_array == NULL) || (ab_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        oneapi::mkl::free_shared(ab_array, cxt);
-        oneapi::mkl::free_shared(ab_ref_array, cxt);
-        return false;
-    }
-
-    for (i = 0; i < batch_size; i++) {
-        ab_array[i] = &AB[i * stride];
-        ab_ref_array[i] = &AB_ref[i * stride];
-    }
-
-    rand_matrix(AB, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride * batch_size, 1, stride * batch_size);
-    copy_matrix(AB, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride * batch_size, 1, stride * batch_size, AB_ref);
-
-    // Call reference IMATCOPY_BATCH_STRIDE.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        imatcopy_ref(layout, trans, m_ref, n_ref, alpha, ab_ref_array[i], lda_ref, ldb_ref);
-    }
-
-    // Call DPC++ IMATCOPY_BATCH_STRIDE
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::imatcopy_batch(
-                    main_queue, trans, m, n, alpha, &AB[0], lda, ldb, stride, batch_size,
-                    dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::imatcopy_batch(main_queue, trans, m, n, alpha,
-                                                                    &AB[0], lda, ldb, stride,
-                                                                    batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::imatcopy_batch,
-                                        trans, m, n, alpha, &AB[0], lda, ldb, stride, batch_size,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::imatcopy_batch,
-                                        trans, m, n, alpha, &AB[0], lda, ldb, stride, batch_size,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during IMATCOPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::free_shared(ab_array, cxt);
-        oneapi::mkl::free_shared(ab_ref_array, cxt);
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of IMATCOPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(AB, AB_ref, oneapi::mkl::layout::col_major, stride * batch_size,
-                                   1, stride * batch_size, 10, std::cout);
-
-    oneapi::mkl::free_shared(ab_array, cxt);
-    oneapi::mkl::free_shared(ab_ref_array, cxt);
-
-    return (int)good;
-}
-
-class ImatcopyBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(ImatcopyBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(ImatcopyBatchStrideUsmTestSuite, ImatcopyBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp b/tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp
deleted file mode 100644
index 74c9881af..000000000
--- a/tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during IMATCOPY_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto uaint = usm_allocator<int64_t, usm::alloc::shared, 64>(cxt, *dev);
-    vector<int64_t, decltype(uaint)> m(uaint), n(uaint), lda(uaint), ldb(uaint), group_size(uaint);
-
-    auto uatranspose = usm_allocator<oneapi::mkl::transpose, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::transpose, decltype(uatranspose)> trans(uatranspose);
-
-    auto uafp = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(uafp)> alpha(uafp);
-
-    m.resize(group_count);
-    n.resize(group_count);
-    lda.resize(group_count);
-    ldb.resize(group_count);
-    group_size.resize(group_count);
-    trans.resize(group_count);
-    alpha.resize(group_count);
-
-    int64_t i, tmp;
-    int64_t j, idx = 0;
-    int64_t total_batch_count = 0;
-    int64_t size_a = 0, size_b = 0, size = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i] = 1 + std::rand() % 50;
-        n[i] = 1 + std::rand() % 50;
-        lda[i] = std::max(m[i], n[i]);
-        ldb[i] = std::max(m[i], n[i]);
-        alpha[i] = rand_scalar<fp>();
-        trans[i] = rand_trans<fp>();
-        total_batch_count += group_size[i];
-    }
-
-    auto uafpp = usm_allocator<fp *, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp *, decltype(uafpp)> ab_array(uafpp), ab_ref_array(uafpp);
-
-    ab_array.resize(total_batch_count);
-    ab_ref_array.resize(total_batch_count);
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                size_a = lda[i] * n[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * n[i] : ldb[i] * m[i];
-                break;
-            case oneapi::mkl::layout::row_major:
-                size_a = lda[i] * m[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * m[i] : ldb[i] * n[i];
-                break;
-            default: break;
-        }
-        size = std::max(size_a, size_b);
-        for (j = 0; j < group_size[i]; j++) {
-            ab_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size, *dev, cxt);
-            ab_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size, *dev, cxt);
-            rand_matrix(ab_array[idx], oneapi::mkl::layout::col_major,
-                        oneapi::mkl::transpose::nontrans, size, 1, size);
-            copy_matrix(ab_array[idx], oneapi::mkl::layout::col_major,
-                        oneapi::mkl::transpose::nontrans, size, 1, size, ab_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference IMATCOPY
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        int m_ref = (int)m[i];
-        int n_ref = (int)n[i];
-        int lda_ref = (int)lda[i];
-        int ldb_ref = (int)ldb[i];
-        int group_size_ref = (int)group_size[i];
-        for (j = 0; j < group_size_ref; j++) {
-            imatcopy_ref(layout, trans[i], m_ref, n_ref, alpha[i], ab_ref_array[idx], lda_ref,
-                         ldb_ref);
-            idx++;
-        }
-    }
-
-    // Call DPC++ IMATCOPY_BATCH
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::imatcopy_batch(
-                    main_queue, trans.data(), m.data(), n.data(), alpha.data(), ab_array.data(),
-                    lda.data(), ldb.data(), group_count, group_size.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::imatcopy_batch(
-                    main_queue, trans.data(), m.data(), n.data(), alpha.data(), ab_array.data(),
-                    lda.data(), ldb.data(), group_count, group_size.data(), dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::imatcopy_batch,
-                                        trans.data(), m.data(), n.data(), alpha.data(),
-                                        ab_array.data(), lda.data(), ldb.data(), group_count,
-                                        group_size.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::imatcopy_batch,
-                                        trans.data(), m.data(), n.data(), alpha.data(),
-                                        ab_array.data(), lda.data(), ldb.data(), group_count,
-                                        group_size.data(), dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during IMATCOPY_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(ab_array[idx], cxt);
-                oneapi::mkl::free_shared(ab_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of IMATCOPY_BATCH:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = true;
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                size_a = lda[i] * n[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * n[i] : ldb[i] * m[i];
-                break;
-            case oneapi::mkl::layout::row_major:
-                size_a = lda[i] * m[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * m[i] : ldb[i] * n[i];
-                break;
-            default: break;
-        }
-        size = std::max(size_a, size_b);
-        for (j = 0; j < group_size[i]; j++) {
-            good = good &&
-                   check_equal_matrix(ab_array[idx], ab_ref_array[idx],
-                                      oneapi::mkl::layout::col_major, size, 1, size, 10, std::cout);
-            idx++;
-        }
-    }
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(ab_array[idx], cxt);
-            oneapi::mkl::free_shared(ab_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-
-    return (int)good;
-}
-
-class ImatcopyBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(ImatcopyBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(ImatcopyBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(ImatcopyBatchUsmTestSuite, ImatcopyBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/omatadd_batch_stride.cpp b/tests/unit_tests/blas/batch/omatadd_batch_stride.cpp
deleted file mode 100644
index cc20d0e3b..000000000
--- a/tests/unit_tests/blas/batch/omatadd_batch_stride.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb, ldc;
-    oneapi::mkl::transpose transa, transb;
-    fp alpha, beta;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    ldc = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-    transa = rand_trans<fp>();
-    transb = rand_trans<fp>();
-
-    int64_t stride_a, stride_b, stride_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * n : lda * m;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            stride_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * m : lda * n;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            stride_c = ldc * m;
-            break;
-        default: break;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(stride_a * batch_size), B(stride_b * batch_size),
-        C(stride_c * batch_size), C_ref(stride_c * batch_size);
-
-    rand_matrix(A.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_a * batch_size, 1, stride_a * batch_size);
-    rand_matrix(B.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_b * batch_size, 1, stride_b * batch_size);
-    rand_matrix(C.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size);
-    copy_matrix(C.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size, C_ref.data());
-
-    // Call reference OMATADD_BATCH_STRIDE.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        omatadd_ref(layout, transa, transb, m_ref, n_ref, alpha, A.data() + stride_a * i, lda_ref,
-                    beta, B.data() + stride_b * i, ldb_ref, C_ref.data() + stride_c * i, ldc_ref);
-    }
-
-    // Call DPC++ OMATADD_BATCH_STRIDE
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATADD_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::omatadd_batch(
-                    main_queue, transa, transb, m, n, alpha, A_buffer, lda, stride_a, beta,
-                    B_buffer, ldb, stride_b, C_buffer, ldc, stride_c, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::omatadd_batch(
-                    main_queue, transa, transb, m, n, alpha, A_buffer, lda, stride_a, beta,
-                    B_buffer, ldb, stride_b, C_buffer, ldc, stride_c, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatadd_batch,
-                                        transa, transb, m, n, alpha, A_buffer, lda, stride_a, beta,
-                                        B_buffer, ldb, stride_b, C_buffer, ldc, stride_c,
-                                        batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatadd_batch,
-                                        transa, transb, m, n, alpha, A_buffer, lda, stride_a, beta,
-                                        B_buffer, ldb, stride_b, C_buffer, ldc, stride_c,
-                                        batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATADD_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATADD_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(C_accessor, C_ref, oneapi::mkl::layout::col_major,
-                                   stride_c * batch_size, 1, stride_c * batch_size, 10, std::cout);
-
-    return (int)good;
-}
-
-class OmataddBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmataddBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmataddBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmataddBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmataddBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmataddBatchStrideTestSuite, OmataddBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp
deleted file mode 100644
index 7388084cb..000000000
--- a/tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATADD_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb, ldc;
-    oneapi::mkl::transpose transa, transb;
-    fp alpha, beta;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    ldc = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-    transa = rand_trans<fp>();
-    transb = rand_trans<fp>();
-
-    int64_t stride_a, stride_b, stride_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * n : lda * m;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            stride_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * m : lda * n;
-            stride_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            stride_c = ldc * m;
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), C(ua), C_ref(ua);
-
-    A.resize(stride_a * batch_size);
-    B.resize(stride_b * batch_size);
-    C.resize(stride_c * batch_size);
-    C_ref.resize(stride_c * batch_size);
-
-    fp **a_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **b_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **c_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **c_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-
-    if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(b_array, cxt);
-        oneapi::mkl::free_shared(c_array, cxt);
-        oneapi::mkl::free_shared(c_ref_array, cxt);
-        return false;
-    }
-
-    for (i = 0; i < batch_size; i++) {
-        a_array[i] = &A[i * stride_a];
-        b_array[i] = &B[i * stride_b];
-        c_array[i] = &C[i * stride_c];
-        c_ref_array[i] = &C_ref[i * stride_c];
-    }
-
-    rand_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_a * batch_size, 1, stride_a * batch_size);
-    rand_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_b * batch_size, 1, stride_b * batch_size);
-    rand_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size);
-    copy_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size, C_ref);
-
-    // Call reference OMATADD_BATCH_STRIDE.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        omatadd_ref(layout, transa, transb, m_ref, n_ref, alpha, a_array[i], lda_ref, beta,
-                    b_array[i], ldb_ref, c_ref_array[i], ldc_ref);
-    }
-
-    // Call DPC++ OMATADD_BATCH_STRIDE
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::omatadd_batch(
-                    main_queue, transa, transb, m, n, alpha, &A[0], lda, stride_a, beta, &B[0], ldb,
-                    stride_b, &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::omatadd_batch(
-                    main_queue, transa, transb, m, n, alpha, &A[0], lda, stride_a, beta, &B[0], ldb,
-                    stride_b, &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatadd_batch,
-                                        transa, transb, m, n, alpha, &A[0], lda, stride_a, beta,
-                                        &B[0], ldb, stride_b, &C[0], ldc, stride_c, batch_size,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatadd_batch,
-                                        transa, transb, m, n, alpha, &A[0], lda, stride_a, beta,
-                                        &B[0], ldb, stride_b, &C[0], ldc, stride_c, batch_size,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATADD_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(b_array, cxt);
-        oneapi::mkl::free_shared(c_array, cxt);
-        oneapi::mkl::free_shared(c_ref_array, cxt);
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATADD_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(C, C_ref, oneapi::mkl::layout::col_major, stride_c * batch_size,
-                                   1, stride_c * batch_size, 10, std::cout);
-
-    oneapi::mkl::free_shared(a_array, cxt);
-    oneapi::mkl::free_shared(b_array, cxt);
-    oneapi::mkl::free_shared(c_array, cxt);
-    oneapi::mkl::free_shared(c_ref_array, cxt);
-
-    return (int)good;
-}
-
-class OmataddBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmataddBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmataddBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmataddBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmataddBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmataddBatchStrideUsmTestSuite, OmataddBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp b/tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp
deleted file mode 100644
index d08329fc6..000000000
--- a/tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t stride_a, stride_b;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = lda * n;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = lda * m;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(stride_a * batch_size), B(stride_b * batch_size),
-        B_ref(stride_b * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_matrix(A.data() + stride_a * i, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-        rand_matrix(B.data() + stride_b * i, layout, trans, m, n, ldb);
-    }
-
-    // Call reference OMATCOPY_BATCH_STRIDE.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        omatcopy_ref(layout, trans, m_ref, n_ref, alpha, A.data() + stride_a * i, lda_ref,
-                     B_ref.data() + stride_b * i, ldb_ref);
-    }
-
-    // Call DPC++ OMATCOPY_BATCH_STRIDE
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::omatcopy_batch(main_queue, trans, m, n, alpha,
-                                                                A_buffer, lda, stride_a, B_buffer,
-                                                                ldb, stride_b, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::omatcopy_batch(main_queue, trans, m, n, alpha,
-                                                             A_buffer, lda, stride_a, B_buffer, ldb,
-                                                             stride_b, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy_batch,
-                                        trans, m, n, alpha, A_buffer, lda, stride_a, B_buffer, ldb,
-                                        stride_b, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy_batch,
-                                        trans, m, n, alpha, A_buffer, lda, stride_a, B_buffer, ldb,
-                                        stride_b, batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATCOPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto B_accessor = B_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(B_accessor, B_ref, oneapi::mkl::layout::col_major,
-                                   stride_b * batch_size, 1, stride_b * batch_size, 10, std::cout);
-
-    return (int)good;
-}
-
-class OmatcopyBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmatcopyBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmatcopyBatchStrideTestSuite, OmatcopyBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp
deleted file mode 100644
index 7479b57db..000000000
--- a/tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t stride_a, stride_b;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = lda * n;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = lda * m;
-            stride_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), B_ref(ua);
-
-    A.resize(stride_a * batch_size);
-    B.resize(stride_b * batch_size);
-    B_ref.resize(stride_b * batch_size);
-
-    fp **a_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **b_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **b_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-
-    if ((a_array == NULL) || (b_array == NULL) || (b_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(b_array, cxt);
-        oneapi::mkl::free_shared(b_ref_array, cxt);
-        return false;
-    }
-
-    for (i = 0; i < batch_size; i++) {
-        a_array[i] = &A[i * stride_a];
-        b_array[i] = &B[i * stride_b];
-        b_ref_array[i] = &B_ref[i * stride_b];
-    }
-
-    rand_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_a * batch_size, 1, stride_a * batch_size);
-    rand_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_b * batch_size, 1, stride_b * batch_size);
-    copy_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_b * batch_size, 1, stride_b * batch_size, B_ref);
-
-    // Call reference OMATCOPY_BATCH_STRIDE.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        omatcopy_ref(layout, trans, m_ref, n_ref, alpha, a_array[i], lda_ref, b_ref_array[i],
-                     ldb_ref);
-    }
-
-    // Call DPC++ OMATCOPY_BATCH_STRIDE
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::omatcopy_batch(
-                    main_queue, trans, m, n, alpha, &A[0], lda, stride_a, &B[0], ldb, stride_b,
-                    batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::omatcopy_batch(
-                    main_queue, trans, m, n, alpha, &A[0], lda, stride_a, &B[0], ldb, stride_b,
-                    batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy_batch,
-                                        trans, m, n, alpha, &A[0], lda, stride_a, &B[0], ldb,
-                                        stride_b, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy_batch,
-                                        trans, m, n, alpha, &A[0], lda, stride_a, &B[0], ldb,
-                                        stride_b, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(b_array, cxt);
-        oneapi::mkl::free_shared(b_ref_array, cxt);
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATCOPY_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(B, B_ref, oneapi::mkl::layout::col_major, stride_b * batch_size,
-                                   1, stride_b * batch_size, 10, std::cout);
-
-    oneapi::mkl::free_shared(a_array, cxt);
-    oneapi::mkl::free_shared(b_array, cxt);
-    oneapi::mkl::free_shared(b_ref_array, cxt);
-
-    return (int)good;
-}
-
-class OmatcopyBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmatcopyBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmatcopyBatchStrideUsmTestSuite, OmatcopyBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp b/tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp
deleted file mode 100644
index 7f1e4a103..000000000
--- a/tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto uaint = usm_allocator<int64_t, usm::alloc::shared, 64>(cxt, *dev);
-    vector<int64_t, decltype(uaint)> m(uaint), n(uaint), lda(uaint), ldb(uaint), group_size(uaint);
-
-    auto uatranspose = usm_allocator<oneapi::mkl::transpose, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::transpose, decltype(uatranspose)> trans(uatranspose);
-
-    auto uafp = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(uafp)> alpha(uafp);
-
-    m.resize(group_count);
-    n.resize(group_count);
-    lda.resize(group_count);
-    ldb.resize(group_count);
-    group_size.resize(group_count);
-    trans.resize(group_count);
-    alpha.resize(group_count);
-
-    int64_t i, tmp;
-    int64_t j, idx = 0;
-    int64_t total_batch_count = 0;
-    int64_t size_a = 0, size_b = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i] = 1 + std::rand() % 50;
-        n[i] = 1 + std::rand() % 50;
-        lda[i] = std::max(m[i], n[i]);
-        ldb[i] = std::max(m[i], n[i]);
-        alpha[i] = rand_scalar<fp>();
-        trans[i] = rand_trans<fp>();
-        total_batch_count += group_size[i];
-    }
-
-    auto uafpp = usm_allocator<fp *, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp *, decltype(uafpp)> a_array(uafpp), b_array(uafpp), b_ref_array(uafpp);
-
-    a_array.resize(total_batch_count);
-    b_array.resize(total_batch_count);
-    b_ref_array.resize(total_batch_count);
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                size_a = lda[i] * n[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * n[i] : ldb[i] * m[i];
-                break;
-            case oneapi::mkl::layout::row_major:
-                size_a = lda[i] * m[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * m[i] : ldb[i] * n[i];
-                break;
-            default: break;
-        }
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt);
-            b_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt);
-            b_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt);
-            rand_matrix(a_array[idx], oneapi::mkl::layout::col_major,
-                        oneapi::mkl::transpose::nontrans, size_a, 1, size_a);
-            rand_matrix(b_array[idx], oneapi::mkl::layout::col_major,
-                        oneapi::mkl::transpose::nontrans, size_b, 1, size_b);
-            copy_matrix(b_array[idx], oneapi::mkl::layout::col_major,
-                        oneapi::mkl::transpose::nontrans, size_b, 1, size_b, b_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference OMATCOPY
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        int m_ref = (int)m[i];
-        int n_ref = (int)n[i];
-        int lda_ref = (int)lda[i];
-        int ldb_ref = (int)ldb[i];
-        int group_size_ref = (int)group_size[i];
-        for (j = 0; j < group_size_ref; j++) {
-            omatcopy_ref(layout, trans[i], m_ref, n_ref, alpha[i], a_array[idx], lda_ref,
-                         b_ref_array[idx], ldb_ref);
-            idx++;
-        }
-    }
-
-    // Call DPC++ OMATCOPY_BATCH
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::omatcopy_batch(
-                    main_queue, trans.data(), m.data(), n.data(), alpha.data(),
-                    (const fp **)a_array.data(), lda.data(), b_array.data(), ldb.data(),
-                    group_count, group_size.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::omatcopy_batch(
-                    main_queue, trans.data(), m.data(), n.data(), alpha.data(),
-                    (const fp **)a_array.data(), lda.data(), b_array.data(), ldb.data(),
-                    group_count, group_size.data(), dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy_batch,
-                                        trans.data(), m.data(), n.data(), alpha.data(),
-                                        (const fp **)a_array.data(), lda.data(), b_array.data(),
-                                        ldb.data(), group_count, group_size.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy_batch,
-                                        trans.data(), m.data(), n.data(), alpha.data(),
-                                        (const fp **)a_array.data(), lda.data(), b_array.data(),
-                                        ldb.data(), group_count, group_size.data(), dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATCOPY_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(b_array[idx], cxt);
-                oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATCOPY_BATCH:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = true;
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                size_a = lda[i] * n[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * n[i] : ldb[i] * m[i];
-                break;
-            case oneapi::mkl::layout::row_major:
-                size_a = lda[i] * m[i];
-                size_b =
-                    (trans[i] == oneapi::mkl::transpose::nontrans) ? ldb[i] * m[i] : ldb[i] * n[i];
-                break;
-            default: break;
-        }
-        for (j = 0; j < group_size[i]; j++) {
-            good = good && check_equal_matrix(b_array[idx], b_ref_array[idx],
-                                              oneapi::mkl::layout::col_major, size_b, 1, size_b, 10,
-                                              std::cout);
-            idx++;
-        }
-    }
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(a_array[idx], cxt);
-            oneapi::mkl::free_shared(b_array[idx], cxt);
-            oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-
-    return (int)good;
-}
-
-class OmatcopyBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmatcopyBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(OmatcopyBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmatcopyBatchUsmTestSuite, OmatcopyBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/syrk_batch_stride.cpp b/tests/unit_tests/blas/batch/syrk_batch_stride.cpp
deleted file mode 100644
index 58dc4d7dc..000000000
--- a/tests/unit_tests/blas/batch/syrk_batch_stride.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Prepare data.
-    int64_t n, k;
-    int64_t lda, ldc;
-    oneapi::mkl::uplo upper_lower;
-    oneapi::mkl::transpose trans;
-    fp alpha, beta;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    n = 1 + std::rand() % 500;
-    k = 1 + std::rand() % 500;
-    lda = std::max(n, k);
-    ldc = std::max(n, n);
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-
-    upper_lower = (oneapi::mkl::uplo)(std::rand() % 2);
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans
-                                       : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans
-                                                                : oneapi::mkl::transpose::conjtrans;
-    }
-    else {
-        trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans
-                                       : oneapi::mkl::transpose::trans;
-    }
-
-    int64_t stride_a, stride_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = (trans == oneapi::mkl::transpose::nontrans) ? lda * k : lda * n;
-            stride_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = (trans == oneapi::mkl::transpose::nontrans) ? lda * n : lda * k;
-            stride_c = ldc * n;
-            break;
-        default: break;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(stride_a * batch_size);
-    vector<fp, allocator_helper<fp, 64>> C(stride_c * batch_size), C_ref(stride_c * batch_size);
-
-    for (i = 0; i < batch_size; i++) {
-        rand_matrix(A.data() + stride_a * i, layout, trans, n, k, lda);
-        rand_matrix(C.data() + stride_c * i, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-    }
-
-    C_ref = C;
-
-    // Call reference SYRK_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref = (int)n;
-    int k_ref = (int)k;
-    int lda_ref = (int)lda;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-
-    for (i = 0; i < batch_size_ref; i++) {
-        ::syrk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-               convert_to_cblas_trans(trans), (const int *)&n_ref, (const int *)&k_ref,
-               (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i),
-               (const int *)&lda_ref, (const fp_ref *)&beta,
-               (fp_ref *)(C_ref.data() + stride_c * i), (const int *)&ldc_ref);
-    }
-
-    // Call DPC++ SYRK_BATCH_STRIDE.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYRK_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::syrk_batch(main_queue, upper_lower, trans, n, k,
-                                                            alpha, A_buffer, lda, stride_a, beta,
-                                                            C_buffer, ldc, stride_c, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::syrk_batch(main_queue, upper_lower, trans, n, k,
-                                                         alpha, A_buffer, lda, stride_a, beta,
-                                                         C_buffer, ldc, stride_c, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syrk_batch,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, stride_a,
-                                        beta, C_buffer, ldc, stride_c, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syrk_batch,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, stride_a,
-                                        beta, C_buffer, ldc, stride_c, batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYRK_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYRK_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, oneapi::mkl::layout::col_major, stride_c * batch_size,
-                           1, stride_c * batch_size, 10 * k, std::cout);
-
-    return (int)good;
-}
-
-class SyrkBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SyrkBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(SyrkBatchStrideTestSuite, SyrkBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp
deleted file mode 100644
index 31aa09b79..000000000
--- a/tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYRK_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t n, k;
-    int64_t lda, ldc;
-    oneapi::mkl::uplo upper_lower;
-    oneapi::mkl::transpose trans;
-    fp alpha, beta;
-
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    n = 1 + std::rand() % 500;
-    k = 1 + std::rand() % 500;
-    lda = std::max(n, k);
-    ldc = std::max(n, n);
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-    upper_lower = (oneapi::mkl::uplo)(std::rand() % 2);
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans
-                                       : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans
-                                                                : oneapi::mkl::transpose::conjtrans;
-    }
-    else {
-        trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans
-                                       : oneapi::mkl::transpose::trans;
-    }
-
-    int64_t stride_a, stride_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            stride_a = (trans == oneapi::mkl::transpose::nontrans) ? lda * k : lda * n;
-            stride_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            stride_a = (trans == oneapi::mkl::transpose::nontrans) ? lda * n : lda * k;
-            stride_c = ldc * n;
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), C(ua), C_ref(ua);
-
-    A.resize(stride_a * batch_size);
-    C.resize(stride_c * batch_size);
-    C_ref.resize(stride_c * batch_size);
-
-    fp **a_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **c_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-    fp **c_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt);
-
-    if ((a_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) {
-        std::cout << "Error cannot allocate arrays of pointers\n";
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(c_array, cxt);
-        oneapi::mkl::free_shared(c_ref_array, cxt);
-        return false;
-    }
-
-    for (i = 0; i < batch_size; i++) {
-        a_array[i] = &A[i * stride_a];
-        c_array[i] = &C[i * stride_c];
-        c_ref_array[i] = &C_ref[i * stride_c];
-    }
-
-    rand_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_a * batch_size, 1, stride_a * batch_size);
-    rand_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size);
-    copy_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans,
-                stride_c * batch_size, 1, stride_c * batch_size, C_ref);
-
-    // Call reference SYRK_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int n_ref = (int)n;
-    int k_ref = (int)k;
-    int lda_ref = (int)lda;
-    int ldc_ref = (int)ldc;
-    int batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        ::syrk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-               convert_to_cblas_trans(trans), (const int *)&n_ref, (const int *)&k_ref,
-               (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i),
-               (const int *)&lda_ref, (const fp_ref *)&beta,
-               (fp_ref *)(C_ref.data() + stride_c * i), (const int *)&ldc_ref);
-    }
-
-    // Call DPC++ SYRK_BATCH_STRIDE.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::syrk_batch(
-                    main_queue, upper_lower, trans, n, k, alpha, &A[0], lda, stride_a, beta, &C[0],
-                    ldc, stride_c, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::syrk_batch(
-                    main_queue, upper_lower, trans, n, k, alpha, &A[0], lda, stride_a, beta, &C[0],
-                    ldc, stride_c, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syrk_batch,
-                                        upper_lower, trans, n, k, alpha, &A[0], lda, stride_a, beta,
-                                        &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syrk_batch,
-                                        upper_lower, trans, n, k, alpha, &A[0], lda, stride_a, beta,
-                                        &C[0], ldc, stride_c, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYRK_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::free_shared(a_array, cxt);
-        oneapi::mkl::free_shared(c_array, cxt);
-        oneapi::mkl::free_shared(c_ref_array, cxt);
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYRK_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(C, C_ref, oneapi::mkl::layout::col_major, stride_c * batch_size,
-                                   1, stride_c * batch_size, 10 * k, std::cout);
-
-    oneapi::mkl::free_shared(a_array, cxt);
-    oneapi::mkl::free_shared(c_array, cxt);
-    oneapi::mkl::free_shared(c_ref_array, cxt);
-
-    return (int)good;
-}
-
-class SyrkBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SyrkBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(SyrkBatchStrideUsmTestSuite, SyrkBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/syrk_batch_usm.cpp b/tests/unit_tests/blas/batch/syrk_batch_usm.cpp
deleted file mode 100644
index 36d0d6dd5..000000000
--- a/tests/unit_tests/blas/batch/syrk_batch_usm.cpp
+++ /dev/null
@@ -1,334 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "allocator_helper.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYRK_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto uaint = usm_allocator<int64_t, usm::alloc::shared, 64>(cxt, *dev);
-    vector<int64_t, decltype(uaint)> n(uaint), k(uaint), lda(uaint), ldc(uaint), group_size(uaint);
-
-    auto uauplo = usm_allocator<oneapi::mkl::uplo, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::uplo, decltype(uauplo)> upper_lower(uauplo);
-
-    auto uatranspose = usm_allocator<oneapi::mkl::transpose, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::transpose, decltype(uatranspose)> trans(uatranspose);
-
-    auto uafp = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(uafp)> alpha(uafp), beta(uafp);
-
-    n.resize(group_count);
-    k.resize(group_count);
-    lda.resize(group_count);
-    ldc.resize(group_count);
-    group_size.resize(group_count);
-    trans.resize(group_count);
-    upper_lower.resize(group_count);
-    alpha.resize(group_count);
-    beta.resize(group_count);
-
-    int64_t i, tmp;
-    int64_t j, idx = 0;
-    int64_t total_batch_count = 0;
-    int64_t size_a = 0, size_c = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        n[i] = 1 + std::rand() % 500;
-        k[i] = 1 + std::rand() % 500;
-        lda[i] = std::max(n[i], k[i]);
-        ldc[i] = std::max(n[i], n[i]);
-        alpha[i] = rand_scalar<fp>();
-        beta[i] = rand_scalar<fp>();
-        upper_lower[i] = (oneapi::mkl::uplo)(std::rand() % 2);
-        if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-            trans[i] = (std::rand() % 2) == 0
-                           ? oneapi::mkl::transpose::nontrans
-                           : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans
-                                                    : oneapi::mkl::transpose::conjtrans;
-        }
-        else {
-            trans[i] = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans
-                                              : oneapi::mkl::transpose::trans;
-        }
-        total_batch_count += group_size[i];
-    }
-
-    auto uafpp = usm_allocator<fp *, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp *, decltype(uafpp)> a_array(uafpp), c_array(uafpp), c_ref_array(uafpp);
-    a_array.resize(total_batch_count);
-    c_array.resize(total_batch_count);
-    c_ref_array.resize(total_batch_count);
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                size_a = lda[i] * ((trans[i] == oneapi::mkl::transpose::nontrans) ? k[i] : n[i]);
-                size_c = ldc[i] * n[i];
-                break;
-            case oneapi::mkl::layout::row_major:
-                size_a = lda[i] * ((trans[i] == oneapi::mkl::transpose::nontrans) ? n[i] : k[i]);
-                size_c = ldc[i] * n[i];
-                break;
-            default: break;
-        }
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt);
-            c_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt);
-            c_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt);
-            rand_matrix(a_array[idx], layout, trans[i], n[i], k[i], lda[i]);
-            rand_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, n[i], n[i], ldc[i]);
-            copy_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, n[i], n[i], ldc[i],
-                        c_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference SYRK_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *k_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *ldc_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-
-    CBLAS_UPLO *upper_lower_ref =
-        (CBLAS_UPLO *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_UPLO) * group_count);
-    CBLAS_TRANSPOSE *trans_ref =
-        (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count);
-
-    if ((n_ref == NULL) || (k_ref == NULL) || (lda_ref == NULL) || (ldc_ref == NULL) ||
-        (trans_ref == NULL) || (upper_lower_ref == NULL) || (group_size_ref == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(k_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(ldc_ref);
-        oneapi::mkl::aligned_free(trans_ref);
-        oneapi::mkl::aligned_free(upper_lower_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(c_array[idx], cxt);
-                oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return false;
-    }
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        trans_ref[i] = convert_to_cblas_trans(trans[i]);
-        upper_lower_ref[i] = convert_to_cblas_uplo(upper_lower[i]);
-        n_ref[i] = (int)n[i];
-        k_ref[i] = (int)k[i];
-        lda_ref[i] = (int)lda[i];
-        ldc_ref[i] = (int)ldc[i];
-        group_size_ref[i] = (int)group_size[i];
-        for (j = 0; j < group_size_ref[i]; j++) {
-            ::syrk(convert_to_cblas_layout(layout), upper_lower_ref[i], trans_ref[i],
-                   (const int *)&n_ref[i], (const int *)&k_ref[i], (const fp_ref *)&alpha[i],
-                   (const fp_ref *)a_array[idx], (const int *)&lda_ref[i], (const fp_ref *)&beta[i],
-                   (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref[i]);
-            idx++;
-        }
-    }
-
-    // Call DPC++ SYRK_BATCH.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::syrk_batch(
-                    main_queue, &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0],
-                    (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count,
-                    &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::syrk_batch(
-                    main_queue, &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0],
-                    (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count,
-                    &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syrk_batch,
-                                        &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0],
-                                        (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0],
-                                        &ldc[0], group_count, &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syrk_batch,
-                                        &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0],
-                                        (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0],
-                                        &ldc[0], group_count, &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYRK_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(k_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(ldc_ref);
-        oneapi::mkl::aligned_free(upper_lower_ref);
-        oneapi::mkl::aligned_free(trans_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(c_array[idx], cxt);
-                oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYRK_BATCH:\n" << error.what() << std::endl;
-    }
-
-    bool good = true;
-    // Compare the results of reference implementation and DPC++ implementation.
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            good = good && check_equal_matrix(c_array[idx], c_ref_array[idx], layout, n[i], n[i],
-                                              ldc[i], 10 * k[i], std::cout);
-            idx++;
-        }
-    }
-    oneapi::mkl::aligned_free(n_ref);
-    oneapi::mkl::aligned_free(k_ref);
-    oneapi::mkl::aligned_free(lda_ref);
-    oneapi::mkl::aligned_free(ldc_ref);
-    oneapi::mkl::aligned_free(upper_lower_ref);
-    oneapi::mkl::aligned_free(trans_ref);
-    oneapi::mkl::aligned_free(group_size_ref);
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(a_array[idx], cxt);
-            oneapi::mkl::free_shared(c_array[idx], cxt);
-            oneapi::mkl::free_shared(c_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-
-    return (int)good;
-}
-
-class SyrkBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SyrkBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(SyrkBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(SyrkBatchUsmTestSuite, SyrkBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp b/tests/unit_tests/blas/batch/trsm_batch_stride.cpp
deleted file mode 100644
index cde6aa367..000000000
--- a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    oneapi::mkl::side left_right;
-    oneapi::mkl::uplo upper_lower;
-    oneapi::mkl::diag unit_nonunit;
-    fp alpha;
-    int64_t batch_size;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(n, m);
-    alpha = rand_scalar<fp>();
-
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        trans = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-    else {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            trans = oneapi::mkl::transpose::conjtrans;
-        else
-            trans = (oneapi::mkl::transpose)tmp;
-    }
-    left_right = (oneapi::mkl::side)(std::rand() % 2);
-    upper_lower = (oneapi::mkl::uplo)(std::rand() % 2);
-    unit_nonunit = (oneapi::mkl::diag)(std::rand() % 2);
-
-    int64_t stride_a, stride_b;
-    int64_t total_size_b;
-
-    stride_a = (left_right == oneapi::mkl::side::left) ? lda * m : lda * n;
-    switch (layout) {
-        case oneapi::mkl::layout::col_major: stride_b = ldb * n; break;
-        case oneapi::mkl::layout::row_major: stride_b = ldb * m; break;
-        default: break;
-    }
-    total_size_b = batch_size * stride_b;
-
-    vector<fp, allocator_helper<fp, 64>> A(batch_size * stride_a), B(total_size_b),
-        B_ref(total_size_b);
-
-    for (i = 0; i < batch_size; i++) {
-        if (left_right == oneapi::mkl::side::left)
-            rand_trsm_matrix(A.data() + stride_a * i, layout, trans, m, m, lda);
-        else
-            rand_trsm_matrix(A.data() + stride_a * i, layout, trans, n, n, lda);
-        rand_matrix(B.data() + stride_b * i, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    }
-
-    B_ref = B;
-
-    // Call reference TRSM_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref, n_ref, lda_ref, ldb_ref, batch_size_ref;
-    m_ref = (int)m;
-    n_ref = (int)n;
-    lda_ref = (int)lda;
-    ldb_ref = (int)ldb;
-    batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        ::trsm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-               convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-               convert_to_cblas_diag(unit_nonunit), (const int *)&m_ref, (const int *)&n_ref,
-               (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i),
-               (const int *)&lda_ref, (fp_ref *)(B_ref.data() + stride_b * i),
-               (const int *)&ldb_ref);
-    }
-
-    // Call DPC++ TRSM_BATCH_STRIDE.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::trsm_batch(
-                    main_queue, left_right, upper_lower, trans, unit_nonunit, m, n, alpha, A_buffer,
-                    lda, stride_a, B_buffer, ldb, stride_b, batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::trsm_batch(
-                    main_queue, left_right, upper_lower, trans, unit_nonunit, m, n, alpha, A_buffer,
-                    lda, stride_a, B_buffer, ldb, stride_b, batch_size);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsm_batch,
-                                        left_right, upper_lower, trans, unit_nonunit, m, n, alpha,
-                                        A_buffer, lda, stride_a, B_buffer, ldb, stride_b,
-                                        batch_size);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsm_batch,
-                                        left_right, upper_lower, trans, unit_nonunit, m, n, alpha,
-                                        A_buffer, lda, stride_a, B_buffer, ldb, stride_b,
-                                        batch_size);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during TRSM_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of TRSM_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto B_accessor = B_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_trsm_matrix(B_accessor, B_ref, oneapi::mkl::layout::col_major, total_size_b, 1,
-                                total_size_b, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-class TrsmBatchStrideTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(TrsmBatchStrideTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(TrsmBatchStrideTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(TrsmBatchStrideTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(TrsmBatchStrideTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsmBatchStrideTestSuite, TrsmBatchStrideTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp
deleted file mode 100644
index d99836f87..000000000
--- a/tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp
+++ /dev/null
@@ -1,234 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH_STRIDE:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    oneapi::mkl::side left_right;
-    oneapi::mkl::uplo upper_lower;
-    oneapi::mkl::diag unit_nonunit;
-    fp alpha;
-    int64_t batch_size;
-    int64_t i, tmp;
-
-    batch_size = 1 + std::rand() % 20;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(n, m);
-    alpha = rand_scalar<fp>();
-
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        trans = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-    else {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            trans = oneapi::mkl::transpose::conjtrans;
-        else
-            trans = (oneapi::mkl::transpose)tmp;
-    }
-    left_right = (oneapi::mkl::side)(std::rand() % 2);
-    upper_lower = (oneapi::mkl::uplo)(std::rand() % 2);
-    unit_nonunit = (oneapi::mkl::diag)(std::rand() % 2);
-
-    int64_t stride_a, stride_b;
-    int64_t total_size_b;
-
-    stride_a = (left_right == oneapi::mkl::side::left) ? lda * m : lda * n;
-    stride_b = (layout == oneapi::mkl::layout::col_major) ? ldb * n : ldb * m;
-
-    total_size_b = batch_size * stride_b;
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), B_ref(ua);
-
-    A.resize(stride_a * batch_size);
-    B.resize(total_size_b);
-    B_ref.resize(total_size_b);
-
-    for (i = 0; i < batch_size; i++) {
-        if (left_right == oneapi::mkl::side::left)
-            rand_trsm_matrix(&A[stride_a * i], layout, trans, m, m, lda);
-        else
-            rand_trsm_matrix(&A[stride_a * i], layout, trans, n, n, lda);
-        rand_matrix(&B[stride_b * i], layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    }
-
-    copy_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, total_size_b,
-                1, total_size_b, B_ref);
-
-    // Call reference TRSM_BATCH_STRIDE.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int m_ref, n_ref, lda_ref, ldb_ref, batch_size_ref;
-    m_ref = (int)m;
-    n_ref = (int)n;
-    lda_ref = (int)lda;
-    ldb_ref = (int)ldb;
-    batch_size_ref = (int)batch_size;
-    for (i = 0; i < batch_size_ref; i++) {
-        ::trsm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-               convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans),
-               convert_to_cblas_diag(unit_nonunit), (const int *)&m_ref, (const int *)&n_ref,
-               (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i),
-               (const int *)&lda_ref, (fp_ref *)(B_ref.data() + stride_b * i),
-               (const int *)&ldb_ref);
-    }
-
-    // Call DPC++ TRSM_BATCH_STRIDE.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::trsm_batch(
-                    main_queue, left_right, upper_lower, trans, unit_nonunit, m, n, alpha, &A[0],
-                    lda, stride_a, &B[0], ldb, stride_b, batch_size, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::trsm_batch(
-                    main_queue, left_right, upper_lower, trans, unit_nonunit, m, n, alpha, &A[0],
-                    lda, stride_a, &B[0], ldb, stride_b, batch_size, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsm_batch,
-                                        left_right, upper_lower, trans, unit_nonunit, m, n, alpha,
-                                        &A[0], lda, stride_a, &B[0], ldb, stride_b, batch_size,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsm_batch,
-                                        left_right, upper_lower, trans, unit_nonunit, m, n, alpha,
-                                        &A[0], lda, stride_a, &B[0], ldb, stride_b, batch_size,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during TRSM_BATCH_STRIDE:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of TRSM_BATCH_STRIDE:\n"
-                  << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_trsm_matrix(B, B_ref, oneapi::mkl::layout::col_major, total_size_b, 1,
-                                        total_size_b, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-class TrsmBatchStrideUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(TrsmBatchStrideUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(TrsmBatchStrideUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(TrsmBatchStrideUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(TrsmBatchStrideUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsmBatchStrideUsmTestSuite, TrsmBatchStrideUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/batch/trsm_batch_usm.cpp b/tests/unit_tests/blas/batch/trsm_batch_usm.cpp
deleted file mode 100644
index 747f59433..000000000
--- a/tests/unit_tests/blas/batch/trsm_batch_usm.cpp
+++ /dev/null
@@ -1,352 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "allocator_helper.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto uaint = usm_allocator<int64_t, usm::alloc::shared, 64>(cxt, *dev);
-    vector<int64_t, decltype(uaint)> m(uaint), n(uaint), lda(uaint), ldb(uaint), group_size(uaint);
-
-    auto uatranspose = usm_allocator<oneapi::mkl::transpose, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::transpose, decltype(uatranspose)> trans(uatranspose);
-
-    auto uaside = usm_allocator<oneapi::mkl::side, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::side, decltype(uaside)> left_right(uaside);
-
-    auto uauplo = usm_allocator<oneapi::mkl::uplo, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::uplo, decltype(uauplo)> upper_lower(uauplo);
-
-    auto uadiag = usm_allocator<oneapi::mkl::diag, usm::alloc::shared, 64>(cxt, *dev);
-    vector<oneapi::mkl::diag, decltype(uadiag)> unit_nonunit(uadiag);
-
-    auto uafp = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(uafp)> alpha(uafp);
-
-    m.resize(group_count);
-    n.resize(group_count);
-    lda.resize(group_count);
-    ldb.resize(group_count);
-    group_size.resize(group_count);
-    trans.resize(group_count);
-    left_right.resize(group_count);
-    upper_lower.resize(group_count);
-    unit_nonunit.resize(group_count);
-    alpha.resize(group_count);
-
-    int64_t i, tmp;
-    int64_t j, idx = 0;
-    int64_t total_batch_count = 0;
-    int64_t size_a = 0, size_b = 0;
-    int64_t Arank = 0;
-
-    for (i = 0; i < group_count; i++) {
-        group_size[i] = 1 + std::rand() % 20;
-        m[i] = 1 + std::rand() % 50;
-        n[i] = 1 + std::rand() % 50;
-        lda[i] = std::max(m[i], n[i]);
-        ldb[i] = std::max(n[i], m[i]);
-        alpha[i] = rand_scalar<fp>();
-        if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-            trans[i] = (oneapi::mkl::transpose)(std::rand() % 2);
-        }
-        else {
-            tmp = std::rand() % 3;
-            if (tmp == 2)
-                trans[i] = oneapi::mkl::transpose::conjtrans;
-            else
-                trans[i] = (oneapi::mkl::transpose)tmp;
-        }
-        left_right[i] = (oneapi::mkl::side)(std::rand() % 2);
-        upper_lower[i] = (oneapi::mkl::uplo)(std::rand() % 2);
-        unit_nonunit[i] = (oneapi::mkl::diag)(std::rand() % 2);
-
-        total_batch_count += group_size[i];
-    }
-
-    auto uafpp = usm_allocator<fp *, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp *, decltype(uafpp)> a_array(uafpp), b_array(uafpp), b_ref_array(uafpp);
-
-    a_array.resize(total_batch_count);
-    b_array.resize(total_batch_count);
-    b_ref_array.resize(total_batch_count);
-
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        size_a = lda[i] * (left_right[i] == oneapi::mkl::side::left ? m[i] : n[i]);
-        Arank = left_right[i] == oneapi::mkl::side::left ? m[i] : n[i];
-        size_b = ldb[i] * ((layout == oneapi::mkl::layout::col_major) ? n[i] : m[i]);
-        for (j = 0; j < group_size[i]; j++) {
-            a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt);
-            b_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt);
-            b_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt);
-            rand_trsm_matrix(a_array[idx], layout, trans[i], Arank, Arank, lda[i]);
-            rand_matrix(b_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldb[i]);
-            copy_matrix(b_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldb[i],
-                        b_ref_array[idx]);
-            idx++;
-        }
-    }
-
-    // Call reference TRSM_BATCH.
-    using fp_ref = typename ref_type_info<fp>::type;
-    int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *ldb_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-    int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count);
-
-    CBLAS_TRANSPOSE *trans_ref =
-        (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count);
-    CBLAS_SIDE *left_right_ref =
-        (CBLAS_SIDE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_SIDE) * group_count);
-    CBLAS_UPLO *upper_lower_ref =
-        (CBLAS_UPLO *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_UPLO) * group_count);
-    CBLAS_DIAG *unit_nonunit_ref =
-        (CBLAS_DIAG *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_DIAG) * group_count);
-
-    if ((m_ref == NULL) || (n_ref == NULL) || (lda_ref == NULL) || (ldb_ref == NULL) ||
-        (trans_ref == NULL) || (left_right_ref == NULL) || (upper_lower_ref == NULL) ||
-        (unit_nonunit_ref == NULL) || (group_size_ref == NULL)) {
-        std::cout << "Error cannot allocate input arrays\n";
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(ldb_ref);
-        oneapi::mkl::aligned_free(trans_ref);
-        oneapi::mkl::aligned_free(left_right_ref);
-        oneapi::mkl::aligned_free(upper_lower_ref);
-        oneapi::mkl::aligned_free(unit_nonunit_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(b_array[idx], cxt);
-                oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return false;
-    }
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        trans_ref[i] = convert_to_cblas_trans(trans[i]);
-        left_right_ref[i] = convert_to_cblas_side(left_right[i]);
-        upper_lower_ref[i] = convert_to_cblas_uplo(upper_lower[i]);
-        unit_nonunit_ref[i] = convert_to_cblas_diag(unit_nonunit[i]);
-        m_ref[i] = (int)m[i];
-        n_ref[i] = (int)n[i];
-        lda_ref[i] = (int)lda[i];
-        ldb_ref[i] = (int)ldb[i];
-        group_size_ref[i] = (int)group_size[i];
-        for (j = 0; j < group_size_ref[i]; j++) {
-            ::trsm(convert_to_cblas_layout(layout), left_right_ref[i], upper_lower_ref[i],
-                   trans_ref[i], unit_nonunit_ref[i], (const int *)&m_ref[i],
-                   (const int *)&n_ref[i], (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx],
-                   (const int *)&lda_ref[i], b_ref_array[idx], (const int *)&ldb_ref[i]);
-            idx++;
-        }
-    }
-
-    // Call DPC++ TRSM_BATCH.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::trsm_batch(
-                    main_queue, &left_right[0], &upper_lower[0], &trans[0], &unit_nonunit[0], &m[0],
-                    &n[0], &alpha[0], (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0],
-                    group_count, &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::trsm_batch(
-                    main_queue, &left_right[0], &upper_lower[0], &trans[0], &unit_nonunit[0], &m[0],
-                    &n[0], &alpha[0], (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0],
-                    group_count, &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsm_batch,
-                                        &left_right[0], &upper_lower[0], &trans[0],
-                                        &unit_nonunit[0], &m[0], &n[0], &alpha[0],
-                                        (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0],
-                                        group_count, &group_size[0], dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsm_batch,
-                                        &left_right[0], &upper_lower[0], &trans[0],
-                                        &unit_nonunit[0], &m[0], &n[0], &alpha[0],
-                                        (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0],
-                                        group_count, &group_size[0], dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during TRSM_BATCH:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        oneapi::mkl::aligned_free(m_ref);
-        oneapi::mkl::aligned_free(n_ref);
-        oneapi::mkl::aligned_free(lda_ref);
-        oneapi::mkl::aligned_free(ldb_ref);
-        oneapi::mkl::aligned_free(trans_ref);
-        oneapi::mkl::aligned_free(left_right_ref);
-        oneapi::mkl::aligned_free(upper_lower_ref);
-        oneapi::mkl::aligned_free(unit_nonunit_ref);
-        oneapi::mkl::aligned_free(group_size_ref);
-        idx = 0;
-        for (i = 0; i < group_count; i++) {
-            for (j = 0; j < group_size[i]; j++) {
-                oneapi::mkl::free_shared(a_array[idx], cxt);
-                oneapi::mkl::free_shared(b_array[idx], cxt);
-                oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-                idx++;
-            }
-        }
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of TRSM_BATCH:\n" << error.what() << std::endl;
-    }
-
-    bool good = true;
-    // Compare the results of reference implementation and DPC++ implementation.
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            good = good && check_equal_trsm_matrix(b_array[idx], b_ref_array[idx], layout, m[i],
-                                                   n[i], ldb[i], 10 * ldb[i], std::cout);
-            idx++;
-        }
-    }
-    oneapi::mkl::aligned_free(m_ref);
-    oneapi::mkl::aligned_free(n_ref);
-    oneapi::mkl::aligned_free(lda_ref);
-    oneapi::mkl::aligned_free(ldb_ref);
-    oneapi::mkl::aligned_free(trans_ref);
-    oneapi::mkl::aligned_free(left_right_ref);
-    oneapi::mkl::aligned_free(upper_lower_ref);
-    oneapi::mkl::aligned_free(unit_nonunit_ref);
-    oneapi::mkl::aligned_free(group_size_ref);
-    idx = 0;
-    for (i = 0; i < group_count; i++) {
-        for (j = 0; j < group_size[i]; j++) {
-            oneapi::mkl::free_shared(a_array[idx], cxt);
-            oneapi::mkl::free_shared(b_array[idx], cxt);
-            oneapi::mkl::free_shared(b_ref_array[idx], cxt);
-            idx++;
-        }
-    }
-
-    return (int)good;
-}
-
-class TrsmBatchUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(TrsmBatchUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(TrsmBatchUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(TrsmBatchUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-TEST_P(TrsmBatchUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 5));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsmBatchUsmTestSuite, TrsmBatchUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/CMakeLists.txt b/tests/unit_tests/blas/extensions/CMakeLists.txt
deleted file mode 100644
index af58e5076..000000000
--- a/tests/unit_tests/blas/extensions/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(EXTENSIONS_SOURCES "gemm_bias.cpp" "gemmt.cpp" "gemm_bias_usm.cpp" "gemmt_usm.cpp" "omatcopy.cpp" "omatcopy_usm.cpp" "imatcopy.cpp" "imatcopy_usm.cpp" "omatadd.cpp" "omatadd_usm.cpp" "omatcopy2.cpp" "omatcopy2_usm.cpp")
-
-if(BUILD_SHARED_LIBS)
-  add_library(blas_extensions_rt OBJECT ${EXTENSIONS_SOURCES})
-  target_compile_options(blas_extensions_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(blas_extensions_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-      PUBLIC ${CBLAS_INCLUDE}
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET blas_extensions_rt SOURCES ${EXTENSIONS_SOURCES})
-  else()
-    target_link_libraries(blas_extensions_rt PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-endif()
-
-add_library(blas_extensions_ct OBJECT ${EXTENSIONS_SOURCES})
-target_compile_options(blas_extensions_ct PRIVATE  -DNOMINMAX)
-target_include_directories(blas_extensions_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-    PUBLIC ${CBLAS_INCLUDE}
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET blas_extensions_ct  SOURCES ${EXTENSIONS_SOURCES})
-else()
-  target_link_libraries(blas_extensions_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
diff --git a/tests/unit_tests/blas/extensions/gemm_bias.cpp b/tests/unit_tests/blas/extensions/gemm_bias.cpp
deleted file mode 100644
index c6e99e829..000000000
--- a/tests/unit_tests/blas/extensions/gemm_bias.cpp
+++ /dev/null
@@ -1,385 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename Ts, typename Ta, typename Tb, typename Tc>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa,
-         oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, int m, int n, int k, int lda,
-         int ldb, int ldc, Ts alpha, Ts beta) {
-    // Prepare data.
-    vector<Ta, allocator_helper<Ta, 64>> A;
-    vector<Tb, allocator_helper<Tb, 64>> B;
-    vector<Tc, allocator_helper<Tc, 64>> C, C_ref, co;
-
-    Ta ao = rand_scalar<Ta>();
-    Tb bo = rand_scalar<Tb>();
-
-    rand_matrix(A, layout, transa, m, k, lda);
-    rand_matrix(B, layout, transb, k, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    if (offsetc == oneapi::mkl::offset::fix)
-        rand_matrix(co, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, 1, 1, 1);
-    if (offsetc == oneapi::mkl::offset::column)
-        rand_matrix(co, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, m, 1, m);
-    if (offsetc == oneapi::mkl::offset::row)
-        rand_matrix(co, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, n, 1, n);
-
-    C_ref = C;
-
-    // Call Reference GEMM_BIAS.
-    const int m_ref = m, n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using Ts_ref = typename ref_type_info<Ts>::type;
-    using Ta_ref = typename ref_type_info<Ta>::type;
-    using Tb_ref = typename ref_type_info<Tb>::type;
-    using Tc_ref = typename ref_type_info<Tc>::type;
-
-    ::gemm_bias(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa),
-                convert_to_cblas_trans(transb), convert_to_cblas_offset(offsetc), &m_ref, &n_ref,
-                &k_ref, (Ts_ref*)&alpha, (Ta_ref*)A.data(), &lda_ref, (Ta_ref*)&ao,
-                (Tb_ref*)B.data(), &ldb_ref, (Tb_ref*)&bo, (Ts_ref*)&beta, (Tc_ref*)C_ref.data(),
-                &ldc_ref, (Tc_ref*)co.data());
-
-    // Call DPC++ GEMM_BIAS.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_BIAS:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<Ta, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<Tb, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<Tc, 1> C_buffer(C.data(), range<1>(C.size()));
-    buffer<Tc, 1> CO_buffer(co.data(), range<1>(co.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gemm_bias(main_queue, transa, transb, offsetc, m,
-                                                           n, k, alpha, A_buffer, lda, ao, B_buffer,
-                                                           ldb, bo, beta, C_buffer, ldc, CO_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gemm_bias(main_queue, transa, transb, offsetc, m, n,
-                                                        k, alpha, A_buffer, lda, ao, B_buffer, ldb,
-                                                        bo, beta, C_buffer, ldc, CO_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm_bias,
-                                        transa, transb, offsetc, m, n, k, alpha, A_buffer, lda, ao,
-                                        B_buffer, ldb, bo, beta, C_buffer, ldc, CO_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm_bias, transa,
-                                        transb, offsetc, m, n, k, alpha, A_buffer, lda, ao,
-                                        B_buffer, ldb, bo, beta, C_buffer, ldc, CO_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM_BIAS:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of GEMM_BIAS:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(C_accessor, C_ref, layout, m, n, ldc, 10 * k, std::cout);
-
-    return (int)good;
-}
-
-class GemmBiasTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(GemmBiasTests, Int8Int8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-TEST_P(GemmBiasTests, Int8Uint8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-TEST_P(GemmBiasTests, Uint8Int8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-TEST_P(GemmBiasTests, Uint8Uint8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmBiasTestSuite, GemmBiasTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/gemm_bias_usm.cpp b/tests/unit_tests/blas/extensions/gemm_bias_usm.cpp
deleted file mode 100644
index 908eed909..000000000
--- a/tests/unit_tests/blas/extensions/gemm_bias_usm.cpp
+++ /dev/null
@@ -1,391 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename Ts, typename Ta, typename Tb, typename Tc>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa,
-         oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, int m, int n, int k, int lda,
-         int ldb, int ldc, Ts alpha, Ts beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM_BIAS:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<Ta, usm::alloc::shared, 64>(cxt, *dev);
-    auto ub = usm_allocator<Tb, usm::alloc::shared, 64>(cxt, *dev);
-    auto uc = usm_allocator<Tc, usm::alloc::shared, 64>(cxt, *dev);
-    vector<Ta, decltype(ua)> A(ua);
-    vector<Tb, decltype(ub)> B(ub);
-    vector<Tc, decltype(uc)> C(uc), C_ref(uc), co(uc);
-
-    Ta ao = rand_scalar<Ta>();
-    Tb bo = rand_scalar<Tb>();
-
-    rand_matrix(A, layout, transa, m, k, lda);
-    rand_matrix(B, layout, transb, k, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    if (offsetc == oneapi::mkl::offset::fix)
-        rand_matrix(co, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, 1, 1, 1);
-    if (offsetc == oneapi::mkl::offset::column)
-        rand_matrix(co, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, m, 1, m);
-    if (offsetc == oneapi::mkl::offset::row)
-        rand_matrix(co, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, n, 1, n);
-
-    C_ref.resize(C.size());
-    for (int i = 0; i < C.size(); i++)
-        C_ref[i] = C[i];
-
-    // Call Reference GEMM_BIAS.
-    const int m_ref = m, n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using Ts_ref = typename ref_type_info<Ts>::type;
-    using Ta_ref = typename ref_type_info<Ta>::type;
-    using Tb_ref = typename ref_type_info<Tb>::type;
-    using Tc_ref = typename ref_type_info<Tc>::type;
-
-    ::gemm_bias(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa),
-                convert_to_cblas_trans(transb), convert_to_cblas_offset(offsetc), &m_ref, &n_ref,
-                &k_ref, (Ts_ref*)&alpha, (Ta_ref*)A.data(), &lda_ref, (Ta_ref*)&ao,
-                (Tb_ref*)B.data(), &ldb_ref, (Tb_ref*)&bo, (Ts_ref*)&beta, (Tc_ref*)C_ref.data(),
-                &ldc_ref, (Tc_ref*)co.data());
-
-    // Call DPC++ GEMM_BIAS.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemm_bias(
-                    main_queue, transa, transb, offsetc, m, n, k, alpha, A.data(), lda, ao,
-                    B.data(), ldb, bo, beta, C.data(), ldc, co.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemm_bias(
-                    main_queue, transa, transb, offsetc, m, n, k, alpha, A.data(), lda, ao,
-                    B.data(), ldb, bo, beta, C.data(), ldc, co.data(), dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm_bias,
-                                        transa, transb, offsetc, m, n, k, alpha, A.data(), lda, ao,
-                                        B.data(), ldb, bo, beta, C.data(), ldc, co.data(),
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm_bias, transa,
-                                        transb, offsetc, m, n, k, alpha, A.data(), lda, ao,
-                                        B.data(), ldb, bo, beta, C.data(), ldc, co.data(),
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM_BIAS:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of GEMM_BIAS:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(C, C_ref, layout, m, n, ldc, 10 * k, std::cout);
-
-    return (int)good;
-}
-
-class GemmBiasUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(GemmBiasUsmTests, Int8Int8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-TEST_P(GemmBiasUsmTests, Int8Uint8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, int8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-TEST_P(GemmBiasUsmTests, Uint8Int8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, int8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-TEST_P(GemmBiasUsmTests, Uint8Uint8Int32Precision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::fix, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::column, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106,
-        alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, uint8_t, uint8_t, int32_t>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, oneapi::mkl::offset::row, 79, 83, 91, 103, 105, 106, alpha,
-        beta)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmBiasUsmTestSuite, GemmBiasUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/gemmt.cpp b/tests/unit_tests/blas/extensions/gemmt.cpp
deleted file mode 100644
index 228a85d33..000000000
--- a/tests/unit_tests/blas/extensions/gemmt.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int n, int k, int lda,
-         int ldb, int ldc, fp alpha, fp beta) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
-    rand_matrix(A, layout, transa, n, k, lda);
-    rand_matrix(B, layout, transb, k, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-    C_ref = C;
-
-    // Call Reference GEMMT.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gemmt(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-            convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), &n_ref, &k_ref,
-            (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref,
-            (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ GEMMT.
-
-    // Catch asynchronous exceptions
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMMT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gemmt(main_queue, upper_lower, transa, transb, n,
-                                                       k, alpha, A_buffer, lda, B_buffer, ldb, beta,
-                                                       C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gemmt(main_queue, upper_lower, transa, transb, n, k,
-                                                    alpha, A_buffer, lda, B_buffer, ldb, beta,
-                                                    C_buffer, ldc);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemmt,
-                                        upper_lower, transa, transb, n, k, alpha, A_buffer, lda,
-                                        B_buffer, ldb, beta, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemmt,
-                                        upper_lower, transa, transb, n, k, alpha, A_buffer, lda,
-                                        B_buffer, ldb, beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during GEMMT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of GEMMT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, layout, upper_lower, n, n, ldc, 10 * k, std::cout);
-
-    return (int)good;
-}
-
-class GemmtTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(GemmtTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-}
-
-TEST_P(GemmtTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-}
-
-TEST_P(GemmtTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0);
-    std::complex<float> beta(3.0);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-}
-
-TEST_P(GemmtTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0);
-    std::complex<double> beta(3.0);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmtTestSuite, GemmtTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/gemmt_usm.cpp b/tests/unit_tests/blas/extensions/gemmt_usm.cpp
deleted file mode 100644
index dac300ae2..000000000
--- a/tests/unit_tests/blas/extensions/gemmt_usm.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int n, int k, int lda,
-         int ldb, int ldc, fp alpha, fp beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMMT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
-    rand_matrix(A, layout, transa, n, k, lda);
-    rand_matrix(B, layout, transb, k, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-
-    auto C_ref = C;
-
-    // Call Reference GEMMT.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gemmt(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-            convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), &n_ref, &k_ref,
-            (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)B.data(), &ldb_ref,
-            (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ GEMMT.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemmt(
-                    main_queue, upper_lower, transa, transb, n, k, alpha, A.data(), lda, B.data(),
-                    ldb, beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemmt(main_queue, upper_lower, transa, transb,
-                                                           n, k, alpha, A.data(), lda, B.data(),
-                                                           ldb, beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemmt,
-                                        upper_lower, transa, transb, n, k, alpha, A.data(), lda,
-                                        B.data(), ldb, beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemmt,
-                                        upper_lower, transa, transb, n, k, alpha, A.data(), lda,
-                                        B.data(), ldb, beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during GEMMT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of GEMMT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(C, C_ref, layout, upper_lower, n, n, ldc, 10 * k, std::cout);
-
-    return (int)good;
-}
-
-class GemmtUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(GemmtUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                  beta));
-}
-
-TEST_P(GemmtUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-                                   beta));
-}
-
-TEST_P(GemmtUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0);
-    std::complex<float> beta(3.0);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-}
-
-TEST_P(GemmtUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0);
-    std::complex<double> beta(3.0);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103, alpha,
-        beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::trans, 27, 98, 101, 102, 103,
-        alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::conjtrans, 27, 98, 101, 102, 103,
-        alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmtUsmTestSuite, GemmtUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/imatcopy.cpp b/tests/unit_tests/blas/extensions/imatcopy.cpp
deleted file mode 100644
index e21702775..000000000
--- a/tests/unit_tests/blas/extensions/imatcopy.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i, tmp;
-
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t size_a, size_b, size;
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = lda * n;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = lda * m;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-    size = std::max(size_a, size_b);
-
-    vector<fp, allocator_helper<fp, 64>> AB(size), AB_ref(size);
-
-    rand_matrix(AB, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size, 1,
-                size);
-    copy_matrix(AB, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size, 1, size,
-                AB_ref);
-
-    // Call reference IMATCOPY.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    imatcopy_ref(layout, trans, m_ref, n_ref, alpha, AB_ref.data(), lda_ref, ldb_ref);
-
-    // Call DPC++ IMATCOPY
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during IMATCOPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> AB_buffer(AB.data(), range<1>(AB.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::imatcopy(main_queue, trans, m, n, alpha, AB_buffer,
-                                                          lda, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::imatcopy(main_queue, trans, m, n, alpha, AB_buffer,
-                                                       lda, ldb);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::imatcopy,
-                                        trans, m, n, alpha, AB_buffer, lda, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::imatcopy, trans,
-                                        m, n, alpha, AB_buffer, lda, ldb);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during IMATCOPY:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of IMATCOPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto AB_accessor = AB_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(AB_accessor, AB_ref, oneapi::mkl::layout::col_major, size, 1,
-                                   size, 10, std::cout);
-
-    return (int)good;
-}
-
-class ImatcopyTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(ImatcopyTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(ImatcopyTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(ImatcopyTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(ImatcopyTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(ImatcopyTestSuite, ImatcopyTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/imatcopy_usm.cpp b/tests/unit_tests/blas/extensions/imatcopy_usm.cpp
deleted file mode 100644
index dc3d43d2e..000000000
--- a/tests/unit_tests/blas/extensions/imatcopy_usm.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during IMATCOPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i;
-
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t size_a, size_b, size;
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = lda * n;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = lda * m;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-    size = std::max(size_a, size_b);
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> AB(ua), AB_ref(ua);
-
-    AB.resize(size);
-    AB_ref.resize(size);
-
-    rand_matrix(AB, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size, 1,
-                size);
-    copy_matrix(AB, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size, 1, size,
-                AB_ref);
-
-    // Call reference IMATCOPY.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    imatcopy_ref(layout, trans, m_ref, n_ref, alpha, AB_ref.data(), lda_ref, ldb_ref);
-
-    // Call DPC++ IMATCOPY
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::imatcopy(main_queue, trans, m, n, alpha,
-                                                                 &AB[0], lda, ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::imatcopy(main_queue, trans, m, n, alpha,
-                                                              &AB[0], lda, ldb, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::imatcopy,
-                                        trans, m, n, alpha, &AB[0], lda, ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::imatcopy, trans,
-                                        m, n, alpha, &AB[0], lda, ldb, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during IMATCOPY:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of IMATCOPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(AB, AB_ref, oneapi::mkl::layout::col_major, size, 1, size, 10,
-                                   std::cout);
-
-    return (int)good;
-}
-
-class ImatcopyUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(ImatcopyUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(ImatcopyUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(ImatcopyUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(ImatcopyUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(ImatcopyUsmTestSuite, ImatcopyUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/omatadd.cpp b/tests/unit_tests/blas/extensions/omatadd.cpp
deleted file mode 100644
index b2af98935..000000000
--- a/tests/unit_tests/blas/extensions/omatadd.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb, ldc;
-    oneapi::mkl::transpose transa, transb;
-    fp alpha, beta;
-    int64_t i, tmp;
-
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    ldc = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-    transa = rand_trans<fp>();
-    transb = rand_trans<fp>();
-
-    int64_t size_a, size_b, size_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * n : lda * m;
-            size_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            size_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * m : lda * n;
-            size_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            size_c = ldc * m;
-            break;
-        default: break;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(size_a), B(size_b), C(size_c), C_ref(size_c);
-
-    rand_matrix(A.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_a,
-                1, size_a);
-    rand_matrix(B.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_b,
-                1, size_b);
-    rand_matrix(C.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_c,
-                1, size_c);
-    copy_matrix(C.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_c,
-                1, size_c, C_ref.data());
-
-    // Call reference OMATADD.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int ldc_ref = (int)ldc;
-    omatadd_ref(layout, transa, transb, m_ref, n_ref, alpha, A.data(), lda_ref, beta, B.data(),
-                ldb_ref, C_ref.data(), ldc_ref);
-
-    // Call DPC++ OMATADD
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATADD:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::omatadd(main_queue, transa, transb, m, n, alpha,
-                                                         A_buffer, lda, beta, B_buffer, ldb,
-                                                         C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::omatadd(main_queue, transa, transb, m, n, alpha,
-                                                      A_buffer, lda, beta, B_buffer, ldb, C_buffer,
-                                                      ldc);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatadd,
-                                        transa, transb, m, n, alpha, A_buffer, lda, beta, B_buffer,
-                                        ldb, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatadd, transa,
-                                        transb, m, n, alpha, A_buffer, lda, beta, B_buffer, ldb,
-                                        C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATADD:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATADD:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(C_accessor, C_ref, oneapi::mkl::layout::col_major, size_c, 1,
-                                   size_c, 10, std::cout);
-
-    return (int)good;
-}
-
-class OmataddTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmataddTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmataddTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmataddTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmataddTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmataddTestSuite, OmataddTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/omatadd_usm.cpp b/tests/unit_tests/blas/extensions/omatadd_usm.cpp
deleted file mode 100644
index 783f985b2..000000000
--- a/tests/unit_tests/blas/extensions/omatadd_usm.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATADD:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb, ldc;
-    oneapi::mkl::transpose transa, transb;
-    fp alpha, beta;
-    int64_t i, tmp;
-
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    ldc = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    beta = rand_scalar<fp>();
-    transa = rand_trans<fp>();
-    transb = rand_trans<fp>();
-
-    int64_t size_a, size_b, size_c;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * n : lda * m;
-            size_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            size_c = ldc * n;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = (transa == oneapi::mkl::transpose::nontrans) ? lda * m : lda * n;
-            size_b = (transb == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            size_c = ldc * m;
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), C(ua), C_ref(ua);
-
-    A.resize(size_a);
-    B.resize(size_b);
-    C.resize(size_c);
-    C_ref.resize(size_c);
-
-    rand_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_a, 1,
-                size_a);
-    rand_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_b, 1,
-                size_b);
-    rand_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_c, 1,
-                size_c);
-    copy_matrix(C, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_c, 1,
-                size_c, C_ref);
-
-    // Call reference OMATADD.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    int ldc_ref = (int)ldc;
-    omatadd_ref(layout, transa, transb, m_ref, n_ref, alpha, A.data(), lda_ref, beta, B.data(),
-                ldb_ref, C_ref.data(), ldc_ref);
-
-    // Call DPC++ OMATADD
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::omatadd(main_queue, transa, transb, m, n,
-                                                                alpha, &A[0], lda, beta, &B[0], ldb,
-                                                                &C[0], ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::omatadd(main_queue, transa, transb, m, n,
-                                                             alpha, &A[0], lda, beta, &B[0], ldb,
-                                                             &C[0], ldc, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatadd,
-                                        transa, transb, m, n, alpha, &A[0], lda, beta, &B[0], ldb,
-                                        &C[0], ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatadd, transa,
-                                        transb, m, n, alpha, &A[0], lda, beta, &B[0], ldb, &C[0],
-                                        ldc, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATADD:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATADD:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(C, C_ref, oneapi::mkl::layout::col_major, size_c, 1, size_c, 10,
-                                   std::cout);
-
-    return (int)good;
-}
-
-class OmataddUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmataddUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmataddUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmataddUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmataddUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmataddUsmTestSuite, OmataddUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/omatcopy.cpp b/tests/unit_tests/blas/extensions/omatcopy.cpp
deleted file mode 100644
index 122ba2c79..000000000
--- a/tests/unit_tests/blas/extensions/omatcopy.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i, tmp;
-
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        trans = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-    else {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            trans = oneapi::mkl::transpose::conjtrans;
-        else
-            trans = (oneapi::mkl::transpose)tmp;
-    }
-
-    int64_t size_a, size_b;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = lda * n;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = lda * m;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(size_a), B(size_b), B_ref(size_b);
-
-    rand_matrix(A.data(), layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-    rand_matrix(B.data(), layout, trans, m, n, ldb);
-
-    // Call reference OMATCOPY.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    omatcopy_ref(layout, trans, m_ref, n_ref, alpha, A.data(), lda_ref, B_ref.data(), ldb_ref);
-
-    // Call DPC++ OMATCOPY
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::omatcopy(main_queue, trans, m, n, alpha, A_buffer,
-                                                          lda, B_buffer, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::omatcopy(main_queue, trans, m, n, alpha, A_buffer,
-                                                       lda, B_buffer, ldb);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy,
-                                        trans, m, n, alpha, A_buffer, lda, B_buffer, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy, trans,
-                                        m, n, alpha, A_buffer, lda, B_buffer, ldb);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATCOPY:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATCOPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto B_accessor = B_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(B_accessor, B_ref, oneapi::mkl::layout::col_major, size_b, 1,
-                                   size_b, 10, std::cout);
-
-    return (int)good;
-}
-
-class OmatcopyTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmatcopyTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmatcopyTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmatcopyTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmatcopyTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmatcopyTestSuite, OmatcopyTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/omatcopy2.cpp b/tests/unit_tests/blas/extensions/omatcopy2.cpp
deleted file mode 100644
index d0407c324..000000000
--- a/tests/unit_tests/blas/extensions/omatcopy2.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    int64_t stride_a, stride_b;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-
-    stride_a = 1 + std::rand() % 50;
-    stride_b = 1 + std::rand() % 50;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = stride_a * (std::max(m, n) - 1) + 1;
-    ldb = stride_b * (std::max(m, n) - 1) + 1;
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t size_a, size_b;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = lda * n;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = lda * m;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-
-    vector<fp, allocator_helper<fp, 64>> A(size_a), B(size_b), B_ref(size_b);
-
-    rand_matrix(A.data(), layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-    rand_matrix(B.data(), layout, trans, m, n, ldb);
-    copy_matrix(B.data(), oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_b,
-                1, size_b, B_ref.data());
-
-    // Call reference OMATCOPY2.
-    int64_t m_ref = m;
-    int64_t n_ref = n;
-    int64_t lda_ref = lda;
-    int64_t ldb_ref = ldb;
-    int64_t stride_a_ref = stride_a;
-    int64_t stride_b_ref = stride_b;
-    omatcopy2_ref(layout, trans, m_ref, n_ref, alpha, A.data(), lda_ref, stride_a_ref, B_ref.data(),
-                  ldb_ref, stride_b_ref);
-
-    // Call DPC++ OMATCOPY2
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::omatcopy2(main_queue, trans, m, n, alpha, A_buffer,
-                                                           lda, stride_a, B_buffer, ldb, stride_b);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::omatcopy2(main_queue, trans, m, n, alpha, A_buffer,
-                                                        lda, stride_a, B_buffer, ldb, stride_b);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy2,
-                                        trans, m, n, alpha, A_buffer, lda, stride_a, B_buffer, ldb,
-                                        stride_b);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy2, trans,
-                                        m, n, alpha, A_buffer, lda, stride_a, B_buffer, ldb,
-                                        stride_b);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATCOPY2:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATCOPY2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto B_accessor = B_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(B_accessor, B_ref, oneapi::mkl::layout::col_major, size_b, 1,
-                                   size_b, 10, std::cout);
-
-    return (int)good;
-}
-
-class Omatcopy2Tests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(Omatcopy2Tests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(Omatcopy2Tests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(Omatcopy2Tests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(Omatcopy2Tests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Omatcopy2TestSuite, Omatcopy2Tests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/omatcopy2_usm.cpp b/tests/unit_tests/blas/extensions/omatcopy2_usm.cpp
deleted file mode 100644
index d2103d243..000000000
--- a/tests/unit_tests/blas/extensions/omatcopy2_usm.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    int64_t stride_a, stride_b;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-
-    stride_a = 1 + std::rand() % 50;
-    stride_b = 1 + std::rand() % 50;
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = stride_a * (std::max(m, n) - 1) + 1;
-    ldb = stride_b * (std::max(m, n) - 1) + 1;
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t size_a, size_b;
-
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = lda * n;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = lda * m;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), B_ref(ua);
-
-    A.resize(size_a);
-    B.resize(size_b);
-    B_ref.resize(size_b);
-
-    rand_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_a, 1,
-                size_a);
-    rand_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_b, 1,
-                size_b);
-    copy_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_b, 1,
-                size_b, B_ref);
-
-    // Call reference OMATCOPY2.
-    int64_t m_ref = m;
-    int64_t n_ref = n;
-    int64_t lda_ref = lda;
-    int64_t ldb_ref = ldb;
-    int64_t stride_a_ref = stride_a;
-    int64_t stride_b_ref = stride_b;
-    omatcopy2_ref(layout, trans, m_ref, n_ref, alpha, A.data(), lda_ref, stride_a_ref, B_ref.data(),
-                  ldb_ref, stride_b_ref);
-
-    // Call DPC++ OMATCOPY2
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::omatcopy2(main_queue, trans, m, n, alpha,
-                                                                  &A[0], lda, stride_a, &B[0], ldb,
-                                                                  stride_b, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::omatcopy2(main_queue, trans, m, n, alpha,
-                                                               &A[0], lda, stride_a, &B[0], ldb,
-                                                               stride_b, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy2,
-                                        trans, m, n, alpha, &A[0], lda, stride_a, &B[0], ldb,
-                                        stride_b, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy2, trans,
-                                        m, n, alpha, &A[0], lda, stride_a, &B[0], ldb, stride_b,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATCOPY2:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATCOPY2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(B, B_ref, oneapi::mkl::layout::col_major, size_b, 1, size_b, 10,
-                                   std::cout);
-
-    return (int)good;
-}
-
-class Omatcopy2UsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(Omatcopy2UsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(Omatcopy2UsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(Omatcopy2UsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(Omatcopy2UsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Omatcopy2UsmTestSuite, Omatcopy2UsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/extensions/omatcopy_usm.cpp b/tests/unit_tests/blas/extensions/omatcopy_usm.cpp
deleted file mode 100644
index ac9ba2d5c..000000000
--- a/tests/unit_tests/blas/extensions/omatcopy_usm.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during OMATCOPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    int64_t m, n;
-    int64_t lda, ldb;
-    oneapi::mkl::transpose trans;
-    fp alpha;
-    int64_t i, tmp;
-
-    m = 1 + std::rand() % 50;
-    n = 1 + std::rand() % 50;
-    lda = std::max(m, n);
-    ldb = std::max(m, n);
-    alpha = rand_scalar<fp>();
-    trans = rand_trans<fp>();
-
-    int64_t size_a, size_b;
-    switch (layout) {
-        case oneapi::mkl::layout::col_major:
-            size_a = lda * n;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * n : ldb * m;
-            break;
-        case oneapi::mkl::layout::row_major:
-            size_a = lda * m;
-            size_b = (trans == oneapi::mkl::transpose::nontrans) ? ldb * m : ldb * n;
-            break;
-        default: break;
-    }
-
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), B_ref(ua);
-
-    A.resize(size_a);
-    B.resize(size_b);
-    B_ref.resize(size_b);
-
-    rand_matrix(A, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_a, 1,
-                size_a);
-    rand_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_b, 1,
-                size_b);
-    copy_matrix(B, oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_b, 1,
-                size_b, B_ref);
-
-    // Call reference OMATCOPY.
-    int m_ref = (int)m;
-    int n_ref = (int)n;
-    int lda_ref = (int)lda;
-    int ldb_ref = (int)ldb;
-    omatcopy_ref(layout, trans, m_ref, n_ref, alpha, A.data(), lda_ref, B_ref.data(), ldb_ref);
-
-    // Call DPC++ OMATCOPY
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::omatcopy(
-                    main_queue, trans, m, n, alpha, &A[0], lda, &B[0], ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::omatcopy(main_queue, trans, m, n, alpha, &A[0],
-                                                              lda, &B[0], ldb, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy,
-                                        trans, m, n, alpha, &A[0], lda, &B[0], ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy, trans,
-                                        m, n, alpha, &A[0], lda, &B[0], ldb, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during OMATCOPY:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of OMATCOPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_matrix(B, B_ref, oneapi::mkl::layout::col_major, size_b, 1, size_b, 10,
-                                   std::cout);
-
-    return (int)good;
-}
-
-class OmatcopyUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(OmatcopyUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmatcopyUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmatcopyUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-TEST_P(OmatcopyUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(OmatcopyUsmTestSuite, OmatcopyUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/include/allocator_helper.hpp b/tests/unit_tests/blas/include/allocator_helper.hpp
deleted file mode 100644
index 79fd22254..000000000
--- a/tests/unit_tests/blas/include/allocator_helper.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef __ALLOCATOR_HELPER_HPP
-#define __ALLOCATOR_HELPER_HPP
-
-#include <stdlib.h>
-#include <cstddef>
-#include <limits>
-#include <type_traits>
-#include "test_helper.hpp"
-
-template <typename T, int align>
-struct allocator_helper {
-    typedef T* pointer;
-    typedef const T* const_pointer;
-    typedef void* void_pointer;
-    typedef const void* const_void_pointer;
-    typedef T value_type;
-    typedef size_t size_type;
-    typedef ptrdiff_t difference_type;
-
-    template <typename U>
-    struct rebind {
-        typedef allocator_helper<U, align> other;
-    };
-
-    allocator_helper() noexcept {}
-    template <typename U, int align2>
-    allocator_helper(allocator_helper<U, align2>& other) noexcept {}
-    template <typename U, int align2>
-    allocator_helper(allocator_helper<U, align2>&& other) noexcept {}
-
-    T* allocate(size_t n) {
-        void* mem = oneapi::mkl::aligned_alloc(align, n * sizeof(T));
-        if (!mem)
-            throw std::bad_alloc();
-
-        return static_cast<T*>(mem);
-    }
-
-    void deallocate(T* p, size_t n) noexcept {
-        oneapi::mkl::aligned_free(p);
-    }
-
-    constexpr size_t max_size() const noexcept {
-        return std::numeric_limits<size_t>::max() / sizeof(T);
-    }
-
-    template <typename U, int align2>
-    constexpr bool operator==(const allocator_helper<U, align2>) const noexcept {
-        return true;
-    }
-    template <typename U, int align2>
-    constexpr bool operator!=(const allocator_helper<U, align2>) const noexcept {
-        return false;
-    }
-
-    typedef std::true_type is_always_equal;
-};
-
-#endif
diff --git a/tests/unit_tests/blas/include/onemkl_blas_helper.hpp b/tests/unit_tests/blas/include/onemkl_blas_helper.hpp
deleted file mode 100644
index 5489aaa61..000000000
--- a/tests/unit_tests/blas/include/onemkl_blas_helper.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef ONEMKL_BLAS_HELPER_HPP
-#define ONEMKL_BLAS_HELPER_HPP
-
-#include "cblas.h"
-
-#include "oneapi/mkl/types.hpp"
-
-typedef enum { CblasRowOffset = 101, CblasColOffset = 102, CblasFixOffset = 103 } CBLAS_OFFSET;
-
-/**
- * Helper methods for converting between onemkl types and their BLAS
- * equivalents.
- */
-
-inline CBLAS_TRANSPOSE convert_to_cblas_trans(oneapi::mkl::transpose trans) {
-    if (trans == oneapi::mkl::transpose::trans)
-        return CBLAS_TRANSPOSE::CblasTrans;
-    else if (trans == oneapi::mkl::transpose::conjtrans)
-        return CBLAS_TRANSPOSE::CblasConjTrans;
-    else
-        return CBLAS_TRANSPOSE::CblasNoTrans;
-}
-
-inline CBLAS_UPLO convert_to_cblas_uplo(oneapi::mkl::uplo is_upper) {
-    return is_upper == oneapi::mkl::uplo::upper ? CBLAS_UPLO::CblasUpper : CBLAS_UPLO::CblasLower;
-}
-
-inline CBLAS_DIAG convert_to_cblas_diag(oneapi::mkl::diag is_unit) {
-    return is_unit == oneapi::mkl::diag::unit ? CBLAS_DIAG::CblasUnit : CBLAS_DIAG::CblasNonUnit;
-}
-
-inline CBLAS_SIDE convert_to_cblas_side(oneapi::mkl::side is_left) {
-    return is_left == oneapi::mkl::side::left ? CBLAS_SIDE::CblasLeft : CBLAS_SIDE::CblasRight;
-}
-
-inline CBLAS_OFFSET convert_to_cblas_offset(oneapi::mkl::offset offsetc) {
-    if (offsetc == oneapi::mkl::offset::fix)
-        return CBLAS_OFFSET::CblasFixOffset;
-    else if (offsetc == oneapi::mkl::offset::column)
-        return CBLAS_OFFSET::CblasColOffset;
-    else
-        return CBLAS_OFFSET::CblasRowOffset;
-}
-
-inline CBLAS_LAYOUT convert_to_cblas_layout(oneapi::mkl::layout is_column) {
-    return is_column == oneapi::mkl::layout::col_major ? CBLAS_LAYOUT::CblasColMajor
-                                                       : CBLAS_LAYOUT::CblasRowMajor;
-}
-
-static const CBLAS_TRANSPOSE fcblastrans[] = { CblasNoTrans, CblasTrans, CblasConjTrans };
-
-static const CBLAS_UPLO fcblasuplo[] = { CblasUpper, CblasLower };
-
-static const CBLAS_SIDE fcblasside[] = { CblasLeft, CblasRight };
-
-static const CBLAS_DIAG fcblasdiag[] = { CblasNonUnit, CblasUnit };
-
-static const CBLAS_TRANSPOSE fcblastrans_r[] = { CblasTrans, CblasNoTrans, CblasNoTrans };
-
-static const CBLAS_TRANSPOSE fcblastrans_r2[] = { CblasTrans, CblasNoTrans, CblasConjTrans };
-
-static const CBLAS_TRANSPOSE fcblastrans_c[] = { CblasConjTrans, CblasNoTrans, CblasNoTrans };
-
-static const CBLAS_OFFSET fcblasoffset[] = { CblasColOffset, CblasRowOffset, CblasFixOffset };
-
-#endif // ONEMKL_BLAS_HELPER_HPP
diff --git a/tests/unit_tests/blas/include/reference_blas_templates.hpp b/tests/unit_tests/blas/include/reference_blas_templates.hpp
deleted file mode 100644
index 6d184ba75..000000000
--- a/tests/unit_tests/blas/include/reference_blas_templates.hpp
+++ /dev/null
@@ -1,2183 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _REFERENCE_BLAS_TEMPLATES_HPP__
-#define _REFERENCE_BLAS_TEMPLATES_HPP__
-
-#include <stdlib.h>
-#include <complex>
-#include <cstdint>
-#include "cblas.h"
-#include "oneapi/mkl/types.hpp"
-#include "test_helper.hpp"
-#include "reference_blas_wrappers.hpp"
-
-inline bool isNonTranspose(CBLAS_TRANSPOSE trans) {
-    return trans == CblasNoTrans;
-}
-
-template <typename T_src, typename T_dest>
-static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int row,
-                            int col, int ld, T_dest *&dest) {
-    int i, j, Iend, Jend;
-    if (layout == CblasColMajor) {
-        Jend = isNonTranspose(trans) ? col : row;
-        Iend = isNonTranspose(trans) ? row : col;
-    }
-    else {
-        Jend = isNonTranspose(trans) ? row : col;
-        Iend = isNonTranspose(trans) ? col : row;
-    }
-
-    for (j = 0; j < Jend; j++) {
-        for (i = 0; i < Iend; i++) {
-            dest[i + ld * j] = (T_dest)src[i + ld * j];
-        }
-    }
-}
-
-template <typename T_src, typename T_dest>
-static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int row,
-                            int col, int ld, T_dest off, T_dest *&dest) {
-    int i, j, Iend, Jend;
-    if (layout == CblasColMajor) {
-        Jend = isNonTranspose(trans) ? col : row;
-        Iend = isNonTranspose(trans) ? row : col;
-    }
-    else {
-        Jend = isNonTranspose(trans) ? row : col;
-        Iend = isNonTranspose(trans) ? col : row;
-    }
-
-    for (j = 0; j < Jend; j++) {
-        for (i = 0; i < Iend; i++) {
-            dest[i + ld * j] = (T_dest)src[i + ld * j] - off;
-        }
-    }
-}
-
-template <typename T_src, typename T_dest, typename T_off>
-static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, int row, int col, int ld,
-                            CBLAS_OFFSET off_kind, T_off off, T_dest &dest) {
-    using T_data = typename std::remove_reference<decltype(dest[0])>::type;
-    int i, j;
-    T_data tmp;
-
-    int Jend = (layout == CblasColMajor) ? col : row;
-    int Iend = (layout == CblasColMajor) ? row : col;
-
-    if (off_kind == CblasFixOffset) {
-        tmp = off[0];
-        for (j = 0; j < Jend; j++) {
-            for (i = 0; i < Iend; i++) {
-                dest[i + ld * j] = tmp + (T_data)src[i + ld * j];
-            }
-        }
-    }
-    else if (((off_kind == CblasColOffset) && (layout == CblasColMajor)) ||
-             ((off_kind == CblasRowOffset) && (layout == CblasRowMajor))) {
-        for (j = 0; j < Jend; j++) {
-            for (i = 0; i < Iend; i++) {
-                tmp = off[i];
-                dest[i + ld * j] = tmp + (T_data)src[i + ld * j];
-            }
-        }
-    }
-    else {
-        for (j = 0; j < Jend; j++) {
-            tmp = off[j];
-            for (i = 0; i < Iend; i++) {
-                dest[i + ld * j] = tmp + (T_data)src[i + ld * j];
-            }
-        }
-    }
-}
-
-template <typename T_src, typename T_desc>
-static inline void update_c(T_src &src, CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, int row,
-                            int col, int ld, T_desc *&dest) {
-    int i, j;
-
-    int Jend = (layout == CblasColMajor) ? col : row;
-    int Iend = (layout == CblasColMajor) ? row : col;
-
-    for (j = 0; j < Jend; j++) {
-        for (i = 0; i < Iend; i++) {
-            if (((upper_lower == CblasUpper) && (layout == CblasColMajor)) ||
-                ((upper_lower == CblasLower) && (layout == CblasRowMajor))) {
-                if (j >= i)
-                    dest[i + ld * j] = (T_desc)src[i + ld * j];
-                else
-                    dest[i + ld * j] = (T_desc)0.0;
-            }
-            else {
-                if (j <= i)
-                    dest[i + ld * j] = (T_desc)src[i + ld * j];
-                else
-                    dest[i + ld * j] = (T_desc)0.0;
-            }
-        }
-    }
-}
-
-/* Level 3 */
-
-template <typename fp>
-static void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-                 const int *n, const int *k, const fp *alpha, const fp *a, const int *lda,
-                 const fp *b, const int *ldb, const fp *beta, fp *c, const int *ldc);
-
-template <>
-void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-          const int *n, const int *k, const sycl::half *alpha, const sycl::half *a, const int *lda,
-          const sycl::half *b, const int *ldb, const sycl::half *beta, sycl::half *c,
-          const int *ldc) {
-    // Not supported in NETLIB. SGEMM is used as reference.
-    int sizea, sizeb, sizec;
-    const float alphaf = *alpha;
-    const float betaf = *beta;
-    if (layout == CblasColMajor) {
-        sizea = (transa == CblasNoTrans) ? *lda * *k : *lda * *m;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *n : *ldb * *k;
-        sizec = *ldc * *n;
-    }
-    else {
-        sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n;
-        sizec = *ldc * *m;
-    }
-    float *af = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea);
-    float *bf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb);
-    float *cf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizec);
-    copy_mat(a, layout, transa, *m, *k, *lda, af);
-    copy_mat(b, layout, transb, *k, *n, *ldb, bf);
-    copy_mat(c, layout, CblasNoTrans, *m, *n, *ldc, cf);
-    cblas_sgemm_wrapper(layout, transa, transb, *m, *n, *k, alphaf, af, *lda, bf, *ldb, betaf, cf,
-                        *ldc);
-    copy_mat(cf, layout, CblasNoTrans, *m, *n, *ldc, c);
-    oneapi::mkl::aligned_free(af);
-    oneapi::mkl::aligned_free(bf);
-    oneapi::mkl::aligned_free(cf);
-}
-
-template <>
-void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-          const int *n, const int *k, const float *alpha, const float *a, const int *lda,
-          const float *b, const int *ldb, const float *beta, float *c, const int *ldc) {
-    cblas_sgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c,
-                        *ldc);
-}
-
-template <>
-void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-          const int *n, const int *k, const double *alpha, const double *a, const int *lda,
-          const double *b, const int *ldb, const double *beta, double *c, const int *ldc) {
-    cblas_dgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c,
-                        *ldc);
-}
-
-template <>
-void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-          const int *n, const int *k, const std::complex<float> *alpha,
-          const std::complex<float> *a, const int *lda, const std::complex<float> *b,
-          const int *ldb, const std::complex<float> *beta, std::complex<float> *c, const int *ldc) {
-    cblas_cgemm_wrapper(layout, transa, transb, *m, *n, *k, (const void *)alpha, (const void *)a,
-                        *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <>
-void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-          const int *n, const int *k, const std::complex<double> *alpha,
-          const std::complex<double> *a, const int *lda, const std::complex<double> *b,
-          const int *ldb, const std::complex<double> *beta, std::complex<double> *c,
-          const int *ldc) {
-    cblas_zgemm_wrapper(layout, transa, transb, *m, *n, *k, (const void *)alpha, (const void *)a,
-                        *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <typename fpa, typename fpc>
-static void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-                 const int *n, const int *k, const fpc *alpha, const fpa *a, const int *lda,
-                 const fpa *b, const int *ldb, const fpc *beta, fpc *c, const int *ldc);
-
-template <>
-void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-          const int *n, const int *k, const float *alpha, const sycl::half *a, const int *lda,
-          const sycl::half *b, const int *ldb, const float *beta, float *c, const int *ldc) {
-    // Not supported in NETLIB. SGEMM is used as reference.
-    int sizea, sizeb;
-    if (layout == CblasColMajor) {
-        sizea = (transa == CblasNoTrans) ? *lda * *k : *lda * *m;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *n : *ldb * *k;
-    }
-    else {
-        sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n;
-    }
-    float *af = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea);
-    float *bf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb);
-    copy_mat(a, layout, transa, *m, *k, *lda, af);
-    copy_mat(b, layout, transb, *k, *n, *ldb, bf);
-    cblas_sgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, af, *lda, bf, *ldb, *beta, c,
-                        *ldc);
-    oneapi::mkl::aligned_free(af);
-    oneapi::mkl::aligned_free(bf);
-}
-
-template <>
-void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m,
-          const int *n, const int *k, const float *alpha, const oneapi::mkl::bfloat16 *a,
-          const int *lda, const oneapi::mkl::bfloat16 *b, const int *ldb, const float *beta,
-          float *c, const int *ldc) {
-    // Not supported in NETLIB. SGEMM is used as reference.
-    int sizea, sizeb;
-    if (layout == CblasColMajor) {
-        sizea = (transa == CblasNoTrans) ? *lda * *k : *lda * *m;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *n : *ldb * *k;
-    }
-    else {
-        sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n;
-    }
-    float *af = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea);
-    float *bf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb);
-    copy_mat(a, layout, transa, *m, *k, *lda, af);
-    copy_mat(b, layout, transb, *k, *n, *ldb, bf);
-    cblas_sgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, af, *lda, bf, *ldb, *beta, c,
-                        *ldc);
-    oneapi::mkl::aligned_free(af);
-    oneapi::mkl::aligned_free(bf);
-}
-
-template <typename fp>
-static void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m,
-                 const int *n, const fp *alpha, const fp *a, const int *lda, const fp *b,
-                 const int *ldb, const fp *beta, fp *c, const int *ldc);
-
-template <>
-void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n,
-          const float *alpha, const float *a, const int *lda, const float *b, const int *ldb,
-          const float *beta, float *c, const int *ldc) {
-    cblas_ssymm_wrapper(layout, left_right, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc);
-}
-
-template <>
-void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n,
-          const double *alpha, const double *a, const int *lda, const double *b, const int *ldb,
-          const double *beta, double *c, const int *ldc) {
-    cblas_dsymm_wrapper(layout, left_right, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc);
-}
-
-template <>
-void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n,
-          const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-          const std::complex<float> *b, const int *ldb, const std::complex<float> *beta,
-          std::complex<float> *c, const int *ldc) {
-    cblas_csymm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a,
-                        *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <>
-void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n,
-          const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-          const std::complex<double> *b, const int *ldb, const std::complex<double> *beta,
-          std::complex<double> *c, const int *ldc) {
-    cblas_zsymm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a,
-                        *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <typename fp>
-static void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n,
-                 const int *k, const fp *alpha, const fp *a, const int *lda, const fp *beta, fp *c,
-                 const int *ldc);
-
-template <>
-void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-          const float *alpha, const float *a, const int *lda, const float *beta, float *c,
-          const int *ldc) {
-    cblas_ssyrk_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc);
-}
-
-template <>
-void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-          const double *alpha, const double *a, const int *lda, const double *beta, double *c,
-          const int *ldc) {
-    cblas_dsyrk_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc);
-}
-
-template <>
-void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-          const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-          const std::complex<float> *beta, std::complex<float> *c, const int *ldc) {
-    cblas_csyrk_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)beta, (void *)c, *ldc);
-}
-
-template <>
-void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-          const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-          const std::complex<double> *beta, std::complex<double> *c, const int *ldc) {
-    cblas_zsyrk_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)beta, (void *)c, *ldc);
-}
-
-template <typename fp>
-static void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m,
-                 const int *n, const fp *alpha, const fp *a, const int *lda, const fp *b,
-                 const int *ldb, const fp *beta, fp *c, const int *ldc);
-
-template <>
-void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n,
-          const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-          const std::complex<float> *b, const int *ldb, const std::complex<float> *beta,
-          std::complex<float> *c, const int *ldc) {
-    cblas_chemm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a,
-                        *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <>
-void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n,
-          const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-          const std::complex<double> *b, const int *ldb, const std::complex<double> *beta,
-          std::complex<double> *c, const int *ldc) {
-    cblas_zhemm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a,
-                        *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <typename fp_scalar, typename fp_data>
-static void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n,
-                 const int *k, const fp_scalar *alpha, const fp_data *a, const int *lda,
-                 const fp_scalar *beta, fp_data *c, const int *ldc);
-
-template <>
-void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-          const float *alpha, const std::complex<float> *a, const int *lda, const float *beta,
-          std::complex<float> *c, const int *ldc) {
-    cblas_cherk_wrapper(layout, uplo, trans, *n, *k, *alpha, (const void *)a, *lda, *beta,
-                        (void *)c, *ldc);
-}
-
-template <>
-void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-          const double *alpha, const std::complex<double> *a, const int *lda, const double *beta,
-          std::complex<double> *c, const int *ldc) {
-    cblas_zherk_wrapper(layout, uplo, trans, *n, *k, *alpha, (const void *)a, *lda, *beta,
-                        (void *)c, *ldc);
-}
-
-template <typename fp>
-static void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n,
-                  const int *k, const fp *alpha, const fp *a, const int *lda, const fp *b,
-                  const int *ldb, const fp *beta, fp *c, const int *ldc);
-
-template <>
-void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-           const float *alpha, const float *a, const int *lda, const float *b, const int *ldb,
-           const float *beta, float *c, const int *ldc) {
-    cblas_ssyr2k_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc);
-}
-
-template <>
-void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-           const double *alpha, const double *a, const int *lda, const double *b, const int *ldb,
-           const double *beta, double *c, const int *ldc) {
-    cblas_dsyr2k_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc);
-}
-
-template <>
-void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-           const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-           const std::complex<float> *b, const int *ldb, const std::complex<float> *beta,
-           std::complex<float> *c, const int *ldc) {
-    cblas_csyr2k_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                         (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <>
-void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-           const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-           const std::complex<double> *b, const int *ldb, const std::complex<double> *beta,
-           std::complex<double> *c, const int *ldc) {
-    cblas_zsyr2k_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                         (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc);
-}
-
-template <typename fp_scalar, typename fp_data>
-static void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n,
-                  const int *k, const fp_data *alpha, const fp_data *a, const int *lda,
-                  const fp_data *b, const int *ldb, const fp_scalar *beta, fp_data *c,
-                  const int *ldc);
-
-template <>
-void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-           const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-           const std::complex<float> *b, const int *ldb, const float *beta, std::complex<float> *c,
-           const int *ldc) {
-    cblas_cher2k_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                         (const void *)b, *ldb, *beta, (void *)c, *ldc);
-}
-
-template <>
-void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k,
-           const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-           const std::complex<double> *b, const int *ldb, const double *beta,
-           std::complex<double> *c, const int *ldc) {
-    cblas_zher2k_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                         (const void *)b, *ldb, *beta, (void *)c, *ldc);
-}
-
-template <typename fp>
-static void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-                 CBLAS_DIAG diag, const int *m, const int *n, const fp *alpha, const fp *a,
-                 const int *lda, fp *b, const int *ldb);
-
-template <>
-void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const float *alpha, const float *a,
-          const int *lda, float *b, const int *ldb) {
-    cblas_strmm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb);
-}
-
-template <>
-void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const double *alpha, const double *a,
-          const int *lda, double *b, const int *ldb) {
-    cblas_dtrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb);
-}
-
-template <>
-void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const std::complex<float> *alpha,
-          const std::complex<float> *a, const int *lda, std::complex<float> *b, const int *ldb) {
-    cblas_ctrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha,
-                        (const void *)a, *lda, (void *)b, *ldb);
-}
-
-template <>
-void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const std::complex<double> *alpha,
-          const std::complex<double> *a, const int *lda, std::complex<double> *b, const int *ldb) {
-    cblas_ztrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha,
-                        (const void *)a, *lda, (void *)b, *ldb);
-}
-
-template <typename fp>
-static void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-                 CBLAS_DIAG diag, const int *m, const int *n, const fp *alpha, const fp *a,
-                 const int *lda, fp *b, const int *ldb);
-
-template <>
-void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const float *alpha, const float *a,
-          const int *lda, float *b, const int *ldb) {
-    cblas_strsm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb);
-}
-
-template <>
-void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const double *alpha, const double *a,
-          const int *lda, double *b, const int *ldb) {
-    cblas_dtrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb);
-}
-
-template <>
-void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const std::complex<float> *alpha,
-          const std::complex<float> *a, const int *lda, std::complex<float> *b, const int *ldb) {
-    cblas_ctrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha,
-                        (const void *)a, *lda, (void *)b, *ldb);
-}
-
-template <>
-void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-          CBLAS_DIAG diag, const int *m, const int *n, const std::complex<double> *alpha,
-          const std::complex<double> *a, const int *lda, std::complex<double> *b, const int *ldb) {
-    cblas_ztrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha,
-                        (const void *)a, *lda, (void *)b, *ldb);
-}
-
-/* Level 2 */
-
-template <typename fp>
-static void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n,
-                 const fp *alpha, const fp *a, const int *lda, const fp *x, const int *incx,
-                 const fp *beta, fp *y, const int *incy);
-
-template <>
-void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n,
-          const float *alpha, const float *a, const int *lda, const float *x, const int *incx,
-          const float *beta, float *y, const int *incy) {
-    cblas_sgemv_wrapper(layout, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy);
-}
-
-template <>
-void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n,
-          const double *alpha, const double *a, const int *lda, const double *x, const int *incx,
-          const double *beta, double *y, const int *incy) {
-    cblas_dgemv_wrapper(layout, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy);
-}
-
-template <>
-void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n,
-          const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-          const std::complex<float> *x, const int *incx, const std::complex<float> *beta,
-          std::complex<float> *y, const int *incy) {
-    cblas_cgemv_wrapper(layout, trans, *m, *n, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <>
-void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n,
-          const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-          const std::complex<double> *x, const int *incx, const std::complex<double> *beta,
-          std::complex<double> *y, const int *incy) {
-    cblas_zgemv_wrapper(layout, trans, *m, *n, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <typename fp>
-static void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl,
-                 int *ku, const fp *alpha, const fp *a, const int *lda, const fp *x,
-                 const int *incx, const fp *beta, fp *y, const int *incy);
-
-template <>
-void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, int *ku,
-          const float *alpha, const float *a, const int *lda, const float *x, const int *incx,
-          const float *beta, float *y, const int *incy) {
-    cblas_sgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, *alpha, a, *lda, x, *incx, *beta, y,
-                        *incy);
-}
-
-template <>
-void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, int *ku,
-          const double *alpha, const double *a, const int *lda, const double *x, const int *incx,
-          const double *beta, double *y, const int *incy) {
-    cblas_dgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, *alpha, a, *lda, x, *incx, *beta, y,
-                        *incy);
-}
-
-template <>
-void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, int *ku,
-          const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-          const std::complex<float> *x, const int *incx, const std::complex<float> *beta,
-          std::complex<float> *y, const int *incy) {
-    cblas_cgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <>
-void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, int *ku,
-          const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-          const std::complex<double> *x, const int *incx, const std::complex<double> *beta,
-          std::complex<double> *y, const int *incy) {
-    cblas_zgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <typename fp>
-static void ger(CBLAS_LAYOUT layout, const int *m, const int *n, const fp *alpha, const fp *x,
-                const int *incx, const fp *y, const int *incy, fp *a, const int *lda);
-
-template <>
-void ger(CBLAS_LAYOUT layout, const int *m, const int *n, const float *alpha, const float *x,
-         const int *incx, const float *y, const int *incy, float *a, const int *lda) {
-    cblas_sger_wrapper(layout, *m, *n, *alpha, x, *incx, y, *incy, a, *lda);
-}
-
-template <>
-void ger(CBLAS_LAYOUT layout, const int *m, const int *n, const double *alpha, const double *x,
-         const int *incx, const double *y, const int *incy, double *a, const int *lda) {
-    cblas_dger_wrapper(layout, *m, *n, *alpha, x, *incx, y, *incy, a, *lda);
-}
-
-template <typename fp>
-static void gerc(CBLAS_LAYOUT layout, const int *m, const int *n, const fp *alpha, const fp *x,
-                 const int *incx, const fp *y, const int *incy, fp *a, const int *lda);
-
-template <>
-void gerc(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex<float> *alpha,
-          const std::complex<float> *x, const int *incx, const std::complex<float> *y,
-          const int *incy, std::complex<float> *a, const int *lda) {
-    cblas_cgerc_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a, *lda);
-}
-
-template <>
-void gerc(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex<double> *alpha,
-          const std::complex<double> *x, const int *incx, const std::complex<double> *y,
-          const int *incy, std::complex<double> *a, const int *lda) {
-    cblas_zgerc_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a, *lda);
-}
-
-template <typename fp>
-static void geru(CBLAS_LAYOUT layout, const int *m, const int *n, const fp *alpha, const fp *x,
-                 const int *incx, const fp *y, const int *incy, fp *a, const int *lda);
-
-template <>
-void geru(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex<float> *alpha,
-          const std::complex<float> *x, const int *incx, const std::complex<float> *y,
-          const int *incy, std::complex<float> *a, const int *lda) {
-    cblas_cgeru_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a, *lda);
-}
-
-template <>
-void geru(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex<double> *alpha,
-          const std::complex<double> *x, const int *incx, const std::complex<double> *y,
-          const int *incy, std::complex<double> *a, const int *lda) {
-    cblas_zgeru_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a, *lda);
-}
-
-template <typename fp>
-static void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k,
-                 const fp *alpha, const fp *a, const int *lda, const fp *x, const int *incx,
-                 const fp *beta, fp *y, const int *incy);
-
-template <>
-void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k,
-          const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-          const std::complex<float> *x, const int *incx, const std::complex<float> *beta,
-          std::complex<float> *y, const int *incy) {
-    cblas_chbmv_wrapper(layout, upper_lower, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <>
-void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k,
-          const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-          const std::complex<double> *x, const int *incx, const std::complex<double> *beta,
-          std::complex<double> *y, const int *incy) {
-    cblas_zhbmv_wrapper(layout, upper_lower, *n, *k, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <typename fp>
-static void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *a, const int *lda, const fp *x, const int *incx, const fp *beta, fp *y,
-                 const int *incy);
-
-template <>
-void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<float> *alpha, const std::complex<float> *a, const int *lda,
-          const std::complex<float> *x, const int *incx, const std::complex<float> *beta,
-          std::complex<float> *y, const int *incy) {
-    cblas_chemv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <>
-void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<double> *alpha, const std::complex<double> *a, const int *lda,
-          const std::complex<double> *x, const int *incx, const std::complex<double> *beta,
-          std::complex<double> *y, const int *incy) {
-    cblas_zhemv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a, *lda,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <typename fp_scalar, typename fp_data>
-static void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp_scalar *alpha,
-                const fp_data *x, const int *incx, fp_data *a, const int *lda);
-
-template <>
-void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-         const std::complex<float> *x, const int *incx, std::complex<float> *a, const int *lda) {
-    cblas_cher_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a, *lda);
-}
-
-template <>
-void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-         const std::complex<double> *x, const int *incx, std::complex<double> *a, const int *lda) {
-    cblas_zher_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a, *lda);
-}
-
-template <typename fp>
-static void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *x, const int *incx, const fp *y, const int *incy, fp *a, const int *lda);
-
-template <>
-void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<float> *alpha, const std::complex<float> *x, const int *incx,
-          const std::complex<float> *y, const int *incy, std::complex<float> *a, const int *lda) {
-    cblas_cher2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a, *lda);
-}
-
-template <>
-void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<double> *alpha, const std::complex<double> *x, const int *incx,
-          const std::complex<double> *y, const int *incy, std::complex<double> *a, const int *lda) {
-    cblas_zher2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a, *lda);
-}
-
-template <typename fp>
-static void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *a, const fp *x, const int *incx, const fp *beta, fp *y, const int *incy);
-
-template <>
-void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<float> *alpha, const std::complex<float> *a,
-          const std::complex<float> *x, const int *incx, const std::complex<float> *beta,
-          std::complex<float> *y, const int *incy) {
-    cblas_chpmv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <>
-void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<double> *alpha, const std::complex<double> *a,
-          const std::complex<double> *x, const int *incx, const std::complex<double> *beta,
-          std::complex<double> *y, const int *incy) {
-    cblas_zhpmv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a,
-                        (const void *)x, *incx, (const void *)beta, (void *)y, *incy);
-}
-
-template <typename fp_scalar, typename fp_data>
-static void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp_scalar *alpha,
-                const fp_data *x, const int *incx, fp_data *a);
-
-template <>
-void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-         const std::complex<float> *x, const int *incx, std::complex<float> *a) {
-    cblas_chpr_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a);
-}
-
-template <>
-void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-         const std::complex<double> *x, const int *incx, std::complex<double> *a) {
-    cblas_zhpr_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a);
-}
-
-template <typename fp>
-static void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *x, const int *incx, const fp *y, const int *incy, fp *a);
-
-template <>
-void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<float> *alpha, const std::complex<float> *x, const int *incx,
-          const std::complex<float> *y, const int *incy, std::complex<float> *a) {
-    cblas_chpr2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a);
-}
-
-template <>
-void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n,
-          const std::complex<double> *alpha, const std::complex<double> *x, const int *incx,
-          const std::complex<double> *y, const int *incy, std::complex<double> *a) {
-    cblas_zhpr2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx,
-                        (const void *)y, *incy, (void *)a);
-}
-
-template <typename fp>
-static void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k,
-                 const fp *alpha, const fp *a, const int *lda, const fp *x, const int *incx,
-                 const fp *beta, fp *y, const int *incy);
-
-template <>
-void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k,
-          const float *alpha, const float *a, const int *lda, const float *x, const int *incx,
-          const float *beta, float *y, const int *incy) {
-    cblas_ssbmv_wrapper(layout, upper_lower, *n, *k, *alpha, a, *lda, x, *incx, *beta, y, *incy);
-}
-
-template <>
-void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k,
-          const double *alpha, const double *a, const int *lda, const double *x, const int *incx,
-          const double *beta, double *y, const int *incy) {
-    cblas_dsbmv_wrapper(layout, upper_lower, *n, *k, *alpha, a, *lda, x, *incx, *beta, y, *incy);
-}
-
-template <typename fp>
-static void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *a, const int *lda, const fp *x, const int *incx, const fp *beta, fp *y,
-                 const int *incy);
-
-template <>
-void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-          const float *a, const int *lda, const float *x, const int *incx, const float *beta,
-          float *y, const int *incy) {
-    cblas_ssymv_wrapper(layout, upper_lower, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy);
-}
-
-template <>
-void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-          const double *a, const int *lda, const double *x, const int *incx, const double *beta,
-          double *y, const int *incy) {
-    cblas_dsymv_wrapper(layout, upper_lower, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy);
-}
-
-template <typename fp>
-static void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                const fp *x, const int *incx, fp *a, const int *lda);
-
-template <>
-void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-         const float *x, const int *incx, float *a, const int *lda) {
-    cblas_ssyr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a, *lda);
-}
-
-template <>
-void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-         const double *x, const int *incx, double *a, const int *lda) {
-    cblas_dsyr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a, *lda);
-}
-
-template <typename fp>
-static void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *x, const int *incx, const fp *y, const int *incy, fp *a, const int *lda);
-
-template <>
-void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-          const float *x, const int *incx, const float *y, const int *incy, float *a,
-          const int *lda) {
-    cblas_ssyr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a, *lda);
-}
-
-template <>
-void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-          const double *x, const int *incx, const double *y, const int *incy, double *a,
-          const int *lda) {
-    cblas_dsyr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a, *lda);
-}
-
-template <typename fp>
-static void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *a, const fp *x, const int *incx, const fp *beta, fp *y, const int *incy);
-
-template <>
-void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-          const float *a, const float *x, const int *incx, const float *beta, float *y,
-          const int *incy) {
-    cblas_sspmv_wrapper(layout, upper_lower, *n, *alpha, a, x, *incx, *beta, y, *incy);
-}
-
-template <>
-void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-          const double *a, const double *x, const int *incx, const double *beta, double *y,
-          const int *incy) {
-    cblas_dspmv_wrapper(layout, upper_lower, *n, *alpha, a, x, *incx, *beta, y, *incy);
-}
-
-template <typename fp>
-static void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                const fp *x, const int *incx, fp *a);
-
-template <>
-void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-         const float *x, const int *incx, float *a) {
-    cblas_sspr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a);
-}
-
-template <>
-void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-         const double *x, const int *incx, double *a) {
-    cblas_dspr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a);
-}
-
-template <typename fp>
-static void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha,
-                 const fp *x, const int *incx, const fp *y, const int *incy, fp *a);
-
-template <>
-void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha,
-          const float *x, const int *incx, const float *y, const int *incy, float *a) {
-    cblas_sspr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a);
-}
-
-template <>
-void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha,
-          const double *x, const int *incx, const double *y, const int *incy, double *a) {
-    cblas_dspr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a);
-}
-
-template <typename fp>
-static void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                 CBLAS_DIAG unit_diag, const int *n, const int *k, const fp *a, const int *lda,
-                 fp *x, const int *incx);
-
-template <>
-void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const float *a, const int *lda, float *x, const int *incx) {
-    cblas_stbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx);
-}
-
-template <>
-void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const double *a, const int *lda, double *x, const int *incx) {
-    cblas_dtbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx);
-}
-
-template <>
-void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const std::complex<float> *a, const int *lda,
-          std::complex<float> *x, const int *incx) {
-    cblas_ctbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda,
-                        (void *)x, *incx);
-}
-
-template <>
-void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const std::complex<double> *a, const int *lda,
-          std::complex<double> *x, const int *incx) {
-    cblas_ztbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda,
-                        (void *)x, *incx);
-}
-
-template <typename fp>
-static void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                 CBLAS_DIAG unit_diag, const int *n, const int *k, const fp *a, const int *lda,
-                 fp *x, const int *incx);
-
-template <>
-void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const float *a, const int *lda, float *x, const int *incx) {
-    cblas_stbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx);
-}
-
-template <>
-void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const double *a, const int *lda, double *x, const int *incx) {
-    cblas_dtbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx);
-}
-
-template <>
-void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const std::complex<float> *a, const int *lda,
-          std::complex<float> *x, const int *incx) {
-    cblas_ctbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda,
-                        (void *)x, *incx);
-}
-
-template <>
-void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const int *k, const std::complex<double> *a, const int *lda,
-          std::complex<double> *x, const int *incx) {
-    cblas_ztbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda,
-                        (void *)x, *incx);
-}
-
-template <typename fp>
-static void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                 CBLAS_DIAG unit_diag, const int *n, const fp *a, fp *x, const int *incx);
-
-template <>
-void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const float *a, float *x, const int *incx) {
-    cblas_stpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx);
-}
-
-template <>
-void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const double *a, double *x, const int *incx) {
-    cblas_dtpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx);
-}
-
-template <>
-void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<float> *a, std::complex<float> *x, const int *incx) {
-    cblas_ctpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x,
-                        *incx);
-}
-
-template <>
-void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<double> *a, std::complex<double> *x, const int *incx) {
-    cblas_ztpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x,
-                        *incx);
-}
-
-template <typename fp>
-static void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                 CBLAS_DIAG unit_diag, const int *n, const fp *a, fp *x, const int *incx);
-
-template <>
-void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const float *a, float *x, const int *incx) {
-    cblas_stpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx);
-}
-
-template <>
-void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const double *a, double *x, const int *incx) {
-    cblas_dtpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx);
-}
-
-template <>
-void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<float> *a, std::complex<float> *x, const int *incx) {
-    cblas_ctpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x,
-                        *incx);
-}
-
-template <>
-void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<double> *a, std::complex<double> *x, const int *incx) {
-    cblas_ztpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x,
-                        *incx);
-}
-
-template <typename fp>
-static void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                 CBLAS_DIAG unit_diag, const int *n, const fp *a, const int *lda, fp *x,
-                 const int *incx);
-
-template <>
-void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const float *a, const int *lda, float *x, const int *incx) {
-    cblas_strmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx);
-}
-
-template <>
-void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const double *a, const int *lda, double *x, const int *incx) {
-    cblas_dtrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx);
-}
-
-template <>
-void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<float> *a, const int *lda, std::complex<float> *x,
-          const int *incx) {
-    cblas_ctrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x,
-                        *incx);
-}
-
-template <>
-void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<double> *a, const int *lda, std::complex<double> *x,
-          const int *incx) {
-    cblas_ztrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x,
-                        *incx);
-}
-
-template <typename fp>
-static void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                 CBLAS_DIAG unit_diag, const int *n, const fp *a, const int *lda, fp *x,
-                 const int *incx);
-
-template <>
-void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const float *a, const int *lda, float *x, const int *incx) {
-    cblas_strsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx);
-}
-
-template <>
-void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const double *a, const int *lda, double *x, const int *incx) {
-    cblas_dtrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx);
-}
-
-template <>
-void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<float> *a, const int *lda, std::complex<float> *x,
-          const int *incx) {
-    cblas_ctrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x,
-                        *incx);
-}
-
-template <>
-void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag,
-          const int *n, const std::complex<double> *a, const int *lda, std::complex<double> *x,
-          const int *incx) {
-    cblas_ztrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x,
-                        *incx);
-}
-
-/* Level 1 */
-
-template <typename fp_data, typename fp_res>
-static fp_res asum(const int *n, const fp_data *x, const int *incx);
-
-template <>
-float asum(const int *n, const float *x, const int *incx) {
-    return cblas_sasum_wrapper(*n, x, *incx);
-}
-
-template <>
-double asum(const int *n, const double *x, const int *incx) {
-    return cblas_dasum_wrapper(*n, x, *incx);
-}
-
-template <>
-float asum(const int *n, const std::complex<float> *x, const int *incx) {
-    return cblas_scasum_wrapper(*n, (const void *)x, *incx);
-}
-
-template <>
-double asum(const int *n, const std::complex<double> *x, const int *incx) {
-    return cblas_dzasum_wrapper(*n, (const void *)x, *incx);
-}
-
-template <typename fp>
-static void axpy(const int *n, const fp *alpha, const fp *x, const int *incx, fp *y,
-                 const int *incy);
-
-template <>
-void axpy(const int *n, const float *alpha, const float *x, const int *incx, float *y,
-          const int *incy) {
-    cblas_saxpy_wrapper(*n, *alpha, x, *incx, y, *incy);
-}
-
-template <>
-void axpy(const int *n, const double *alpha, const double *x, const int *incx, double *y,
-          const int *incy) {
-    cblas_daxpy_wrapper(*n, *alpha, x, *incx, y, *incy);
-}
-
-template <>
-void axpy(const int *n, const std::complex<float> *alpha, const std::complex<float> *x,
-          const int *incx, std::complex<float> *y, const int *incy) {
-    cblas_caxpy_wrapper(*n, (const void *)alpha, (const void *)x, *incx, (void *)y, *incy);
-}
-
-template <>
-void axpy(const int *n, const std::complex<double> *alpha, const std::complex<double> *x,
-          const int *incx, std::complex<double> *y, const int *incy) {
-    cblas_zaxpy_wrapper(*n, (const void *)alpha, (const void *)x, *incx, (void *)y, *incy);
-}
-
-template <typename fp>
-static void copy(const int *n, const fp *x, const int *incx, fp *y, const int *incy);
-
-template <>
-void copy(const int *n, const float *x, const int *incx, float *y, const int *incy) {
-    cblas_scopy_wrapper(*n, x, *incx, y, *incy);
-}
-template <>
-void copy(const int *n, const double *x, const int *incx, double *y, const int *incy) {
-    cblas_dcopy_wrapper(*n, x, *incx, y, *incy);
-}
-template <>
-void copy(const int *n, const std::complex<float> *x, const int *incx, std::complex<float> *y,
-          const int *incy) {
-    cblas_ccopy_wrapper(*n, (const void *)x, *incx, (void *)y, *incy);
-}
-template <>
-void copy(const int *n, const std::complex<double> *x, const int *incx, std::complex<double> *y,
-          const int *incy) {
-    cblas_zcopy_wrapper(*n, (const void *)x, *incx, (void *)y, *incy);
-}
-
-template <typename fp, typename fp_res>
-static fp_res dot(const int *n, const fp *x, const int *incx, const fp *y, const int *incy);
-
-template <>
-float dot(const int *n, const float *x, const int *incx, const float *y, const int *incy) {
-    return cblas_sdot_wrapper(*n, x, *incx, y, *incy);
-}
-
-template <>
-double dot(const int *n, const double *x, const int *incx, const double *y, const int *incy) {
-    return cblas_ddot_wrapper(*n, x, *incx, y, *incy);
-}
-
-template <>
-double dot(const int *n, const float *x, const int *incx, const float *y, const int *incy) {
-    return cblas_dsdot_wrapper(*n, x, *incx, y, *incy);
-}
-
-static float sdsdot(const int *n, const float *sb, const float *x, const int *incx, const float *y,
-                    const int *incy) {
-    return cblas_sdsdot_wrapper(*n, *sb, x, *incx, y, *incy);
-}
-
-template <typename fp, typename fp_res>
-static fp_res nrm2(const int *n, const fp *x, const int *incx);
-
-template <>
-float nrm2(const int *n, const float *x, const int *incx) {
-    return cblas_snrm2_wrapper(*n, x, *incx);
-}
-
-template <>
-double nrm2(const int *n, const double *x, const int *incx) {
-    return cblas_dnrm2_wrapper(*n, x, *incx);
-}
-
-template <>
-float nrm2(const int *n, const std::complex<float> *x, const int *incx) {
-    return cblas_scnrm2_wrapper(*n, (const void *)x, *incx);
-}
-
-template <>
-double nrm2(const int *n, const std::complex<double> *x, const int *incx) {
-    return cblas_dznrm2_wrapper(*n, (const void *)x, *incx);
-}
-
-template <typename fp, typename fp_scalar>
-static void rot(const int *n, fp *x, const int *incx, fp *y, const int *incy, const fp_scalar *c,
-                const fp_scalar *s);
-
-template <>
-void rot(const int *n, float *x, const int *incx, float *y, const int *incy, const float *c,
-         const float *s) {
-    cblas_srot_wrapper(*n, x, *incx, y, *incy, *c, *s);
-}
-
-template <>
-void rot(const int *n, double *x, const int *incx, double *y, const int *incy, const double *c,
-         const double *s) {
-    cblas_drot_wrapper(*n, x, *incx, y, *incy, *c, *s);
-}
-
-template <>
-void rot(const int *n, std::complex<float> *x, const int *incx, std::complex<float> *y,
-         const int *incy, const float *c, const float *s) {
-    csrot_wrapper(n, (void *)x, incx, (void *)y, incy, c, s);
-}
-
-template <>
-void rot(const int *n, std::complex<double> *x, const int *incx, std::complex<double> *y,
-         const int *incy, const double *c, const double *s) {
-    zdrot_wrapper(n, (void *)x, incx, (void *)y, incy, c, s);
-}
-
-template <typename fp, typename fp_c>
-static void rotg(fp *a, fp *b, fp_c *c, fp *s);
-
-template <>
-void rotg(float *a, float *b, float *c, float *s) {
-    cblas_srotg_wrapper(a, b, c, s);
-}
-
-template <>
-void rotg(double *a, double *b, double *c, double *s) {
-    cblas_drotg_wrapper(a, b, c, s);
-}
-
-template <>
-void rotg(std::complex<float> *a, std::complex<float> *b, float *c, std::complex<float> *s) {
-    crotg_wrapper((void *)a, (void *)b, c, (void *)s);
-}
-
-template <>
-void rotg(std::complex<double> *a, std::complex<double> *b, double *c, std::complex<double> *s) {
-    zrotg_wrapper((void *)a, (void *)b, c, (void *)s);
-}
-
-template <typename fp>
-static void rotm(const int *n, fp *x, const int *incx, fp *y, const int *incy, const fp *param);
-
-template <>
-void rotm(const int *n, float *x, const int *incx, float *y, const int *incy, const float *param) {
-    cblas_srotm_wrapper(*n, x, *incx, y, *incy, param);
-}
-
-template <>
-void rotm(const int *n, double *x, const int *incx, double *y, const int *incy,
-          const double *param) {
-    cblas_drotm_wrapper(*n, x, *incx, y, *incy, param);
-}
-
-template <typename fp>
-static void rotmg(fp *d1, fp *d2, fp *x1, fp *y1, fp *param);
-
-template <>
-void rotmg(float *d1, float *d2, float *x1, float *y1, float *param) {
-    cblas_srotmg_wrapper(d1, d2, x1, *y1, param);
-}
-
-template <>
-void rotmg(double *d1, double *d2, double *x1, double *y1, double *param) {
-    cblas_drotmg_wrapper(d1, d2, x1, *y1, param);
-}
-
-template <typename fp_scalar, typename fp_data>
-static void scal(const int *n, const fp_scalar *alpha, fp_data *x, const int *incx);
-
-template <>
-void scal(const int *n, const float *alpha, float *x, const int *incx) {
-    cblas_sscal_wrapper(*n, *alpha, x, *incx);
-}
-template <>
-void scal(const int *n, const double *alpha, double *x, const int *incx) {
-    cblas_dscal_wrapper(*n, *alpha, x, *incx);
-}
-template <>
-void scal(const int *n, const std::complex<float> *alpha, std::complex<float> *x, const int *incx) {
-    cblas_cscal_wrapper(*n, (const void *)alpha, (void *)x, *incx);
-}
-template <>
-void scal(const int *n, const std::complex<double> *alpha, std::complex<double> *x,
-          const int *incx) {
-    cblas_zscal_wrapper(*n, (const void *)alpha, (void *)x, *incx);
-}
-template <>
-void scal(const int *n, const float *alpha, std::complex<float> *x, const int *incx) {
-    cblas_csscal_wrapper(*n, *alpha, (void *)x, *incx);
-}
-template <>
-void scal(const int *n, const double *alpha, std::complex<double> *x, const int *incx) {
-    cblas_zdscal_wrapper(*n, *alpha, (void *)x, *incx);
-}
-
-template <typename fp>
-static void swap(const int *n, fp *x, const int *incx, fp *y, const int *incy);
-
-template <>
-void swap(const int *n, float *x, const int *incx, float *y, const int *incy) {
-    cblas_sswap_wrapper(*n, x, *incx, y, *incy);
-}
-
-template <>
-void swap(const int *n, double *x, const int *incx, double *y, const int *incy) {
-    cblas_dswap_wrapper(*n, x, *incx, y, *incy);
-}
-
-template <>
-void swap(const int *n, std::complex<float> *x, const int *incx, std::complex<float> *y,
-          const int *incy) {
-    cblas_cswap_wrapper(*n, (void *)x, *incx, (void *)y, *incy);
-}
-
-template <>
-void swap(const int *n, std::complex<double> *x, const int *incx, std::complex<double> *y,
-          const int *incy) {
-    cblas_zswap_wrapper(*n, (void *)x, *incx, (void *)y, *incy);
-}
-
-template <typename fp>
-static void dotc(fp *pres, const int *n, const fp *x, const int *incx, const fp *y,
-                 const int *incy);
-
-template <>
-void dotc(std::complex<float> *pres, const int *n, const std::complex<float> *x, const int *incx,
-          const std::complex<float> *y, const int *incy) {
-    cblas_cdotc_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, *incy, (void *)pres);
-}
-
-template <>
-void dotc(std::complex<double> *pres, const int *n, const std::complex<double> *x, const int *incx,
-          const std::complex<double> *y, const int *incy) {
-    cblas_zdotc_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, *incy, (void *)pres);
-}
-
-template <typename fp>
-static void dotu(fp *pres, const int *n, const fp *x, const int *incx, const fp *y,
-                 const int *incy);
-
-template <>
-void dotu(std::complex<float> *pres, const int *n, const std::complex<float> *x, const int *incx,
-          const std::complex<float> *y, const int *incy) {
-    cblas_cdotu_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, *incy, (void *)pres);
-}
-
-template <>
-void dotu(std::complex<double> *pres, const int *n, const std::complex<double> *x, const int *incx,
-          const std::complex<double> *y, const int *incy) {
-    cblas_zdotu_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, *incy, (void *)pres);
-}
-
-template <typename fp>
-static int iamax(const int *n, const fp *x, const int *incx);
-
-template <>
-int iamax(const int *n, const float *x, const int *incx) {
-    return cblas_isamax_wrapper(*n, x, *incx);
-}
-
-template <>
-int iamax(const int *n, const double *x, const int *incx) {
-    return cblas_idamax_wrapper(*n, x, *incx);
-}
-
-template <>
-int iamax(const int *n, const std::complex<float> *x, const int *incx) {
-    return cblas_icamax_wrapper(*n, (const void *)x, *incx);
-}
-
-template <>
-int iamax(const int *n, const std::complex<double> *x, const int *incx) {
-    return cblas_izamax_wrapper(*n, (const void *)x, *incx);
-}
-
-inline float abs_val(float val) {
-    return std::abs(val);
-}
-
-inline double abs_val(double val) {
-    return std::abs(val);
-}
-
-inline float abs_val(std::complex<float> val) {
-    return std::abs(val.real()) + std::abs(val.imag());
-}
-
-inline double abs_val(std::complex<double> val) {
-    return std::abs(val.real()) + std::abs(val.imag());
-}
-
-template <typename fp>
-static int iamin(const int *n, const fp *x, const int *incx);
-
-template <>
-int iamin(const int *n, const float *x, const int *incx) {
-    if (*n < 1 || *incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < *n; ++logical_i) {
-        int i = logical_i * std::abs(*incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-template <>
-int iamin(const int *n, const double *x, const int *incx) {
-    if (*n < 1 || *incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < *n; ++logical_i) {
-        int i = logical_i * std::abs(*incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-template <>
-int iamin(const int *n, const std::complex<float> *x, const int *incx) {
-    if (*n < 1 || *incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < *n; ++logical_i) {
-        int i = logical_i * std::abs(*incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-template <>
-int iamin(const int *n, const std::complex<double> *x, const int *incx) {
-    if (*n < 1 || *incx < 1) {
-        return 0;
-    }
-    int min_idx = 0;
-    auto min_val = abs_val(x[0]);
-    if (sycl::isnan(min_val))
-        return 0;
-
-    for (int logical_i = 1; logical_i < *n; ++logical_i) {
-        int i = logical_i * std::abs(*incx);
-        auto curr_val = abs_val(x[i]);
-        if (sycl::isnan(curr_val))
-            return logical_i;
-        if (curr_val < min_val) {
-            min_idx = logical_i;
-            min_val = curr_val;
-        }
-    }
-    return min_idx;
-}
-
-/* Extensions */
-
-template <typename fp>
-static void axpby(const int *n, const fp *alpha, const fp *x, const int *incx, const fp *beta,
-                  fp *y, const int *incy);
-
-template <>
-void axpby(const int *n, const float *alpha, const float *x, const int *incx, const float *beta,
-           float *y, const int *incy) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx);
-    int idy = (*incy) > 0 ? 0 : (1 - *n) * (*incy);
-    for (int i = 0; i < *n; i++)
-        y[idy + i * (*incy)] = *alpha * x[idx + i * (*incx)] + (*beta) * y[idy + i * (*incy)];
-}
-
-template <>
-void axpby(const int *n, const double *alpha, const double *x, const int *incx, const double *beta,
-           double *y, const int *incy) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx);
-    int idy = (*incy) > 0 ? 0 : (1 - *n) * (*incy);
-    for (int i = 0; i < *n; i++)
-        y[idy + i * (*incy)] = *alpha * x[idx + i * (*incx)] + (*beta) * y[idy + i * (*incy)];
-}
-
-template <>
-void axpby(const int *n, const std::complex<float> *alpha, const std::complex<float> *x,
-           const int *incx, const std::complex<float> *beta, std::complex<float> *y,
-           const int *incy) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx);
-    int idy = (*incy) > 0 ? 0 : (1 - *n) * (*incy);
-    for (int i = 0; i < *n; i++)
-        y[idy + i * (*incy)] = *alpha * x[idx + i * (*incx)] + (*beta) * y[idy + i * (*incy)];
-}
-
-template <>
-void axpby(const int *n, const std::complex<double> *alpha, const std::complex<double> *x,
-           const int *incx, const std::complex<double> *beta, std::complex<double> *y,
-           const int *incy) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx);
-    int idy = (*incy) > 0 ? 0 : (1 - *n) * (*incy);
-    for (int i = 0; i < *n; i++)
-        y[idy + i * (*incy)] = *alpha * x[idx + i * (*incx)] + (*beta) * y[idy + i * (*incy)];
-}
-
-template <typename fps, typename fpa, typename fpb, typename fpc>
-static void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                      CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k,
-                      const fps *alpha, const fpa *a, const int *lda, const fpa *ao, const fpb *b,
-                      const int *ldb, const fpb *bo, const fps *beta, fpc *c, const int *ldc,
-                      const fpc *co);
-
-template <>
-void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-               CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha,
-               const int8_t *a, const int *lda, const int8_t *ao, const int8_t *b, const int *ldb,
-               const int8_t *bo, const float *beta, int32_t *c, const int *ldc, const int32_t *co) {
-    // Not supported in NETLIB. DGEMM is used as reference.
-    int sizea, sizeb, sizec;
-    if (layout == CblasColMajor) {
-        sizea = (transa == CblasNoTrans) ? *lda * *k : *lda * *m;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *n : *ldb * *k;
-        sizec = *ldc * *n;
-    }
-    else {
-        sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n;
-        sizec = *ldc * *m;
-    }
-    double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea);
-    double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb);
-    double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec);
-    double alphad = *alpha;
-    double betad = *beta;
-    double aod = *ao;
-    double bod = *bo;
-    copy_mat(a, layout, transa, *m, *k, *lda, aod, ad);
-    copy_mat(b, layout, transb, *k, *n, *ldb, bod, bd);
-    copy_mat(c, layout, CblasNoTrans, *m, *n, *ldc, 0.0, cd);
-    cblas_dgemm_wrapper(layout, transa, transb, *m, *n, *k, alphad, ad, *lda, bd, *ldb, betad, cd,
-                        *ldc);
-    copy_mat(cd, layout, *m, *n, *ldc, offsetc, co, c);
-    oneapi::mkl::aligned_free(ad);
-    oneapi::mkl::aligned_free(bd);
-    oneapi::mkl::aligned_free(cd);
-}
-
-template <>
-void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-               CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha,
-               const int8_t *a, const int *lda, const int8_t *ao, const uint8_t *b, const int *ldb,
-               const uint8_t *bo, const float *beta, int32_t *c, const int *ldc,
-               const int32_t *co) {
-    // Not supported in NETLIB. DGEMM is used as reference.
-    int sizea, sizeb, sizec;
-    if (layout == CblasColMajor) {
-        sizea = (transa == CblasNoTrans) ? *lda * *k : *lda * *m;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *n : *ldb * *k;
-        sizec = *ldc * *n;
-    }
-    else {
-        sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n;
-        sizec = *ldc * *m;
-    }
-    double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea);
-    double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb);
-    double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec);
-    double alphad = *alpha;
-    double betad = *beta;
-    double aod = *ao;
-    double bod = *bo;
-    copy_mat(a, layout, transa, *m, *k, *lda, aod, ad);
-    copy_mat(b, layout, transb, *k, *n, *ldb, bod, bd);
-    copy_mat(c, layout, CblasNoTrans, *m, *n, *ldc, 0.0, cd);
-    cblas_dgemm_wrapper(layout, transa, transb, *m, *n, *k, alphad, ad, *lda, bd, *ldb, betad, cd,
-                        *ldc);
-    copy_mat(cd, layout, *m, *n, *ldc, offsetc, co, c);
-    oneapi::mkl::aligned_free(ad);
-    oneapi::mkl::aligned_free(bd);
-    oneapi::mkl::aligned_free(cd);
-}
-
-template <>
-void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-               CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha,
-               const uint8_t *a, const int *lda, const uint8_t *ao, const int8_t *b, const int *ldb,
-               const int8_t *bo, const float *beta, int32_t *c, const int *ldc, const int32_t *co) {
-    // Not supported in NETLIB. DGEMM is used as reference.
-    int sizea, sizeb, sizec;
-    if (layout == CblasColMajor) {
-        sizea = (transa == CblasNoTrans) ? *lda * *k : *lda * *m;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *n : *ldb * *k;
-        sizec = *ldc * *n;
-    }
-    else {
-        sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n;
-        sizec = *ldc * *m;
-    }
-    double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea);
-    double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb);
-    double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec);
-    double alphad = *alpha;
-    double betad = *beta;
-    double aod = *ao;
-    double bod = *bo;
-    copy_mat(a, layout, transa, *m, *k, *lda, aod, ad);
-    copy_mat(b, layout, transb, *k, *n, *ldb, bod, bd);
-    copy_mat(c, layout, CblasNoTrans, *m, *n, *ldc, 0.0, cd);
-    cblas_dgemm_wrapper(layout, transa, transb, *m, *n, *k, alphad, ad, *lda, bd, *ldb, betad, cd,
-                        *ldc);
-    copy_mat(cd, layout, *m, *n, *ldc, offsetc, co, c);
-    oneapi::mkl::aligned_free(ad);
-    oneapi::mkl::aligned_free(bd);
-    oneapi::mkl::aligned_free(cd);
-}
-
-template <>
-void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-               CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha,
-               const uint8_t *a, const int *lda, const uint8_t *ao, const uint8_t *b,
-               const int *ldb, const uint8_t *bo, const float *beta, int32_t *c, const int *ldc,
-               const int32_t *co) {
-    // Not supported in NETLIB. DGEMM is used as reference.
-    int sizea, sizeb, sizec;
-    if (layout == CblasColMajor) {
-        sizea = (transa == CblasNoTrans) ? *lda * *k : *lda * *m;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *n : *ldb * *k;
-        sizec = *ldc * *n;
-    }
-    else {
-        sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k;
-        sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n;
-        sizec = *ldc * *m;
-    }
-    double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea);
-    double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb);
-    double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec);
-    double alphad = *alpha;
-    double betad = *beta;
-    double aod = *ao;
-    double bod = *bo;
-    copy_mat(a, layout, transa, *m, *k, *lda, aod, ad);
-    copy_mat(b, layout, transb, *k, *n, *ldb, bod, bd);
-    copy_mat(c, layout, CblasNoTrans, *m, *n, *ldc, 0.0, cd);
-    cblas_dgemm_wrapper(layout, transa, transb, *m, *n, *k, alphad, ad, *lda, bd, *ldb, betad, cd,
-                        *ldc);
-    copy_mat(cd, layout, *m, *n, *ldc, offsetc, co, c);
-    oneapi::mkl::aligned_free(ad);
-    oneapi::mkl::aligned_free(bd);
-    oneapi::mkl::aligned_free(cd);
-}
-
-template <typename fp>
-static void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa,
-                  CBLAS_TRANSPOSE transb, const int *n, const int *k, const fp *alpha, const fp *a,
-                  const int *lda, const fp *b, const int *ldb, const fp *beta, fp *c,
-                  const int *ldc);
-
-template <>
-void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa,
-           CBLAS_TRANSPOSE transb, const int *n, const int *k, const float *alpha, const float *a,
-           const int *lda, const float *b, const int *ldb, const float *beta, float *c,
-           const int *ldc) {
-    // Not supported in NETLIB. SGEMM is used as reference.
-    int sizec;
-    sizec = *ldc * *n;
-    float *cf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizec);
-    update_c(c, layout, upper_lower, *n, *n, *ldc, cf);
-    cblas_sgemm_wrapper(layout, transa, transb, *n, *n, *k, *alpha, a, *lda, b, *ldb, *beta, cf,
-                        *ldc);
-    update_c(cf, layout, upper_lower, *n, *n, *ldc, c);
-    oneapi::mkl::aligned_free(cf);
-}
-
-template <>
-void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa,
-           CBLAS_TRANSPOSE transb, const int *n, const int *k, const double *alpha, const double *a,
-           const int *lda, const double *b, const int *ldb, const double *beta, double *c,
-           const int *ldc) {
-    // Not supported in NETLIB. DGEMM is used as reference.
-    int sizec;
-    sizec = *ldc * *n;
-    double *cf = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec);
-    update_c(c, layout, upper_lower, *n, *n, *ldc, cf);
-    cblas_dgemm_wrapper(layout, transa, transb, *n, *n, *k, *alpha, a, *lda, b, *ldb, *beta, cf,
-                        *ldc);
-    update_c(cf, layout, upper_lower, *n, *n, *ldc, c);
-    oneapi::mkl::aligned_free(cf);
-}
-
-template <>
-void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa,
-           CBLAS_TRANSPOSE transb, const int *n, const int *k, const std::complex<float> *alpha,
-           const std::complex<float> *a, const int *lda, const std::complex<float> *b,
-           const int *ldb, const std::complex<float> *beta, std::complex<float> *c,
-           const int *ldc) {
-    // Not supported in NETLIB. CGEMM is used as reference.
-    int sizec;
-    sizec = *ldc * *n;
-    std::complex<float> *cf =
-        (std::complex<float> *)oneapi::mkl::aligned_alloc(64, sizeof(std::complex<float>) * sizec);
-    update_c(c, layout, upper_lower, *n, *n, *ldc, cf);
-    cblas_cgemm_wrapper(layout, transa, transb, *n, *n, *k, alpha, a, *lda, b, *ldb, beta, cf,
-                        *ldc);
-    update_c(cf, layout, upper_lower, *n, *n, *ldc, c);
-    oneapi::mkl::aligned_free(cf);
-}
-
-template <>
-void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa,
-           CBLAS_TRANSPOSE transb, const int *n, const int *k, const std::complex<double> *alpha,
-           const std::complex<double> *a, const int *lda, const std::complex<double> *b,
-           const int *ldb, const std::complex<double> *beta, std::complex<double> *c,
-           const int *ldc) {
-    // Not supported in NETLIB. ZGEMM is used as reference.
-    int sizec;
-    sizec = *ldc * *n;
-    std::complex<double> *cf = (std::complex<double> *)oneapi::mkl::aligned_alloc(
-        64, sizeof(std::complex<double>) * sizec);
-    update_c(c, layout, upper_lower, *n, *n, *ldc, cf);
-    cblas_zgemm_wrapper(layout, transa, transb, *n, *n, *k, alpha, a, *lda, b, *ldb, beta, cf,
-                        *ldc);
-    update_c(cf, layout, upper_lower, *n, *n, *ldc, c);
-    oneapi::mkl::aligned_free(cf);
-}
-
-template <typename fp>
-static void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n,
-                 const fp *a, const int *lda, const fp *x, const int *incx, fp *c, const int *ldc);
-
-template <>
-void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n, const float *a,
-          const int *lda, const float *x, const int *incx, float *c, const int *ldc) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    float tmp;
-    int size_x = (left_right == CblasLeft) ? *m : *n;
-    int idx = (*incx) > 0 ? 0 : (1 - size_x) * (*incx);
-
-    if (left_right == CblasRight) {
-        for (int i = 0; i < *n; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *m; j++) {
-                if (layout == CblasColMajor)
-                    c[j + i * (*ldc)] = tmp * a[j + i * (*lda)];
-                else
-                    c[i + j * (*ldc)] = tmp * a[i + j * (*lda)];
-            }
-        }
-    }
-    else {
-        for (int i = 0; i < *m; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *n; j++) {
-                if (layout == CblasColMajor)
-                    c[i + j * (*ldc)] = tmp * a[i + j * (*lda)];
-                else
-                    c[j + i * (*ldc)] = tmp * a[j + i * (*lda)];
-            }
-        }
-    }
-}
-
-template <>
-void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n, const double *a,
-          const int *lda, const double *x, const int *incx, double *c, const int *ldc) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    double tmp;
-    int size_x = (left_right == CblasLeft) ? *m : *n;
-    int idx = (*incx) > 0 ? 0 : (1 - size_x) * (*incx);
-
-    if (left_right == CblasRight) {
-        for (int i = 0; i < *n; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *m; j++) {
-                if (layout == CblasColMajor)
-                    c[j + i * (*ldc)] = tmp * a[j + i * (*lda)];
-                else
-                    c[i + j * (*ldc)] = tmp * a[i + j * (*lda)];
-            }
-        }
-    }
-    else {
-        for (int i = 0; i < *m; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *n; j++) {
-                if (layout == CblasColMajor)
-                    c[i + j * (*ldc)] = tmp * a[i + j * (*lda)];
-                else
-                    c[j + i * (*ldc)] = tmp * a[j + i * (*lda)];
-            }
-        }
-    }
-}
-
-template <>
-void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n,
-          const std::complex<float> *a, const int *lda, const std::complex<float> *x,
-          const int *incx, std::complex<float> *c, const int *ldc) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    std::complex<float> tmp;
-    int size_x = (left_right == CblasLeft) ? *m : *n;
-    int idx = (*incx) > 0 ? 0 : (1 - size_x) * (*incx);
-
-    if (left_right == CblasRight) {
-        for (int i = 0; i < *n; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *m; j++) {
-                if (layout == CblasColMajor) {
-                    c[j + i * (*ldc)] =
-                        std::complex<float>((tmp.real() * a[j + i * (*lda)].real() -
-                                             tmp.imag() * a[j + i * (*lda)].imag()),
-                                            (tmp.real() * a[j + i * (*lda)].imag() +
-                                             tmp.imag() * a[j + i * (*lda)].real()));
-                }
-                else {
-                    c[i + j * (*ldc)] =
-                        std::complex<float>((tmp.real() * a[i + j * (*lda)].real() -
-                                             tmp.imag() * a[i + j * (*lda)].imag()),
-                                            (tmp.real() * a[i + j * (*lda)].imag() +
-                                             tmp.imag() * a[i + j * (*lda)].real()));
-                }
-            }
-        }
-    }
-    else {
-        for (int i = 0; i < *m; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *n; j++) {
-                if (layout == CblasColMajor) {
-                    c[i + j * (*ldc)] =
-                        std::complex<float>((tmp.real() * a[i + j * (*lda)].real() -
-                                             tmp.imag() * a[i + j * (*lda)].imag()),
-                                            (tmp.real() * a[i + j * (*lda)].imag() +
-                                             tmp.imag() * a[i + j * (*lda)].real()));
-                }
-                else {
-                    c[j + i * (*ldc)] =
-                        std::complex<float>((tmp.real() * a[j + i * (*lda)].real() -
-                                             tmp.imag() * a[j + i * (*lda)].imag()),
-                                            (tmp.real() * a[j + i * (*lda)].imag() +
-                                             tmp.imag() * a[j + i * (*lda)].real()));
-                }
-            }
-        }
-    }
-}
-
-template <>
-void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n,
-          const std::complex<double> *a, const int *lda, const std::complex<double> *x,
-          const int *incx, std::complex<double> *c, const int *ldc) {
-    // Not supported in NETLIB. Reference C++ implementation is used.
-    std::complex<double> tmp;
-    int size_x = (left_right == CblasLeft) ? *m : *n;
-    int idx = (*incx) > 0 ? 0 : (1 - size_x) * (*incx);
-
-    if (left_right == CblasRight) {
-        for (int i = 0; i < *n; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *m; j++) {
-                if (layout == CblasColMajor) {
-                    c[j + i * (*ldc)] =
-                        std::complex<double>((tmp.real() * a[j + i * (*lda)].real() -
-                                              tmp.imag() * a[j + i * (*lda)].imag()),
-                                             (tmp.real() * a[j + i * (*lda)].imag() +
-                                              tmp.imag() * a[j + i * (*lda)].real()));
-                }
-                else {
-                    c[i + j * (*ldc)] =
-                        std::complex<double>((tmp.real() * a[i + j * (*lda)].real() -
-                                              tmp.imag() * a[i + j * (*lda)].imag()),
-                                             (tmp.real() * a[i + j * (*lda)].imag() +
-                                              tmp.imag() * a[i + j * (*lda)].real()));
-                }
-            }
-        }
-    }
-    else {
-        for (int i = 0; i < *m; i++) {
-            tmp = x[idx + i * (*incx)];
-            for (int j = 0; j < *n; j++) {
-                if (layout == CblasColMajor) {
-                    c[i + j * (*ldc)] =
-                        std::complex<double>((tmp.real() * a[i + j * (*lda)].real() -
-                                              tmp.imag() * a[i + j * (*lda)].imag()),
-                                             (tmp.real() * a[i + j * (*lda)].imag() +
-                                              tmp.imag() * a[i + j * (*lda)].real()));
-                }
-                else {
-                    c[j + i * (*ldc)] =
-                        std::complex<double>((tmp.real() * a[j + i * (*lda)].real() -
-                                              tmp.imag() * a[j + i * (*lda)].imag()),
-                                             (tmp.real() * a[j + i * (*lda)].imag() +
-                                              tmp.imag() * a[j + i * (*lda)].real()));
-                }
-            }
-        }
-    }
-}
-
-// std::conj can take a real type as input, but still returns a complex type.
-// This version always returns the same type it has as input
-template <typename fp>
-fp sametype_conj(fp x) {
-    if constexpr (std::is_same_v<fp, std::complex<float>> ||
-                  std::is_same_v<fp, std::complex<double>>) {
-        return std::conj(x);
-    }
-    else {
-        return x;
-    }
-}
-
-template <typename fp>
-void omatcopy_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int64_t m, int64_t n,
-                  fp alpha, fp *A, int64_t lda, fp *B, int64_t ldb) {
-    int64_t logical_m, logical_n;
-    if (layout == oneapi::mkl::layout::col_major) {
-        logical_m = m;
-        logical_n = n;
-    }
-    else {
-        logical_m = n;
-        logical_n = m;
-    }
-    if (trans == oneapi::mkl::transpose::nontrans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                B[j * ldb + i] = alpha * A[j * lda + i];
-            }
-        }
-    }
-    else if (trans == oneapi::mkl::transpose::trans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                B[i * ldb + j] = alpha * A[j * lda + i];
-            }
-        }
-    }
-    else {
-        // conjtrans
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                B[i * ldb + j] = alpha * sametype_conj(A[j * lda + i]);
-            }
-        }
-    }
-}
-
-template <typename fp>
-void omatcopy2_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, const int64_t &m,
-                   const int64_t &n, const fp &alpha, const fp *in_matrix, const int64_t &ld_in,
-                   const int64_t &inc_in, fp *out_matrix, const int64_t &ld_out,
-                   const int64_t inc_out) {
-    int64_t logical_m, logical_n;
-    if (layout == oneapi::mkl::layout::col_major) {
-        logical_m = m;
-        logical_n = n;
-    }
-    else {
-        logical_m = n;
-        logical_n = m;
-    }
-    if (trans == oneapi::mkl::transpose::trans) {
-        for (int64_t i = 0; i < logical_m; ++i) {
-            for (int64_t j = 0; j < logical_n; ++j) {
-                {
-                    out_matrix[j * inc_out + i * ld_out] =
-                        alpha * in_matrix[i * inc_in + j * ld_in];
-                }
-            }
-        }
-    }
-    else if (trans == oneapi::mkl::transpose::nontrans) {
-        for (int i = 0; i < logical_n; ++i) {
-            for (int j = 0; j < logical_m; ++j) {
-                {
-                    out_matrix[j * inc_out + i * ld_out] =
-                        alpha * in_matrix[j * inc_in + i * ld_in];
-                }
-            }
-        }
-    }
-    else {
-        for (int64_t i = 0; i < logical_m; ++i) {
-            for (int64_t j = 0, c = 0; j < logical_n; ++j, ++c) {
-                out_matrix[j * inc_out + i * ld_out] =
-                    alpha * sametype_conj(in_matrix[i * inc_in + j * ld_in]);
-            }
-        }
-    }
-
-    return;
-}
-
-template <typename fp>
-void imatcopy_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int64_t m, int64_t n,
-                  fp alpha, fp *A, int64_t lda, int64_t ldb) {
-    int64_t logical_m, logical_n;
-    if (layout == oneapi::mkl::layout::col_major) {
-        logical_m = m;
-        logical_n = n;
-    }
-    else {
-        logical_m = n;
-        logical_n = m;
-    }
-    std::vector<fp> temp(m * n);
-    int64_t ld_temp = (trans == oneapi::mkl::transpose::nontrans ? logical_m : logical_n);
-
-    if (trans == oneapi::mkl::transpose::nontrans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                temp[j * ld_temp + i] = alpha * A[j * lda + i];
-            }
-        }
-    }
-    else if (trans == oneapi::mkl::transpose::trans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                temp[i * ld_temp + j] = alpha * A[j * lda + i];
-            }
-        }
-    }
-    else {
-        // conjtrans
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                temp[i * ld_temp + j] = alpha * sametype_conj(A[j * lda + i]);
-            }
-        }
-    }
-
-    if (trans == oneapi::mkl::transpose::nontrans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                A[j * ldb + i] = temp[j * ld_temp + i];
-            }
-        }
-    }
-    else {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                A[i * ldb + j] = temp[i * ld_temp + j];
-            }
-        }
-    }
-}
-
-template <typename fp>
-void omatadd_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose transa,
-                 oneapi::mkl::transpose transb, int64_t m, int64_t n, fp alpha, fp *A, int64_t lda,
-                 fp beta, fp *B, int64_t ldb, fp *C, int64_t ldc) {
-    int64_t logical_m, logical_n;
-    if (layout == oneapi::mkl::layout::col_major) {
-        logical_m = m;
-        logical_n = n;
-    }
-    else {
-        logical_m = n;
-        logical_n = m;
-    }
-
-    for (int64_t j = 0; j < logical_n; j++) {
-        for (int64_t i = 0; i < logical_m; i++) {
-            C[j * ldc + i] = 0.0;
-        }
-    }
-
-    if (transa == oneapi::mkl::transpose::nontrans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                C[j * ldc + i] += alpha * A[j * lda + i];
-            }
-        }
-    }
-    else if (transa == oneapi::mkl::transpose::trans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                C[j * ldc + i] += alpha * A[i * lda + j];
-            }
-        }
-    }
-    else {
-        // conjtrans
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                C[j * ldc + i] += alpha * sametype_conj(A[i * lda + j]);
-            }
-        }
-    }
-
-    if (transb == oneapi::mkl::transpose::nontrans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                C[j * ldc + i] += beta * B[j * ldb + i];
-            }
-        }
-    }
-    else if (transb == oneapi::mkl::transpose::trans) {
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                C[j * ldc + i] += beta * B[i * ldb + j];
-            }
-        }
-    }
-    else {
-        // conjtrans
-        for (int64_t j = 0; j < logical_n; j++) {
-            for (int64_t i = 0; i < logical_m; i++) {
-                C[j * ldc + i] += beta * sametype_conj(B[i * ldb + j]);
-            }
-        }
-    }
-}
-
-#endif /* header guard */
diff --git a/tests/unit_tests/blas/include/reference_blas_wrappers.hpp b/tests/unit_tests/blas/include/reference_blas_wrappers.hpp
deleted file mode 100644
index 8c7d0938a..000000000
--- a/tests/unit_tests/blas/include/reference_blas_wrappers.hpp
+++ /dev/null
@@ -1,2416 +0,0 @@
-/*******************************************************************************
-* Copyright 2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _REFERENCE_BLAS_WRAPPERS_HPP__
-#define _REFERENCE_BLAS_WRAPPERS_HPP__
-
-#include "oneapi/mkl/exceptions.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include <string>
-#include "cblas.h"
-
-#ifdef __linux__
-#include <dlfcn.h>
-#define LIB_TYPE                void *
-#define GET_LIB_HANDLE(libname) dlopen((libname), RTLD_LAZY | RTLD_LOCAL | RTLD_DEEPBIND)
-#define GET_FUNC(lib, fn)       dlsym(lib, (fn))
-#elif defined(_WIN64)
-#include <windows.h>
-#define LIB_TYPE                HINSTANCE
-#define GET_LIB_HANDLE(libname) LoadLibrary(libname)
-#define GET_FUNC(lib, fn)       GetProcAddress((lib), (fn))
-#endif
-
-extern "C" {
-static LIB_TYPE h_libblas = NULL;
-static LIB_TYPE blas_library() {
-    if (h_libblas == NULL) {
-        h_libblas = GET_LIB_HANDLE(REF_BLAS_LIBNAME);
-        if (h_libblas == NULL) {
-            throw oneapi::mkl::library_not_found(
-                "BLAS", "blas_library()",
-                std::string("failed to load BLAS library ") + REF_BLAS_LIBNAME);
-        }
-    }
-    return h_libblas;
-}
-
-static LIB_TYPE h_libcblas = NULL;
-static LIB_TYPE cblas_library() {
-    if (h_libcblas == NULL) {
-        h_libcblas = GET_LIB_HANDLE(REF_CBLAS_LIBNAME);
-        if (h_libcblas == NULL) {
-            throw oneapi::mkl::library_not_found(
-                "BLAS", "cblas_library()",
-                std::string("failed to load CBLAS library ") + REF_CBLAS_LIBNAME);
-        }
-    }
-    return h_libcblas;
-}
-
-/* Level 3 */
-
-static void (*cblas_sgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                             const int m, const int n, const int k, const float alpha,
-                             const float *a, const int lda, const float *b, const int ldb,
-                             const float beta, float *c, const int ldc);
-static void (*cblas_dgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                             const int m, const int n, const int k, const double alpha,
-                             const double *a, const int lda, const double *b, const int ldb,
-                             const double beta, double *c, const int ldc);
-static void (*cblas_cgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                             const int m, const int n, const int k, const void *alpha,
-                             const void *a, const int lda, const void *b, const int ldb,
-                             const void *beta, void *c, const int ldc);
-static void (*cblas_zgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                             const int m, const int n, const int k, const void *alpha,
-                             const void *a, const int lda, const void *b, const int ldb,
-                             const void *beta, void *c, const int ldc);
-static void (*cblas_ssymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                             const int m, const int n, const float alpha, const float *a,
-                             const int lda, const float *b, const int ldb, const float beta,
-                             float *c, const int ldc);
-static void (*cblas_dsymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                             const int m, const int n, const double alpha, const double *a,
-                             const int lda, const double *b, const int ldb, const double beta,
-                             double *c, const int ldc);
-static void (*cblas_csymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                             const int m, const int n, const void *alpha, const void *a,
-                             const int lda, const void *b, const int ldb, const void *beta, void *c,
-                             const int ldc);
-static void (*cblas_zsymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                             const int m, const int n, const void *alpha, const void *a,
-                             const int lda, const void *b, const int ldb, const void *beta, void *c,
-                             const int ldc);
-static void (*cblas_ssyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                             const int n, const int k, const float alpha, const float *a,
-                             const int lda, const float beta, float *c, const int ldc);
-static void (*cblas_dsyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                             const int n, const int k, const double alpha, const double *a,
-                             const int lda, const double beta, double *c, const int ldc);
-static void (*cblas_csyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                             const int n, const int k, const void *alpha, const void *a,
-                             const int lda, const void *beta, void *c, const int ldc);
-static void (*cblas_zsyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                             const int n, const int k, const void *alpha, const void *a,
-                             const int lda, const void *beta, void *c, const int ldc);
-static void (*cblas_chemm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                             const int m, const int n, const void *alpha, const void *a,
-                             const int lda, const void *b, const int ldb, const void *beta, void *c,
-                             const int ldc);
-static void (*cblas_zhemm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                             const int m, const int n, const void *alpha, const void *a,
-                             const int lda, const void *b, const int ldb, const void *beta, void *c,
-                             const int ldc);
-static void (*cblas_cherk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                             const int n, const int k, const float alpha, const void *a,
-                             const int lda, const float beta, void *c, const int ldc);
-static void (*cblas_zherk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                             const int n, const int k, const double alpha, const void *a,
-                             const int lda, const double beta, void *c, const int ldc);
-static void (*cblas_ssyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                              const int n, const int k, const float alpha, const float *a,
-                              const int lda, const float *b, const int ldb, const float beta,
-                              float *c, const int ldc);
-static void (*cblas_dsyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                              const int n, const int k, const double alpha, const double *a,
-                              const int lda, const double *b, const int ldb, const double beta,
-                              double *c, const int ldc);
-static void (*cblas_csyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                              const int n, const int k, const void *alpha, const void *a,
-                              const int lda, const void *b, const int ldb, const void *beta,
-                              void *c, const int ldc);
-static void (*cblas_zsyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                              const int n, const int k, const void *alpha, const void *a,
-                              const int lda, const void *b, const int ldb, const void *beta,
-                              void *c, const int ldc);
-static void (*cblas_cher2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                              const int n, const int k, const void *alpha, const void *a,
-                              const int lda, const void *b, const int ldb, const float beta,
-                              void *c, const int ldc);
-static void (*cblas_zher2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                              const int n, const int k, const void *alpha, const void *a,
-                              const int lda, const void *b, const int ldb, const double beta,
-                              void *c, const int ldc);
-static void (*cblas_strmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const float alpha, const float *a, const int lda, float *b,
-                             const int ldb);
-static void (*cblas_dtrmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const double alpha, const double *a, const int lda, double *b,
-                             const int ldb);
-static void (*cblas_ctrmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const void *alpha, const void *a, const int lda, void *b,
-                             const int ldb);
-static void (*cblas_ztrmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const void *alpha, const void *a, const int lda, void *b,
-                             const int ldb);
-static void (*cblas_strsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const float alpha, const float *a, const int lda, float *b,
-                             const int ldb);
-static void (*cblas_dtrsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const double alpha, const double *a, const int lda, double *b,
-                             const int ldb);
-static void (*cblas_ctrsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const void *alpha, const void *a, const int lda, void *b,
-                             const int ldb);
-static void (*cblas_ztrsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                             CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                             const void *alpha, const void *a, const int lda, void *b,
-                             const int ldb);
-
-static void cblas_sgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                                const int m, const int n, const int k, const float alpha,
-                                const float *a, const int lda, const float *b, const int ldb,
-                                const float beta, float *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_sgemm_p == NULL)
-            cblas_sgemm_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                          const int m, const int n, const int k, const float alpha, const float *a,
-                          const int lda, const float *b, const int ldb, const float beta, float *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_sgemm");
-        if (cblas_sgemm_p != NULL)
-            cblas_sgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_dgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                                const int m, const int n, const int k, const double alpha,
-                                const double *a, const int lda, const double *b, const int ldb,
-                                const double beta, double *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_dgemm_p == NULL)
-            cblas_dgemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa,
-                                      CBLAS_TRANSPOSE transb, const int m, const int n, const int k,
-                                      const double alpha, const double *a, const int lda,
-                                      const double *b, const int ldb, const double beta, double *c,
-                                      const int ldc))GET_FUNC(h_libcblas, "cblas_dgemm");
-        if (cblas_dgemm_p != NULL)
-            cblas_dgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_cgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                                const int m, const int n, const int k, const void *alpha,
-                                const void *a, const int lda, const void *b, const int ldb,
-                                const void *beta, void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_cgemm_p == NULL)
-            cblas_cgemm_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                          const int m, const int n, const int k, const void *alpha, const void *a,
-                          const int lda, const void *b, const int ldb, const void *beta, void *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_cgemm");
-        if (cblas_cgemm_p != NULL)
-            cblas_cgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_zgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                                const int m, const int n, const int k, const void *alpha,
-                                const void *a, const int lda, const void *b, const int ldb,
-                                const void *beta, void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_zgemm_p == NULL)
-            cblas_zgemm_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb,
-                          const int m, const int n, const int k, const void *alpha, const void *a,
-                          const int lda, const void *b, const int ldb, const void *beta, void *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_zgemm");
-        if (cblas_zgemm_p != NULL)
-            cblas_zgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_ssymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                const int m, const int n, const float alpha, const float *a,
-                                const int lda, const float *b, const int ldb, const float beta,
-                                float *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_ssymm_p == NULL)
-            cblas_ssymm_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int m,
-                          const int n, const float alpha, const float *a, const int lda,
-                          const float *b, const int ldb, const float beta, float *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_ssymm");
-        if (cblas_ssymm_p != NULL)
-            cblas_ssymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_dsymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                const int m, const int n, const double alpha, const double *a,
-                                const int lda, const double *b, const int ldb, const double beta,
-                                double *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_dsymm_p == NULL)
-            cblas_dsymm_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int m,
-                          const int n, const double alpha, const double *a, const int lda,
-                          const double *b, const int ldb, const double beta, double *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_dsymm");
-        if (cblas_dsymm_p != NULL)
-            cblas_dsymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_csymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                const int m, const int n, const void *alpha, const void *a,
-                                const int lda, const void *b, const int ldb, const void *beta,
-                                void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_csymm_p == NULL)
-            cblas_csymm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                      const int m, const int n, const void *alpha, const void *a,
-                                      const int lda, const void *b, const int ldb, const void *beta,
-                                      void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_csymm");
-        if (cblas_csymm_p != NULL)
-            cblas_csymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_zsymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                const int m, const int n, const void *alpha, const void *a,
-                                const int lda, const void *b, const int ldb, const void *beta,
-                                void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_zsymm_p == NULL)
-            cblas_zsymm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                      const int m, const int n, const void *alpha, const void *a,
-                                      const int lda, const void *b, const int ldb, const void *beta,
-                                      void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_zsymm");
-        if (cblas_zsymm_p != NULL)
-            cblas_zsymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_ssyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                const int n, const int k, const float alpha, const float *a,
-                                const int lda, const float beta, float *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_ssyrk_p == NULL)
-            cblas_ssyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                      const int n, const int k, const float alpha, const float *a,
-                                      const int lda, const float beta, float *c,
-                                      const int ldc))GET_FUNC(h_libcblas, "cblas_ssyrk");
-        if (cblas_ssyrk_p != NULL)
-            cblas_ssyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
-    }
-}
-
-static void cblas_dsyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                const int n, const int k, const double alpha, const double *a,
-                                const int lda, const double beta, double *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_dsyrk_p == NULL)
-            cblas_dsyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                      const int n, const int k, const double alpha, const double *a,
-                                      const int lda, const double beta, double *c,
-                                      const int ldc))GET_FUNC(h_libcblas, "cblas_dsyrk");
-        if (cblas_dsyrk_p != NULL)
-            cblas_dsyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
-    }
-}
-
-static void cblas_csyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                const int n, const int k, const void *alpha, const void *a,
-                                const int lda, const void *beta, void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_csyrk_p == NULL)
-            cblas_csyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                      const int n, const int k, const void *alpha, const void *a,
-                                      const int lda, const void *beta, void *c,
-                                      const int ldc))GET_FUNC(h_libcblas, "cblas_csyrk");
-        if (cblas_csyrk_p != NULL)
-            cblas_csyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
-    }
-}
-
-static void cblas_zsyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                const int n, const int k, const void *alpha, const void *a,
-                                const int lda, const void *beta, void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_zsyrk_p == NULL)
-            cblas_zsyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                      const int n, const int k, const void *alpha, const void *a,
-                                      const int lda, const void *beta, void *c,
-                                      const int ldc))GET_FUNC(h_libcblas, "cblas_zsyrk");
-        if (cblas_zsyrk_p != NULL)
-            cblas_zsyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
-    }
-}
-
-static void cblas_chemm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                const int m, const int n, const void *alpha, const void *a,
-                                const int lda, const void *b, const int ldb, const void *beta,
-                                void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_chemm_p == NULL)
-            cblas_chemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                      const int m, const int n, const void *alpha, const void *a,
-                                      const int lda, const void *b, const int ldb, const void *beta,
-                                      void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_chemm");
-        if (cblas_chemm_p != NULL)
-            cblas_chemm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_zhemm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                const int m, const int n, const void *alpha, const void *a,
-                                const int lda, const void *b, const int ldb, const void *beta,
-                                void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_zhemm_p == NULL)
-            cblas_zhemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo,
-                                      const int m, const int n, const void *alpha, const void *a,
-                                      const int lda, const void *b, const int ldb, const void *beta,
-                                      void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_zhemm");
-        if (cblas_zhemm_p != NULL)
-            cblas_zhemm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_cherk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                const int n, const int k, const float alpha, const void *a,
-                                const int lda, const float beta, void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_cherk_p == NULL)
-            cblas_cherk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                      const int n, const int k, const float alpha, const void *a,
-                                      const int lda, const float beta, void *c,
-                                      const int ldc))GET_FUNC(h_libcblas, "cblas_cherk");
-        if (cblas_cherk_p != NULL)
-            cblas_cherk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
-    }
-}
-
-static void cblas_zherk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                const int n, const int k, const double alpha, const void *a,
-                                const int lda, const double beta, void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_zherk_p == NULL)
-            cblas_zherk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                      const int n, const int k, const double alpha, const void *a,
-                                      const int lda, const double beta, void *c,
-                                      const int ldc))GET_FUNC(h_libcblas, "cblas_zherk");
-        if (cblas_zherk_p != NULL)
-            cblas_zherk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
-    }
-}
-
-static void cblas_ssyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                 const int n, const int k, const float alpha, const float *a,
-                                 const int lda, const float *b, const int ldb, const float beta,
-                                 float *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_ssyr2k_p == NULL)
-            cblas_ssyr2k_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n,
-                          const int k, const float alpha, const float *a, const int lda,
-                          const float *b, const int ldb, const float beta, float *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_ssyr2k");
-        if (cblas_ssyr2k_p != NULL)
-            cblas_ssyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_dsyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                 const int n, const int k, const double alpha, const double *a,
-                                 const int lda, const double *b, const int ldb, const double beta,
-                                 double *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_dsyr2k_p == NULL)
-            cblas_dsyr2k_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n,
-                          const int k, const double alpha, const double *a, const int lda,
-                          const double *b, const int ldb, const double beta, double *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_dsyr2k");
-        if (cblas_dsyr2k_p != NULL)
-            cblas_dsyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_csyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                 const int n, const int k, const void *alpha, const void *a,
-                                 const int lda, const void *b, const int ldb, const void *beta,
-                                 void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_csyr2k_p == NULL)
-            cblas_csyr2k_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n,
-                          const int k, const void *alpha, const void *a, const int lda,
-                          const void *b, const int ldb, const void *beta, void *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_csyr2k");
-        if (cblas_csyr2k_p != NULL)
-            cblas_csyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_zsyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                 const int n, const int k, const void *alpha, const void *a,
-                                 const int lda, const void *b, const int ldb, const void *beta,
-                                 void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_zsyr2k_p == NULL)
-            cblas_zsyr2k_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n,
-                          const int k, const void *alpha, const void *a, const int lda,
-                          const void *b, const int ldb, const void *beta, void *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_zsyr2k");
-        if (cblas_zsyr2k_p != NULL)
-            cblas_zsyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_cher2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                 const int n, const int k, const void *alpha, const void *a,
-                                 const int lda, const void *b, const int ldb, const float beta,
-                                 void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_cher2k_p == NULL)
-            cblas_cher2k_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n,
-                          const int k, const void *alpha, const void *a, const int lda,
-                          const void *b, const int ldb, const float beta, void *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_cher2k");
-        if (cblas_cher2k_p != NULL)
-            cblas_cher2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_zher2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans,
-                                 const int n, const int k, const void *alpha, const void *a,
-                                 const int lda, const void *b, const int ldb, const double beta,
-                                 void *c, const int ldc) {
-    if (cblas_library() != NULL) {
-        if (cblas_zher2k_p == NULL)
-            cblas_zher2k_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n,
-                          const int k, const void *alpha, const void *a, const int lda,
-                          const void *b, const int ldb, const double beta, void *c,
-                          const int ldc))GET_FUNC(h_libcblas, "cblas_zher2k");
-        if (cblas_zher2k_p != NULL)
-            cblas_zher2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    }
-}
-
-static void cblas_strmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const float alpha, const float *a, const int lda, float *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_strmm_p == NULL)
-            cblas_strmm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                      CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m,
-                                      const int n, const float alpha, const float *a, const int lda,
-                                      float *b, const int ldb))GET_FUNC(h_libcblas, "cblas_strmm");
-        if (cblas_strmm_p != NULL)
-            cblas_strmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-static void cblas_dtrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const double alpha, const double *a, const int lda, double *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtrmm_p == NULL)
-            cblas_dtrmm_p = (void (*)(
-                CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-                CBLAS_DIAG diag, const int m, const int n, const double alpha, const double *a,
-                const int lda, double *b, const int ldb))GET_FUNC(h_libcblas, "cblas_dtrmm");
-        if (cblas_dtrmm_p != NULL)
-            cblas_dtrmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-static void cblas_ctrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const void *alpha, const void *a, const int lda, void *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctrmm_p == NULL)
-            cblas_ctrmm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                      CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m,
-                                      const int n, const void *alpha, const void *a, const int lda,
-                                      void *b, const int ldb))GET_FUNC(h_libcblas, "cblas_ctrmm");
-        if (cblas_ctrmm_p != NULL)
-            cblas_ctrmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-static void cblas_ztrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const void *alpha, const void *a, const int lda, void *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztrmm_p == NULL)
-            cblas_ztrmm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                      CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m,
-                                      const int n, const void *alpha, const void *a, const int lda,
-                                      void *b, const int ldb))GET_FUNC(h_libcblas, "cblas_ztrmm");
-        if (cblas_ztrmm_p != NULL)
-            cblas_ztrmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-static void cblas_strsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const float alpha, const float *a, const int lda, float *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_strsm_p == NULL)
-            cblas_strsm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                      CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m,
-                                      const int n, const float alpha, const float *a, const int lda,
-                                      float *b, const int ldb))GET_FUNC(h_libcblas, "cblas_strsm");
-        if (cblas_strsm_p != NULL)
-            cblas_strsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-static void cblas_dtrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const double alpha, const double *a, const int lda, double *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtrsm_p == NULL)
-            cblas_dtrsm_p = (void (*)(
-                CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa,
-                CBLAS_DIAG diag, const int m, const int n, const double alpha, const double *a,
-                const int lda, double *b, const int ldb))GET_FUNC(h_libcblas, "cblas_dtrsm");
-        if (cblas_dtrsm_p != NULL)
-            cblas_dtrsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-static void cblas_ctrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const void *alpha, const void *a, const int lda, void *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctrsm_p == NULL)
-            cblas_ctrsm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                      CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m,
-                                      const int n, const void *alpha, const void *a, const int lda,
-                                      void *b, const int ldb))GET_FUNC(h_libcblas, "cblas_ctrsm");
-        if (cblas_ctrsm_p != NULL)
-            cblas_ctrsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-static void cblas_ztrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n,
-                                const void *alpha, const void *a, const int lda, void *b,
-                                const int ldb) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztrsm_p == NULL)
-            cblas_ztrsm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo,
-                                      CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m,
-                                      const int n, const void *alpha, const void *a, const int lda,
-                                      void *b, const int ldb))GET_FUNC(h_libcblas, "cblas_ztrsm");
-        if (cblas_ztrsm_p != NULL)
-            cblas_ztrsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
-    }
-}
-
-/* Level 2 */
-
-static void (*cblas_sgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             const float alpha, const float *a, const int lda, const float *x,
-                             const int incx, const float beta, float *y, const int incy);
-static void (*cblas_dgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             const double alpha, const double *a, const int lda, const double *x,
-                             const int incx, const double beta, double *y, const int incy);
-static void (*cblas_cgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             const void *alpha, const void *a, const int lda, const void *x,
-                             const int incx, const void *beta, void *y, const int incy);
-static void (*cblas_zgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             const void *alpha, const void *a, const int lda, const void *x,
-                             const int incx, const void *beta, void *y, const int incy);
-static void (*cblas_sgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             int kl, int ku, const float alpha, const float *a, const int lda,
-                             const float *x, const int incx, const float beta, float *y,
-                             const int incy);
-static void (*cblas_dgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             int kl, int ku, const double alpha, const double *a, const int lda,
-                             const double *x, const int incx, const double beta, double *y,
-                             const int incy);
-static void (*cblas_cgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             int kl, int ku, const void *alpha, const void *a, const int lda,
-                             const void *x, const int incx, const void *beta, void *y,
-                             const int incy);
-static void (*cblas_zgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                             int kl, int ku, const void *alpha, const void *a, const int lda,
-                             const void *x, const int incx, const void *beta, void *y,
-                             const int incy);
-static void (*cblas_sger_p)(CBLAS_LAYOUT layout, const int m, const int n, const float alpha,
-                            const float *x, const int incx, const float *y, const int incy,
-                            float *a, const int lda);
-static void (*cblas_dger_p)(CBLAS_LAYOUT layout, const int m, const int n, const double alpha,
-                            const double *x, const int incx, const double *y, const int incy,
-                            double *a, const int lda);
-static void (*cblas_cgerc_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                             const void *x, const int incx, const void *y, const int incy, void *a,
-                             const int lda);
-static void (*cblas_zgerc_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                             const void *x, const int incx, const void *y, const int incy, void *a,
-                             const int lda);
-static void (*cblas_cgeru_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                             const void *x, const int incx, const void *y, const int incy, void *a,
-                             const int lda);
-static void (*cblas_zgeru_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                             const void *x, const int incx, const void *y, const int incy, void *a,
-                             const int lda);
-static void (*cblas_chbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k,
-                             const void *alpha, const void *a, const int lda, const void *x,
-                             const int incx, const void *beta, void *y, const int incy);
-static void (*cblas_zhbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k,
-                             const void *alpha, const void *a, const int lda, const void *x,
-                             const int incx, const void *beta, void *y, const int incy);
-static void (*cblas_chemv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *a, const int lda, const void *x,
-                             const int incx, const void *beta, void *y, const int incy);
-static void (*cblas_zhemv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *a, const int lda, const void *x,
-                             const int incx, const void *beta, void *y, const int incy);
-static void (*cblas_cher_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const float alpha, const void *x, const int incx, void *a,
-                            const int lda);
-static void (*cblas_zher_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const double alpha, const void *x, const int incx, void *a,
-                            const int lda);
-static void (*cblas_cher2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *x, const int incx, const void *y,
-                             const int incy, void *a, const int lda);
-static void (*cblas_zher2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *x, const int incx, const void *y,
-                             const int incy, void *a, const int lda);
-static void (*cblas_chpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *a, const void *x, const int incx,
-                             const void *beta, void *y, const int incy);
-static void (*cblas_zhpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *a, const void *x, const int incx,
-                             const void *beta, void *y, const int incy);
-static void (*cblas_chpr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const float alpha, const void *x, const int incx, void *a);
-static void (*cblas_zhpr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const double alpha, const void *x, const int incx, void *a);
-static void (*cblas_chpr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *x, const int incx, const void *y,
-                             const int incy, void *a);
-static void (*cblas_zhpr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const void *alpha, const void *x, const int incx, const void *y,
-                             const int incy, void *a);
-static void (*cblas_ssbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k,
-                             const float alpha, const float *a, const int lda, const float *x,
-                             const int incx, const float beta, float *y, const int incy);
-static void (*cblas_dsbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k,
-                             const double alpha, const double *a, const int lda, const double *x,
-                             const int incx, const double beta, double *y, const int incy);
-static void (*cblas_ssymv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const float alpha, const float *a, const int lda, const float *x,
-                             const int incx, const float beta, float *y, const int incy);
-static void (*cblas_dsymv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const double alpha, const double *a, const int lda, const double *x,
-                             const int incx, const double beta, double *y, const int incy);
-static void (*cblas_ssyr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const float alpha, const float *x, const int incx, float *a,
-                            const int lda);
-static void (*cblas_dsyr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const double alpha, const double *x, const int incx, double *a,
-                            const int lda);
-static void (*cblas_ssyr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const float alpha, const float *x, const int incx, const float *y,
-                             const int incy, float *a, const int lda);
-static void (*cblas_dsyr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const double alpha, const double *x, const int incx, const double *y,
-                             const int incy, double *a, const int lda);
-static void (*cblas_sspmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const float alpha, const float *a, const float *x, const int incx,
-                             const float beta, float *y, const int incy);
-static void (*cblas_dspmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const double alpha, const double *a, const double *x, const int incx,
-                             const double beta, double *y, const int incy);
-static void (*cblas_sspr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const float alpha, const float *x, const int incx, float *a);
-static void (*cblas_dspr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                            const double alpha, const double *x, const int incx, double *a);
-static void (*cblas_sspr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const float alpha, const float *x, const int incx, const float *y,
-                             const int incy, float *a);
-static void (*cblas_dspr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                             const double alpha, const double *x, const int incx, const double *y,
-                             const int incy, double *a);
-static void (*cblas_stbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const float *a,
-                             const int lda, float *x, const int incx);
-static void (*cblas_dtbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const double *a,
-                             const int lda, double *x, const int incx);
-static void (*cblas_ctbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                             const int lda, void *x, const int incx);
-static void (*cblas_ztbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                             const int lda, void *x, const int incx);
-static void (*cblas_stbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const float *a,
-                             const int lda, float *x, const int incx);
-static void (*cblas_dtbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const double *a,
-                             const int lda, double *x, const int incx);
-static void (*cblas_ctbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                             const int lda, void *x, const int incx);
-static void (*cblas_ztbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                             const int lda, void *x, const int incx);
-static void (*cblas_stpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const float *a, float *x,
-                             const int incx);
-static void (*cblas_dtpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const double *a, double *x,
-                             const int incx);
-static void (*cblas_ctpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                             const int incx);
-static void (*cblas_ztpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                             const int incx);
-static void (*cblas_stpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const float *a, float *x,
-                             const int incx);
-static void (*cblas_dtpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const double *a, double *x,
-                             const int incx);
-static void (*cblas_ctpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                             const int incx);
-static void (*cblas_ztpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                             const int incx);
-static void (*cblas_strmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const float *a, const int lda,
-                             float *x, const int incx);
-static void (*cblas_dtrmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const double *a, const int lda,
-                             double *x, const int incx);
-static void (*cblas_ctrmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                             void *x, const int incx);
-static void (*cblas_ztrmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                             void *x, const int incx);
-static void (*cblas_strsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const float *a, const int lda,
-                             float *x, const int incx);
-static void (*cblas_dtrsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const double *a, const int lda,
-                             double *x, const int incx);
-static void (*cblas_ctrsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                             void *x, const int incx);
-static void (*cblas_ztrsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                             CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                             void *x, const int incx);
-
-static void cblas_sgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, const float alpha, const float *a, const int lda,
-                                const float *x, const int incx, const float beta, float *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_sgemv_p == NULL)
-            cblas_sgemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                      const int n, const float alpha, const float *a, const int lda,
-                                      const float *x, const int incx, const float beta, float *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_sgemv");
-        if (cblas_sgemv_p != NULL)
-            cblas_sgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_dgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, const double alpha, const double *a, const int lda,
-                                const double *x, const int incx, const double beta, double *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_dgemv_p == NULL)
-            cblas_dgemv_p = (void (*)(
-                CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                const double alpha, const double *a, const int lda, const double *x, const int incx,
-                const double beta, double *y, const int incy))GET_FUNC(h_libcblas, "cblas_dgemv");
-        if (cblas_dgemv_p != NULL)
-            cblas_dgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_cgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, const void *alpha, const void *a, const int lda,
-                                const void *x, const int incx, const void *beta, void *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_cgemv_p == NULL)
-            cblas_cgemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                      const int n, const void *alpha, const void *a, const int lda,
-                                      const void *x, const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_cgemv");
-        if (cblas_cgemv_p != NULL)
-            cblas_cgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_zgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, const void *alpha, const void *a, const int lda,
-                                const void *x, const int incx, const void *beta, void *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zgemv_p == NULL)
-            cblas_zgemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                      const int n, const void *alpha, const void *a, const int lda,
-                                      const void *x, const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_zgemv");
-        if (cblas_zgemv_p != NULL)
-            cblas_zgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_sgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, int kl, int ku, const float alpha, const float *a,
-                                const int lda, const float *x, const int incx, const float beta,
-                                float *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_sgbmv_p == NULL)
-            cblas_sgbmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                          int kl, int ku, const float alpha, const float *a, const int lda,
-                          const float *x, const int incx, const float beta, float *y,
-                          const int incy))GET_FUNC(h_libcblas, "cblas_sgbmv");
-        if (cblas_sgbmv_p != NULL)
-            cblas_sgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_dgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, int kl, int ku, const double alpha, const double *a,
-                                const int lda, const double *x, const int incx, const double beta,
-                                double *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_dgbmv_p == NULL)
-            cblas_dgbmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                          int kl, int ku, const double alpha, const double *a, const int lda,
-                          const double *x, const int incx, const double beta, double *y,
-                          const int incy))GET_FUNC(h_libcblas, "cblas_dgbmv");
-        if (cblas_dgbmv_p != NULL)
-            cblas_dgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_cgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, int kl, int ku, const void *alpha, const void *a,
-                                const int lda, const void *x, const int incx, const void *beta,
-                                void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_cgbmv_p == NULL)
-            cblas_cgbmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                          int kl, int ku, const void *alpha, const void *a, const int lda,
-                          const void *x, const int incx, const void *beta, void *y,
-                          const int incy))GET_FUNC(h_libcblas, "cblas_cgbmv");
-        if (cblas_cgbmv_p != NULL)
-            cblas_cgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_zgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m,
-                                const int n, int kl, int ku, const void *alpha, const void *a,
-                                const int lda, const void *x, const int incx, const void *beta,
-                                void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zgbmv_p == NULL)
-            cblas_zgbmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n,
-                          int kl, int ku, const void *alpha, const void *a, const int lda,
-                          const void *x, const int incx, const void *beta, void *y,
-                          const int incy))GET_FUNC(h_libcblas, "cblas_zgbmv");
-        if (cblas_zgbmv_p != NULL)
-            cblas_zgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_sger_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const float alpha,
-                               const float *x, const int incx, const float *y, const int incy,
-                               float *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_sger_p == NULL)
-            cblas_sger_p =
-                (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const float alpha,
-                          const float *x, const int incx, const float *y, const int incy, float *a,
-                          const int lda))GET_FUNC(h_libcblas, "cblas_sger");
-        if (cblas_sger_p != NULL)
-            cblas_sger_p(layout, m, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_dger_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const double alpha,
-                               const double *x, const int incx, const double *y, const int incy,
-                               double *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_dger_p == NULL)
-            cblas_dger_p =
-                (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const double alpha,
-                          const double *x, const int incx, const double *y, const int incy,
-                          double *a, const int lda))GET_FUNC(h_libcblas, "cblas_dger");
-        if (cblas_dger_p != NULL)
-            cblas_dger_p(layout, m, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_cgerc_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                                const void *x, const int incx, const void *y, const int incy,
-                                void *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_cgerc_p == NULL)
-            cblas_cgerc_p =
-                (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                          const void *x, const int incx, const void *y, const int incy, void *a,
-                          const int lda))GET_FUNC(h_libcblas, "cblas_cgerc");
-        if (cblas_cgerc_p != NULL)
-            cblas_cgerc_p(layout, m, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_zgerc_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                                const void *x, const int incx, const void *y, const int incy,
-                                void *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_zgerc_p == NULL)
-            cblas_zgerc_p =
-                (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                          const void *x, const int incx, const void *y, const int incy, void *a,
-                          const int lda))GET_FUNC(h_libcblas, "cblas_zgerc");
-        if (cblas_zgerc_p != NULL)
-            cblas_zgerc_p(layout, m, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_cgeru_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                                const void *x, const int incx, const void *y, const int incy,
-                                void *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_cgeru_p == NULL)
-            cblas_cgeru_p =
-                (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                          const void *x, const int incx, const void *y, const int incy, void *a,
-                          const int lda))GET_FUNC(h_libcblas, "cblas_cgeru");
-        if (cblas_cgeru_p != NULL)
-            cblas_cgeru_p(layout, m, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_zgeru_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                                const void *x, const int incx, const void *y, const int incy,
-                                void *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_zgeru_p == NULL)
-            cblas_zgeru_p =
-                (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha,
-                          const void *x, const int incx, const void *y, const int incy, void *a,
-                          const int lda))GET_FUNC(h_libcblas, "cblas_zgeru");
-        if (cblas_zgeru_p != NULL)
-            cblas_zgeru_p(layout, m, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_chbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const int k, const void *alpha, const void *a, const int lda,
-                                const void *x, const int incx, const void *beta, void *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_chbmv_p == NULL)
-            cblas_chbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const int k, const void *alpha, const void *a, const int lda,
-                                      const void *x, const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_chbmv");
-        if (cblas_chbmv_p != NULL)
-            cblas_chbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_zhbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const int k, const void *alpha, const void *a, const int lda,
-                                const void *x, const int incx, const void *beta, void *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zhbmv_p == NULL)
-            cblas_zhbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const int k, const void *alpha, const void *a, const int lda,
-                                      const void *x, const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_zhbmv");
-        if (cblas_zhbmv_p != NULL)
-            cblas_zhbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_chemv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *a, const int lda, const void *x,
-                                const int incx, const void *beta, void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_chemv_p == NULL)
-            cblas_chemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const void *alpha, const void *a, const int lda,
-                                      const void *x, const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_chemv");
-        if (cblas_chemv_p != NULL)
-            cblas_chemv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_zhemv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *a, const int lda, const void *x,
-                                const int incx, const void *beta, void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zhemv_p == NULL)
-            cblas_zhemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const void *alpha, const void *a, const int lda,
-                                      const void *x, const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_zhemv");
-        if (cblas_zhemv_p != NULL)
-            cblas_zhemv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_cher_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const float alpha, const void *x, const int incx, void *a,
-                               const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_cher_p == NULL)
-            cblas_cher_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const float alpha, const void *x, const int incx, void *a,
-                                     const int lda))GET_FUNC(h_libcblas, "cblas_cher");
-        if (cblas_cher_p != NULL)
-            cblas_cher_p(layout, upper_lower, n, alpha, x, incx, a, lda);
-    }
-}
-
-static void cblas_zher_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const double alpha, const void *x, const int incx, void *a,
-                               const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_zher_p == NULL)
-            cblas_zher_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const double alpha, const void *x, const int incx, void *a,
-                                     const int lda))GET_FUNC(h_libcblas, "cblas_zher");
-        if (cblas_zher_p != NULL)
-            cblas_zher_p(layout, upper_lower, n, alpha, x, incx, a, lda);
-    }
-}
-
-static void cblas_cher2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *x, const int incx, const void *y,
-                                const int incy, void *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_cher2_p == NULL)
-            cblas_cher2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const void *alpha, const void *x, const int incx,
-                                      const void *y, const int incy, void *a,
-                                      const int lda))GET_FUNC(h_libcblas, "cblas_cher2");
-        if (cblas_cher2_p != NULL)
-            cblas_cher2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_zher2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *x, const int incx, const void *y,
-                                const int incy, void *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_zher2_p == NULL)
-            cblas_zher2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const void *alpha, const void *x, const int incx,
-                                      const void *y, const int incy, void *a,
-                                      const int lda))GET_FUNC(h_libcblas, "cblas_zher2");
-        if (cblas_zher2_p != NULL)
-            cblas_zher2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_chpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *a, const void *x, const int incx,
-                                const void *beta, void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_chpmv_p == NULL)
-            cblas_chpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const void *alpha, const void *a, const void *x,
-                                      const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_chpmv");
-        if (cblas_chpmv_p != NULL)
-            cblas_chpmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_zhpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *a, const void *x, const int incx,
-                                const void *beta, void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zhpmv_p == NULL)
-            cblas_zhpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const void *alpha, const void *a, const void *x,
-                                      const int incx, const void *beta, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_zhpmv");
-        if (cblas_zhpmv_p != NULL)
-            cblas_zhpmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_chpr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const float alpha, const void *x, const int incx, void *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_chpr_p == NULL)
-            cblas_chpr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const float alpha, const void *x, const int incx,
-                                     void *a))GET_FUNC(h_libcblas, "cblas_chpr");
-        if (cblas_chpr_p != NULL)
-            cblas_chpr_p(layout, upper_lower, n, alpha, x, incx, a);
-    }
-}
-
-static void cblas_zhpr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const double alpha, const void *x, const int incx, void *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_zhpr_p == NULL)
-            cblas_zhpr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const double alpha, const void *x, const int incx,
-                                     void *a))GET_FUNC(h_libcblas, "cblas_zhpr");
-        if (cblas_zhpr_p != NULL)
-            cblas_zhpr_p(layout, upper_lower, n, alpha, x, incx, a);
-    }
-}
-
-static void cblas_chpr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *x, const int incx, const void *y,
-                                const int incy, void *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_chpr2_p == NULL)
-            cblas_chpr2_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                          const void *alpha, const void *x, const int incx, const void *y,
-                          const int incy, void *a))GET_FUNC(h_libcblas, "cblas_chpr2");
-        if (cblas_chpr2_p != NULL)
-            cblas_chpr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a);
-    }
-}
-
-static void cblas_zhpr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const void *alpha, const void *x, const int incx, const void *y,
-                                const int incy, void *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_zhpr2_p == NULL)
-            cblas_zhpr2_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                          const void *alpha, const void *x, const int incx, const void *y,
-                          const int incy, void *a))GET_FUNC(h_libcblas, "cblas_zhpr2");
-        if (cblas_zhpr2_p != NULL)
-            cblas_zhpr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a);
-    }
-}
-
-static void cblas_ssbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const int k, const float alpha, const float *a, const int lda,
-                                const float *x, const int incx, const float beta, float *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_ssbmv_p == NULL)
-            cblas_ssbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const int k, const float alpha, const float *a, const int lda,
-                                      const float *x, const int incx, const float beta, float *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_ssbmv");
-        if (cblas_ssbmv_p != NULL)
-            cblas_ssbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_dsbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const int k, const double alpha, const double *a, const int lda,
-                                const double *x, const int incx, const double beta, double *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_dsbmv_p == NULL)
-            cblas_dsbmv_p = (void (*)(
-                CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k,
-                const double alpha, const double *a, const int lda, const double *x, const int incx,
-                const double beta, double *y, const int incy))GET_FUNC(h_libcblas, "cblas_dsbmv");
-        if (cblas_dsbmv_p != NULL)
-            cblas_dsbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_ssymv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const float alpha, const float *a, const int lda, const float *x,
-                                const int incx, const float beta, float *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_ssymv_p == NULL)
-            cblas_ssymv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const float alpha, const float *a, const int lda,
-                                      const float *x, const int incx, const float beta, float *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_ssymv");
-        if (cblas_ssymv_p != NULL)
-            cblas_ssymv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_dsymv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const double alpha, const double *a, const int lda, const double *x,
-                                const int incx, const double beta, double *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_dsymv_p == NULL)
-            cblas_dsymv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const double alpha, const double *a, const int lda,
-                                      const double *x, const int incx, const double beta, double *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_dsymv");
-        if (cblas_dsymv_p != NULL)
-            cblas_dsymv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_ssyr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const float alpha, const float *x, const int incx, float *a,
-                               const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_ssyr_p == NULL)
-            cblas_ssyr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const float alpha, const float *x, const int incx, float *a,
-                                     const int lda))GET_FUNC(h_libcblas, "cblas_ssyr");
-        if (cblas_ssyr_p != NULL)
-            cblas_ssyr_p(layout, upper_lower, n, alpha, x, incx, a, lda);
-    }
-}
-
-static void cblas_dsyr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const double alpha, const double *x, const int incx, double *a,
-                               const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_dsyr_p == NULL)
-            cblas_dsyr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const double alpha, const double *x, const int incx, double *a,
-                                     const int lda))GET_FUNC(h_libcblas, "cblas_dsyr");
-        if (cblas_dsyr_p != NULL)
-            cblas_dsyr_p(layout, upper_lower, n, alpha, x, incx, a, lda);
-    }
-}
-
-static void cblas_ssyr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const float alpha, const float *x, const int incx, const float *y,
-                                const int incy, float *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_ssyr2_p == NULL)
-            cblas_ssyr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const float alpha, const float *x, const int incx,
-                                      const float *y, const int incy, float *a,
-                                      const int lda))GET_FUNC(h_libcblas, "cblas_ssyr2");
-        if (cblas_ssyr2_p != NULL)
-            cblas_ssyr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_dsyr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const double alpha, const double *x, const int incx,
-                                const double *y, const int incy, double *a, const int lda) {
-    if (cblas_library() != NULL) {
-        if (cblas_dsyr2_p == NULL)
-            cblas_dsyr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const double alpha, const double *x, const int incx,
-                                      const double *y, const int incy, double *a,
-                                      const int lda))GET_FUNC(h_libcblas, "cblas_dsyr2");
-        if (cblas_dsyr2_p != NULL)
-            cblas_dsyr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda);
-    }
-}
-
-static void cblas_sspmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const float alpha, const float *a, const float *x, const int incx,
-                                const float beta, float *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_sspmv_p == NULL)
-            cblas_sspmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const float alpha, const float *a, const float *x,
-                                      const int incx, const float beta, float *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_sspmv");
-        if (cblas_sspmv_p != NULL)
-            cblas_sspmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_dspmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const double alpha, const double *a, const double *x,
-                                const int incx, const double beta, double *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_dspmv_p == NULL)
-            cblas_dspmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                      const double alpha, const double *a, const double *x,
-                                      const int incx, const double beta, double *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_dspmv");
-        if (cblas_dspmv_p != NULL)
-            cblas_dspmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy);
-    }
-}
-
-static void cblas_sspr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const float alpha, const float *x, const int incx, float *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_sspr_p == NULL)
-            cblas_sspr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const float alpha, const float *x, const int incx,
-                                     float *a))GET_FUNC(h_libcblas, "cblas_sspr");
-        if (cblas_sspr_p != NULL)
-            cblas_sspr_p(layout, upper_lower, n, alpha, x, incx, a);
-    }
-}
-
-static void cblas_dspr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                               const double alpha, const double *x, const int incx, double *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_dspr_p == NULL)
-            cblas_dspr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                     const double alpha, const double *x, const int incx,
-                                     double *a))GET_FUNC(h_libcblas, "cblas_dspr");
-        if (cblas_dspr_p != NULL)
-            cblas_dspr_p(layout, upper_lower, n, alpha, x, incx, a);
-    }
-}
-
-static void cblas_sspr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const float alpha, const float *x, const int incx, const float *y,
-                                const int incy, float *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_sspr2_p == NULL)
-            cblas_sspr2_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                          const float alpha, const float *x, const int incx, const float *y,
-                          const int incy, float *a))GET_FUNC(h_libcblas, "cblas_sspr2");
-        if (cblas_sspr2_p != NULL)
-            cblas_sspr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a);
-    }
-}
-
-static void cblas_dspr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                                const double alpha, const double *x, const int incx,
-                                const double *y, const int incy, double *a) {
-    if (cblas_library() != NULL) {
-        if (cblas_dspr2_p == NULL)
-            cblas_dspr2_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n,
-                          const double alpha, const double *x, const int incx, const double *y,
-                          const int incy, double *a))GET_FUNC(h_libcblas, "cblas_dspr2");
-        if (cblas_dspr2_p != NULL)
-            cblas_dspr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a);
-    }
-}
-
-static void cblas_stbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const float *a,
-                                const int lda, float *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_stbmv_p == NULL)
-            cblas_stbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const float *a, const int lda, float *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_stbmv");
-        if (cblas_stbmv_p != NULL)
-            cblas_stbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_dtbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const double *a,
-                                const int lda, double *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtbmv_p == NULL)
-            cblas_dtbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const double *a, const int lda, double *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_dtbmv");
-        if (cblas_dtbmv_p != NULL)
-            cblas_dtbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_ctbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                                const int lda, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctbmv_p == NULL)
-            cblas_ctbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const void *a, const int lda, void *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_ctbmv");
-        if (cblas_ctbmv_p != NULL)
-            cblas_ctbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_ztbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                                const int lda, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztbmv_p == NULL)
-            cblas_ztbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const void *a, const int lda, void *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_ztbmv");
-        if (cblas_ztbmv_p != NULL)
-            cblas_ztbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_stbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const float *a,
-                                const int lda, float *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_stbsv_p == NULL)
-            cblas_stbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const float *a, const int lda, float *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_stbsv");
-        if (cblas_stbsv_p != NULL)
-            cblas_stbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_dtbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const double *a,
-                                const int lda, double *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtbsv_p == NULL)
-            cblas_dtbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const double *a, const int lda, double *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_dtbsv");
-        if (cblas_dtbsv_p != NULL)
-            cblas_dtbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_ctbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                                const int lda, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctbsv_p == NULL)
-            cblas_ctbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const void *a, const int lda, void *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_ctbsv");
-        if (cblas_ctbsv_p != NULL)
-            cblas_ctbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_ztbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const int k, const void *a,
-                                const int lda, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztbsv_p == NULL)
-            cblas_ztbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower,
-                                      CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n,
-                                      const int k, const void *a, const int lda, void *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_ztbsv");
-        if (cblas_ztbsv_p != NULL)
-            cblas_ztbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx);
-    }
-}
-
-static void cblas_stpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const float *a, float *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_stpmv_p == NULL)
-            cblas_stpmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const float *a, float *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_stpmv");
-        if (cblas_stpmv_p != NULL)
-            cblas_stpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_dtpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const double *a, double *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtpmv_p == NULL)
-            cblas_dtpmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const double *a, double *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_dtpmv");
-        if (cblas_dtpmv_p != NULL)
-            cblas_dtpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_ctpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctpmv_p == NULL)
-            cblas_ctpmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ctpmv");
-        if (cblas_ctpmv_p != NULL)
-            cblas_ctpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_ztpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztpmv_p == NULL)
-            cblas_ztpmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ztpmv");
-        if (cblas_ztpmv_p != NULL)
-            cblas_ztpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_stpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const float *a, float *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_stpsv_p == NULL)
-            cblas_stpsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const float *a, float *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_stpsv");
-        if (cblas_stpsv_p != NULL)
-            cblas_stpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_dtpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const double *a, double *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtpsv_p == NULL)
-            cblas_dtpsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const double *a, double *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_dtpsv");
-        if (cblas_dtpsv_p != NULL)
-            cblas_dtpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_ctpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctpsv_p == NULL)
-            cblas_ctpsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ctpsv");
-        if (cblas_ctpsv_p != NULL)
-            cblas_ctpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_ztpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                                const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztpsv_p == NULL)
-            cblas_ztpsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ztpsv");
-        if (cblas_ztpsv_p != NULL)
-            cblas_ztpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx);
-    }
-}
-
-static void cblas_strmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const float *a, const int lda,
-                                float *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_strmv_p == NULL)
-            cblas_strmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const float *a, const int lda,
-                          float *x, const int incx))GET_FUNC(h_libcblas, "cblas_strmv");
-        if (cblas_strmv_p != NULL)
-            cblas_strmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-static void cblas_dtrmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const double *a, const int lda,
-                                double *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtrmv_p == NULL)
-            cblas_dtrmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const double *a, const int lda,
-                          double *x, const int incx))GET_FUNC(h_libcblas, "cblas_dtrmv");
-        if (cblas_dtrmv_p != NULL)
-            cblas_dtrmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-static void cblas_ctrmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                                void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctrmv_p == NULL)
-            cblas_ctrmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ctrmv");
-        if (cblas_ctrmv_p != NULL)
-            cblas_ctrmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-static void cblas_ztrmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                                void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztrmv_p == NULL)
-            cblas_ztrmv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ztrmv");
-        if (cblas_ztrmv_p != NULL)
-            cblas_ztrmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-static void cblas_strsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const float *a, const int lda,
-                                float *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_strsv_p == NULL)
-            cblas_strsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const float *a, const int lda,
-                          float *x, const int incx))GET_FUNC(h_libcblas, "cblas_strsv");
-        if (cblas_strsv_p != NULL)
-            cblas_strsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-static void cblas_dtrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const double *a, const int lda,
-                                double *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_dtrsv_p == NULL)
-            cblas_dtrsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const double *a, const int lda,
-                          double *x, const int incx))GET_FUNC(h_libcblas, "cblas_dtrsv");
-        if (cblas_dtrsv_p != NULL)
-            cblas_dtrsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-static void cblas_ctrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                                void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ctrsv_p == NULL)
-            cblas_ctrsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ctrsv");
-        if (cblas_ctrsv_p != NULL)
-            cblas_ctrsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-static void cblas_ztrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                                CBLAS_DIAG unit_diag, const int n, const void *a, const int lda,
-                                void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_ztrsv_p == NULL)
-            cblas_ztrsv_p =
-                (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans,
-                          CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x,
-                          const int incx))GET_FUNC(h_libcblas, "cblas_ztrsv");
-        if (cblas_ztrsv_p != NULL)
-            cblas_ztrsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx);
-    }
-}
-
-/* Level 1 */
-
-static float (*cblas_sasum_p)(const int n, const float *x, const int incx);
-static double (*cblas_dasum_p)(const int n, const double *x, const int incx);
-static float (*cblas_scasum_p)(const int n, const void *x, const int incx);
-static double (*cblas_dzasum_p)(const int n, const void *x, const int incx);
-static void (*cblas_saxpy_p)(const int n, const float alpha, const float *x, const int incx,
-                             float *y, const int incy);
-static void (*cblas_daxpy_p)(const int n, const double alpha, const double *x, const int incx,
-                             double *y, const int incy);
-static void (*cblas_caxpy_p)(const int n, const void *alpha, const void *x, const int incx, void *y,
-                             const int incy);
-static void (*cblas_zaxpy_p)(const int n, const void *alpha, const void *x, const int incx, void *y,
-                             const int incy);
-static void (*cblas_scopy_p)(const int n, const float *x, const int incx, float *y, const int incy);
-static void (*cblas_dcopy_p)(const int n, const double *x, const int incx, double *y,
-                             const int incy);
-static void (*cblas_ccopy_p)(const int n, const void *x, const int incx, void *y, const int incy);
-static void (*cblas_zcopy_p)(const int n, const void *x, const int incx, void *y, const int incy);
-static float (*cblas_sdot_p)(const int n, const float *x, const int incx, const float *y,
-                             const int incy);
-static double (*cblas_ddot_p)(const int n, const double *x, const int incx, const double *y,
-                              const int incy);
-static double (*cblas_dsdot_p)(const int n, const float *x, const int incx, const float *y,
-                               const int incy);
-static float (*cblas_sdsdot_p)(const int n, const float sb, const float *x, const int incx,
-                               const float *y, const int incy);
-static float (*cblas_snrm2_p)(const int n, const float *x, const int incx);
-static double (*cblas_dnrm2_p)(const int n, const double *x, const int incx);
-static float (*cblas_scnrm2_p)(const int n, const void *x, const int incx);
-static double (*cblas_dznrm2_p)(const int n, const void *x, const int incx);
-static void (*cblas_srot_p)(const int n, float *x, const int incx, float *y, const int incy,
-                            const float c, const float s);
-static void (*cblas_drot_p)(const int n, double *x, const int incx, double *y, const int incy,
-                            const double c, const double s);
-static void (*csrot_p)(const int *n, void *x, const int *incx, void *y, const int *incy,
-                       const float *c, const float *s);
-static void (*zdrot_p)(const int *n, void *x, const int *incx, void *y, const int *incy,
-                       const double *c, const double *s);
-static void (*cblas_srotg_p)(float *a, float *b, float *c, float *s);
-static void (*cblas_drotg_p)(double *a, double *b, double *c, double *s);
-static void (*crotg_p)(void *a, void *b, float *c, void *s);
-static void (*zrotg_p)(void *a, void *b, double *c, void *s);
-static void (*cblas_srotm_p)(const int n, float *x, const int incx, float *y, const int incy,
-                             const float *param);
-static void (*cblas_drotm_p)(const int n, double *x, const int incx, double *y, const int incy,
-                             const double *param);
-static void (*cblas_srotmg_p)(float *d1, float *d2, float *x1, float y1, float *param);
-static void (*cblas_drotmg_p)(double *d1, double *d2, double *x1, double y1, double *param);
-static void (*cblas_sscal_p)(const int n, const float alpha, float *x, const int incx);
-static void (*cblas_dscal_p)(const int n, const double alpha, double *x, const int incx);
-static void (*cblas_cscal_p)(const int n, const void *alpha, void *x, const int incx);
-static void (*cblas_zscal_p)(const int n, const void *alpha, void *x, const int incx);
-static void (*cblas_csscal_p)(const int n, const float alpha, void *x, const int incx);
-static void (*cblas_zdscal_p)(const int n, const double alpha, void *x, const int incx);
-static void (*cblas_sswap_p)(const int n, float *x, const int incx, float *y, const int incy);
-static void (*cblas_dswap_p)(const int n, double *x, const int incx, double *y, const int incy);
-static void (*cblas_cswap_p)(const int n, void *x, const int incx, void *y, const int incy);
-static void (*cblas_zswap_p)(const int n, void *x, const int incx, void *y, const int incy);
-static void (*cblas_cdotc_sub_p)(const int n, const void *x, const int incx, const void *y,
-                                 const int incy, void *pres);
-static void (*cblas_zdotc_sub_p)(const int n, const void *x, const int incx, const void *y,
-                                 const int incy, void *pres);
-static void (*cblas_cdotu_sub_p)(const int n, const void *x, const int incx, const void *y,
-                                 const int incy, void *pres);
-static void (*cblas_zdotu_sub_p)(const int n, const void *x, const int incx, const void *y,
-                                 const int incy, void *pres);
-static int (*cblas_isamax_p)(const int n, const float *x, const int incx);
-static int (*cblas_idamax_p)(const int n, const double *x, const int incx);
-static int (*cblas_icamax_p)(const int n, const void *x, const int incx);
-static int (*cblas_izamax_p)(const int n, const void *x, const int incx);
-
-static float cblas_sasum_wrapper(const int n, const float *x, const int incx) {
-    float sasum_res = 0.0f;
-    if (cblas_library() != NULL) {
-        if (cblas_sasum_p == NULL)
-            cblas_sasum_p = (float (*)(const int n, const float *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_sasum");
-        if (cblas_sasum_p != NULL)
-            sasum_res = cblas_sasum_p(n, x, incx);
-    }
-    return sasum_res;
-}
-
-static double cblas_dasum_wrapper(const int n, const double *x, const int incx) {
-    double dasum_res = 0.0;
-    if (cblas_library() != NULL) {
-        if (cblas_dasum_p == NULL)
-            cblas_dasum_p = (double (*)(const int n, const double *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_dasum");
-        if (cblas_dasum_p != NULL)
-            dasum_res = cblas_dasum_p(n, x, incx);
-    }
-    return dasum_res;
-}
-
-static float cblas_scasum_wrapper(const int n, const void *x, const int incx) {
-    float scasum_res = 0.0f;
-    if (cblas_library() != NULL) {
-        if (cblas_scasum_p == NULL)
-            cblas_scasum_p = (float (*)(const int n, const void *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_scasum");
-        if (cblas_scasum_p != NULL)
-            scasum_res = cblas_scasum_p(n, x, incx);
-    }
-    return scasum_res;
-}
-
-static double cblas_dzasum_wrapper(const int n, const void *x, const int incx) {
-    double dzasum_res = 0.0;
-    if (cblas_library() != NULL) {
-        if (cblas_dzasum_p == NULL)
-            cblas_dzasum_p = (double (*)(const int n, const void *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_dzasum");
-        if (cblas_dzasum_p != NULL)
-            dzasum_res = cblas_dzasum_p(n, x, incx);
-    }
-    return dzasum_res;
-}
-
-static void cblas_saxpy_wrapper(const int n, const float alpha, const float *x, const int incx,
-                                float *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_saxpy_p == NULL)
-            cblas_saxpy_p =
-                (void (*)(const int n, const float alpha, const float *x, const int incx, float *y,
-                          const int incy))GET_FUNC(h_libcblas, "cblas_saxpy");
-        if (cblas_saxpy_p != NULL)
-            cblas_saxpy_p(n, alpha, x, incx, y, incy);
-    }
-}
-
-static void cblas_daxpy_wrapper(const int n, const double alpha, const double *x, const int incx,
-                                double *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_daxpy_p == NULL)
-            cblas_daxpy_p =
-                (void (*)(const int n, const double alpha, const double *x, const int incx,
-                          double *y, const int incy))GET_FUNC(h_libcblas, "cblas_daxpy");
-        if (cblas_daxpy_p != NULL)
-            cblas_daxpy_p(n, alpha, x, incx, y, incy);
-    }
-}
-
-static void cblas_caxpy_wrapper(const int n, const void *alpha, const void *x, const int incx,
-                                void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_caxpy_p == NULL)
-            cblas_caxpy_p = (void (*)(const int n, const void *alpha, const void *x, const int incx,
-                                      void *y, const int incy))GET_FUNC(h_libcblas, "cblas_caxpy");
-        if (cblas_caxpy_p != NULL)
-            cblas_caxpy_p(n, alpha, x, incx, y, incy);
-    }
-}
-
-static void cblas_zaxpy_wrapper(const int n, const void *alpha, const void *x, const int incx,
-                                void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zaxpy_p == NULL)
-            cblas_zaxpy_p = (void (*)(const int n, const void *alpha, const void *x, const int incx,
-                                      void *y, const int incy))GET_FUNC(h_libcblas, "cblas_zaxpy");
-        if (cblas_zaxpy_p != NULL)
-            cblas_zaxpy_p(n, alpha, x, incx, y, incy);
-    }
-}
-
-static void cblas_scopy_wrapper(const int n, const float *x, const int incx, float *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_scopy_p == NULL)
-            cblas_scopy_p = (void (*)(const int n, const float *x, const int incx, float *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_scopy");
-        if (cblas_scopy_p != NULL)
-            cblas_scopy_p(n, x, incx, y, incy);
-    }
-}
-
-static void cblas_dcopy_wrapper(const int n, const double *x, const int incx, double *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_dcopy_p == NULL)
-            cblas_dcopy_p = (void (*)(const int n, const double *x, const int incx, double *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_dcopy");
-        if (cblas_dcopy_p != NULL)
-            cblas_dcopy_p(n, x, incx, y, incy);
-    }
-}
-
-static void cblas_ccopy_wrapper(const int n, const void *x, const int incx, void *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_ccopy_p == NULL)
-            cblas_ccopy_p = (void (*)(const int n, const void *x, const int incx, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_ccopy");
-        if (cblas_ccopy_p != NULL)
-            cblas_ccopy_p(n, x, incx, y, incy);
-    }
-}
-
-static void cblas_zcopy_wrapper(const int n, const void *x, const int incx, void *y,
-                                const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zcopy_p == NULL)
-            cblas_zcopy_p = (void (*)(const int n, const void *x, const int incx, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_zcopy");
-        if (cblas_zcopy_p != NULL)
-            cblas_zcopy_p(n, x, incx, y, incy);
-    }
-}
-
-static float cblas_sdot_wrapper(const int n, const float *x, const int incx, const float *y,
-                                const int incy) {
-    float sdot_res = 0.0f;
-    if (cblas_library() != NULL) {
-        if (cblas_sdot_p == NULL)
-            cblas_sdot_p = (float (*)(const int n, const float *x, const int incx, const float *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_sdot");
-        if (cblas_sdot_p != NULL)
-            sdot_res = cblas_sdot_p(n, x, incx, y, incy);
-    }
-    return sdot_res;
-}
-
-static double cblas_ddot_wrapper(const int n, const double *x, const int incx, const double *y,
-                                 const int incy) {
-    double ddot_res = 0.0;
-    if (cblas_library() != NULL) {
-        if (cblas_ddot_p == NULL)
-            cblas_ddot_p =
-                (double (*)(const int n, const double *x, const int incx, const double *y,
-                            const int incy))GET_FUNC(h_libcblas, "cblas_ddot");
-        if (cblas_ddot_p != NULL)
-            ddot_res = cblas_ddot_p(n, x, incx, y, incy);
-    }
-    return ddot_res;
-}
-
-static double cblas_dsdot_wrapper(const int n, const float *x, const int incx, const float *y,
-                                  const int incy) {
-    double dsdot_res = 0.0;
-    if (cblas_library() != NULL) {
-        if (cblas_dsdot_p == NULL)
-            cblas_dsdot_p = (double (*)(const int n, const float *x, const int incx, const float *y,
-                                        const int incy))GET_FUNC(h_libcblas, "cblas_dsdot");
-        if (cblas_dsdot_p != NULL)
-            dsdot_res = cblas_dsdot_p(n, x, incx, y, incy);
-    }
-    return dsdot_res;
-}
-
-static float cblas_sdsdot_wrapper(const int n, const float sb, const float *x, const int incx,
-                                  const float *y, const int incy) {
-    float sdsdot_res = 0.0f;
-    if (cblas_library() != NULL) {
-        if (cblas_sdsdot_p == NULL)
-            cblas_sdsdot_p =
-                (float (*)(const int n, const float sb, const float *x, const int incx,
-                           const float *y, const int incy))GET_FUNC(h_libcblas, "cblas_sdsdot");
-        if (cblas_sdsdot_p != NULL)
-            sdsdot_res = cblas_sdsdot_p(n, sb, x, incx, y, incy);
-    }
-    return sdsdot_res;
-}
-
-static float cblas_snrm2_wrapper(const int n, const float *x, const int incx) {
-    float snrm2_res = 0.0f;
-    if (cblas_library() != NULL) {
-        if (cblas_snrm2_p == NULL)
-            cblas_snrm2_p = (float (*)(const int n, const float *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_snrm2");
-        if (cblas_snrm2_p != NULL)
-            snrm2_res = cblas_snrm2_p(n, x, incx);
-    }
-    return snrm2_res;
-}
-
-static double cblas_dnrm2_wrapper(const int n, const double *x, const int incx) {
-    double dnrm2_res = 0.0;
-    if (cblas_library() != NULL) {
-        if (cblas_dnrm2_p == NULL)
-            cblas_dnrm2_p = (double (*)(const int n, const double *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_dnrm2");
-        if (cblas_dnrm2_p != NULL)
-            dnrm2_res = cblas_dnrm2_p(n, x, incx);
-    }
-    return dnrm2_res;
-}
-
-static float cblas_scnrm2_wrapper(const int n, const void *x, const int incx) {
-    float scnrm2_res = 0.0f;
-    if (cblas_library() != NULL) {
-        if (cblas_scnrm2_p == NULL)
-            cblas_scnrm2_p = (float (*)(const int n, const void *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_scnrm2");
-        if (cblas_scnrm2_p != NULL)
-            scnrm2_res = cblas_scnrm2_p(n, x, incx);
-    }
-    return scnrm2_res;
-}
-
-static double cblas_dznrm2_wrapper(const int n, const void *x, const int incx) {
-    double dznrm2_res = 0.0;
-    if (cblas_library() != NULL) {
-        if (cblas_dznrm2_p == NULL)
-            cblas_dznrm2_p = (double (*)(const int n, const void *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_dznrm2");
-        if (cblas_dznrm2_p != NULL)
-            dznrm2_res = cblas_dznrm2_p(n, x, incx);
-    }
-    return dznrm2_res;
-}
-
-static void cblas_srot_wrapper(const int n, float *x, const int incx, float *y, const int incy,
-                               const float c, const float s) {
-    if (cblas_library() != NULL) {
-        if (cblas_srot_p == NULL)
-            cblas_srot_p =
-                (void (*)(const int n, float *x, const int incx, float *y, const int incy,
-                          const float c, const float s))GET_FUNC(h_libcblas, "cblas_srot");
-        if (cblas_srot_p != NULL)
-            cblas_srot_p(n, x, incx, y, incy, c, s);
-    }
-}
-
-static void cblas_drot_wrapper(const int n, double *x, const int incx, double *y, const int incy,
-                               const double c, const double s) {
-    if (cblas_library() != NULL) {
-        if (cblas_drot_p == NULL)
-            cblas_drot_p =
-                (void (*)(const int n, double *x, const int incx, double *y, const int incy,
-                          const double c, const double s))GET_FUNC(h_libcblas, "cblas_drot");
-        if (cblas_drot_p != NULL)
-            cblas_drot_p(n, x, incx, y, incy, c, s);
-    }
-}
-
-static void csrot_wrapper(const int *n, void *x, const int *incx, void *y, const int *incy,
-                          const float *c, const float *s) {
-    if (blas_library() != NULL) {
-        if (csrot_p == NULL)
-            csrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy,
-                                const float *c, const float *s))GET_FUNC(h_libblas, "csrot_");
-        if (csrot_p == NULL)
-            csrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy,
-                                const float *c, const float *s))GET_FUNC(h_libblas, "CSROT");
-        if (csrot_p != NULL)
-            csrot_p(n, x, incx, y, incy, c, s);
-    }
-}
-
-static void zdrot_wrapper(const int *n, void *x, const int *incx, void *y, const int *incy,
-                          const double *c, const double *s) {
-    if (blas_library() != NULL) {
-        if (zdrot_p == NULL)
-            zdrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy,
-                                const double *c, const double *s))GET_FUNC(h_libblas, "zdrot_");
-        if (zdrot_p == NULL)
-            zdrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy,
-                                const double *c, const double *s))GET_FUNC(h_libblas, "ZDROT");
-        if (zdrot_p != NULL)
-            zdrot_p(n, x, incx, y, incy, c, s);
-    }
-}
-
-static void cblas_srotg_wrapper(float *a, float *b, float *c, float *s) {
-    if (cblas_library() != NULL) {
-        if (cblas_srotg_p == NULL)
-            cblas_srotg_p = (void (*)(float *a, float *b, float *c, float *s))GET_FUNC(
-                h_libcblas, "cblas_srotg");
-        if (cblas_srotg_p != NULL)
-            cblas_srotg_p(a, b, c, s);
-    }
-}
-
-static void cblas_drotg_wrapper(double *a, double *b, double *c, double *s) {
-    if (cblas_library() != NULL) {
-        if (cblas_drotg_p == NULL)
-            cblas_drotg_p = (void (*)(double *a, double *b, double *c, double *s))GET_FUNC(
-                h_libcblas, "cblas_drotg");
-        if (cblas_drotg_p != NULL)
-            cblas_drotg_p(a, b, c, s);
-    }
-}
-
-static void crotg_wrapper(void *a, void *b, float *c, void *s) {
-    if (blas_library() != NULL) {
-        if (crotg_p == NULL)
-            crotg_p = (void (*)(void *a, void *b, float *c, void *s))GET_FUNC(h_libblas, "crotg_");
-        if (crotg_p == NULL)
-            crotg_p = (void (*)(void *a, void *b, float *c, void *s))GET_FUNC(h_libblas, "CROTG");
-        if (crotg_p != NULL)
-            crotg_p(a, b, c, s);
-    }
-}
-
-static void zrotg_wrapper(void *a, void *b, double *c, void *s) {
-    if (blas_library() != NULL) {
-        if (zrotg_p == NULL)
-            zrotg_p = (void (*)(void *a, void *b, double *c, void *s))GET_FUNC(h_libblas, "zrotg_");
-        if (zrotg_p == NULL)
-            zrotg_p = (void (*)(void *a, void *b, double *c, void *s))GET_FUNC(h_libblas, "ZROTG");
-        if (zrotg_p != NULL)
-            zrotg_p(a, b, c, s);
-    }
-}
-
-static void cblas_srotm_wrapper(const int n, float *x, const int incx, float *y, const int incy,
-                                const float *param) {
-    if (cblas_library() != NULL) {
-        if (cblas_srotm_p == NULL)
-            cblas_srotm_p =
-                (void (*)(const int n, float *x, const int incx, float *y, const int incy,
-                          const float *param))GET_FUNC(h_libcblas, "cblas_srotm");
-        if (cblas_srotm_p != NULL)
-            cblas_srotm_p(n, x, incx, y, incy, param);
-    }
-}
-
-static void cblas_drotm_wrapper(const int n, double *x, const int incx, double *y, const int incy,
-                                const double *param) {
-    if (cblas_library() != NULL) {
-        if (cblas_drotm_p == NULL)
-            cblas_drotm_p =
-                (void (*)(const int n, double *x, const int incx, double *y, const int incy,
-                          const double *param))GET_FUNC(h_libcblas, "cblas_drotm");
-        if (cblas_drotm_p != NULL)
-            cblas_drotm_p(n, x, incx, y, incy, param);
-    }
-}
-
-static void cblas_srotmg_wrapper(float *d1, float *d2, float *x1, float y1, float *param) {
-    if (cblas_library() != NULL) {
-        if (cblas_srotmg_p == NULL)
-            cblas_srotmg_p = (void (*)(float *d1, float *d2, float *x1, float y1,
-                                       float *param))GET_FUNC(h_libcblas, "cblas_srotmg");
-        if (cblas_srotmg_p != NULL)
-            cblas_srotmg_p(d1, d2, x1, y1, param);
-    }
-}
-
-static void cblas_drotmg_wrapper(double *d1, double *d2, double *x1, double y1, double *param) {
-    if (cblas_library() != NULL) {
-        if (cblas_drotmg_p == NULL)
-            cblas_drotmg_p = (void (*)(double *d1, double *d2, double *x1, double y1,
-                                       double *param))GET_FUNC(h_libcblas, "cblas_drotmg");
-        if (cblas_drotmg_p != NULL)
-            cblas_drotmg_p(d1, d2, x1, y1, param);
-    }
-}
-
-static void cblas_sscal_wrapper(const int n, const float alpha, float *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_sscal_p == NULL)
-            cblas_sscal_p = (void (*)(const int n, const float alpha, float *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_sscal");
-        if (cblas_sscal_p != NULL)
-            cblas_sscal_p(n, alpha, x, incx);
-    }
-}
-
-static void cblas_dscal_wrapper(const int n, const double alpha, double *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_dscal_p == NULL)
-            cblas_dscal_p = (void (*)(const int n, const double alpha, double *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_dscal");
-        if (cblas_dscal_p != NULL)
-            cblas_dscal_p(n, alpha, x, incx);
-    }
-}
-
-static void cblas_cscal_wrapper(const int n, const void *alpha, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_cscal_p == NULL)
-            cblas_cscal_p = (void (*)(const int n, const void *alpha, void *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_cscal");
-        if (cblas_cscal_p != NULL)
-            cblas_cscal_p(n, alpha, x, incx);
-    }
-}
-
-static void cblas_zscal_wrapper(const int n, const void *alpha, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_zscal_p == NULL)
-            cblas_zscal_p = (void (*)(const int n, const void *alpha, void *x,
-                                      const int incx))GET_FUNC(h_libcblas, "cblas_zscal");
-        if (cblas_zscal_p != NULL)
-            cblas_zscal_p(n, alpha, x, incx);
-    }
-}
-
-static void cblas_csscal_wrapper(const int n, const float alpha, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_csscal_p == NULL)
-            cblas_csscal_p = (void (*)(const int n, const float alpha, void *x,
-                                       const int incx))GET_FUNC(h_libcblas, "cblas_csscal");
-        if (cblas_csscal_p != NULL)
-            cblas_csscal_p(n, alpha, x, incx);
-    }
-}
-
-static void cblas_zdscal_wrapper(const int n, const double alpha, void *x, const int incx) {
-    if (cblas_library() != NULL) {
-        if (cblas_zdscal_p == NULL)
-            cblas_zdscal_p = (void (*)(const int n, const double alpha, void *x,
-                                       const int incx))GET_FUNC(h_libcblas, "cblas_zdscal");
-        if (cblas_zdscal_p != NULL)
-            cblas_zdscal_p(n, alpha, x, incx);
-    }
-}
-
-static void cblas_sswap_wrapper(const int n, float *x, const int incx, float *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_sswap_p == NULL)
-            cblas_sswap_p = (void (*)(const int n, float *x, const int incx, float *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_sswap");
-        if (cblas_sswap_p != NULL)
-            cblas_sswap_p(n, x, incx, y, incy);
-    }
-}
-
-static void cblas_dswap_wrapper(const int n, double *x, const int incx, double *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_dswap_p == NULL)
-            cblas_dswap_p = (void (*)(const int n, double *x, const int incx, double *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_dswap");
-        if (cblas_dswap_p != NULL)
-            cblas_dswap_p(n, x, incx, y, incy);
-    }
-}
-
-static void cblas_cswap_wrapper(const int n, void *x, const int incx, void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_cswap_p == NULL)
-            cblas_cswap_p = (void (*)(const int n, void *x, const int incx, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_cswap");
-        if (cblas_cswap_p != NULL)
-            cblas_cswap_p(n, x, incx, y, incy);
-    }
-}
-
-static void cblas_zswap_wrapper(const int n, void *x, const int incx, void *y, const int incy) {
-    if (cblas_library() != NULL) {
-        if (cblas_zswap_p == NULL)
-            cblas_zswap_p = (void (*)(const int n, void *x, const int incx, void *y,
-                                      const int incy))GET_FUNC(h_libcblas, "cblas_zswap");
-        if (cblas_zswap_p != NULL)
-            cblas_zswap_p(n, x, incx, y, incy);
-    }
-}
-
-static void cblas_cdotc_sub_wrapper(const int n, const void *x, const int incx, const void *y,
-                                    const int incy, void *pres) {
-    if (cblas_library() != NULL) {
-        if (cblas_cdotc_sub_p == NULL)
-            cblas_cdotc_sub_p =
-                (void (*)(const int n, const void *x, const int incx, const void *y, const int incy,
-                          void *pres))GET_FUNC(h_libcblas, "cblas_cdotc_sub");
-        if (cblas_cdotc_sub_p != NULL)
-            cblas_cdotc_sub_p(n, x, incx, y, incy, pres);
-    }
-}
-
-static void cblas_zdotc_sub_wrapper(const int n, const void *x, const int incx, const void *y,
-                                    const int incy, void *pres) {
-    if (cblas_library() != NULL) {
-        if (cblas_zdotc_sub_p == NULL)
-            cblas_zdotc_sub_p =
-                (void (*)(const int n, const void *x, const int incx, const void *y, const int incy,
-                          void *pres))GET_FUNC(h_libcblas, "cblas_zdotc_sub");
-        if (cblas_zdotc_sub_p != NULL)
-            cblas_zdotc_sub_p(n, x, incx, y, incy, pres);
-    }
-}
-
-static void cblas_cdotu_sub_wrapper(const int n, const void *x, const int incx, const void *y,
-                                    const int incy, void *pres) {
-    if (cblas_library() != NULL) {
-        if (cblas_cdotu_sub_p == NULL)
-            cblas_cdotu_sub_p =
-                (void (*)(const int n, const void *x, const int incx, const void *y, const int incy,
-                          void *pres))GET_FUNC(h_libcblas, "cblas_cdotu_sub");
-        if (cblas_cdotu_sub_p != NULL)
-            cblas_cdotu_sub_p(n, x, incx, y, incy, pres);
-    }
-}
-
-static void cblas_zdotu_sub_wrapper(const int n, const void *x, const int incx, const void *y,
-                                    const int incy, void *pres) {
-    if (cblas_library() != NULL) {
-        if (cblas_zdotu_sub_p == NULL)
-            cblas_zdotu_sub_p =
-                (void (*)(const int n, const void *x, const int incx, const void *y, const int incy,
-                          void *pres))GET_FUNC(h_libcblas, "cblas_zdotu_sub");
-        if (cblas_zdotu_sub_p != NULL)
-            cblas_zdotu_sub_p(n, x, incx, y, incy, pres);
-    }
-}
-
-static int cblas_isamax_wrapper(const int n, const float *x, const int incx) {
-    int isamax_res = 0;
-    if (cblas_library() != NULL) {
-        if (cblas_isamax_p == NULL)
-            cblas_isamax_p = (int (*)(const int n, const float *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_isamax");
-        if (cblas_isamax_p != NULL)
-            isamax_res = cblas_isamax_p(n, x, incx);
-    }
-    return isamax_res;
-}
-
-static int cblas_idamax_wrapper(const int n, const double *x, const int incx) {
-    int idamax_res = 0;
-    if (cblas_library() != NULL) {
-        if (cblas_idamax_p == NULL)
-            cblas_idamax_p = (int (*)(const int n, const double *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_idamax");
-        if (cblas_idamax_p != NULL)
-            idamax_res = cblas_idamax_p(n, x, incx);
-    }
-    return idamax_res;
-}
-
-static int cblas_icamax_wrapper(const int n, const void *x, const int incx) {
-    int icamax_res = 0;
-    if (cblas_library() != NULL) {
-        if (cblas_icamax_p == NULL)
-            cblas_icamax_p = (int (*)(const int n, const void *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_icamax");
-        if (cblas_icamax_p != NULL)
-            icamax_res = cblas_icamax_p(n, x, incx);
-    }
-    return icamax_res;
-}
-
-static int cblas_izamax_wrapper(const int n, const void *x, const int incx) {
-    int izamax_res = 0;
-    if (cblas_library() != NULL) {
-        if (cblas_izamax_p == NULL)
-            cblas_izamax_p = (int (*)(const int n, const void *x, const int incx))GET_FUNC(
-                h_libcblas, "cblas_izamax");
-        if (cblas_izamax_p != NULL)
-            izamax_res = cblas_izamax_p(n, x, incx);
-    }
-    return izamax_res;
-}
-}
-
-#endif /* header guard */
diff --git a/tests/unit_tests/blas/include/test_common.hpp b/tests/unit_tests/blas/include/test_common.hpp
deleted file mode 100644
index 5d607991e..000000000
--- a/tests/unit_tests/blas/include/test_common.hpp
+++ /dev/null
@@ -1,711 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _TEST_COMMON_HPP__
-#define _TEST_COMMON_HPP__
-
-#include <algorithm>
-
-#include <complex>
-#include <stdexcept>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#define MAX_NUM_PRINT 20
-
-namespace std {
-static sycl::half abs(sycl::half v) {
-    if (v < sycl::half(0))
-        return -v;
-    else
-        return v;
-}
-} // namespace std
-
-// Complex helpers.
-template <typename T>
-struct complex_info {
-    using real_type = T;
-    static const bool is_complex = false;
-};
-
-template <typename T>
-struct complex_info<std::complex<T>> {
-    using real_type = T;
-    static const bool is_complex = true;
-};
-
-template <typename T>
-constexpr bool is_complex() {
-    return complex_info<T>::is_complex;
-}
-template <typename T>
-constexpr int num_components() {
-    return is_complex<T>() ? 2 : 1;
-}
-
-// Matrix helpers.
-template <typename T>
-constexpr T inner_dimension(oneapi::mkl::transpose trans, T m, T n) {
-    return (trans == oneapi::mkl::transpose::nontrans) ? m : n;
-}
-template <typename T>
-constexpr T outer_dimension(oneapi::mkl::transpose trans, T m, T n) {
-    return (trans == oneapi::mkl::transpose::nontrans) ? n : m;
-}
-template <typename T>
-constexpr T matrix_size(oneapi::mkl::transpose trans, T m, T n, T ldm) {
-    return outer_dimension(trans, m, n) * ldm;
-}
-template <typename T>
-constexpr T matrix_size(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, T m, T n, T ldm) {
-    return (layout == oneapi::mkl::layout::col_major) ? outer_dimension(trans, m, n) * ldm
-                                                      : inner_dimension(trans, m, n) * ldm;
-}
-
-// SYCL buffer creation helper.
-template <typename vec>
-sycl::buffer<typename vec::value_type, 1> make_buffer(const vec &v) {
-    sycl::buffer<typename vec::value_type, 1> buf(v.data(), sycl::range<1>(v.size()));
-    return buf;
-}
-
-// Reference helpers.
-template <typename T>
-struct ref_type_info {
-    using type = T;
-};
-template <>
-struct ref_type_info<std::complex<float>> {
-    using type = std::complex<float>;
-};
-template <>
-struct ref_type_info<std::complex<double>> {
-    using type = std::complex<double>;
-};
-template <>
-struct ref_type_info<int8_t> {
-    using type = int8_t;
-};
-template <>
-struct ref_type_info<uint8_t> {
-    using type = uint8_t;
-};
-template <>
-struct ref_type_info<int32_t> {
-    using type = int32_t;
-};
-
-// Random initialization.
-template <typename fp>
-static fp rand_scalar() {
-    return fp(std::rand()) / fp(RAND_MAX) - fp(0.5);
-}
-template <typename fp>
-static std::complex<fp> rand_complex_scalar() {
-    return std::complex<fp>(rand_scalar<fp>(), rand_scalar<fp>());
-}
-template <>
-std::complex<float> rand_scalar() {
-    return rand_complex_scalar<float>();
-}
-template <>
-std::complex<double> rand_scalar() {
-    return rand_complex_scalar<double>();
-}
-template <>
-int8_t rand_scalar() {
-    return std::rand() % 254 - 127;
-}
-template <>
-int32_t rand_scalar() {
-    return std::rand() % 256 - 128;
-}
-template <>
-uint8_t rand_scalar() {
-    return std::rand() % 128;
-}
-
-template <>
-sycl::half rand_scalar() {
-    return sycl::half(std::rand() % 32000) / sycl::half(32000) - sycl::half(0.5);
-}
-
-template <typename fp>
-static fp rand_scalar(int mag) {
-    fp tmp = fp(mag) + fp(std::rand()) / fp(RAND_MAX) - fp(0.5);
-    if (std::rand() % 2)
-        return tmp;
-    else
-        return -tmp;
-}
-template <typename fp>
-static std::complex<fp> rand_complex_scalar(int mag) {
-    return std::complex<fp>(rand_scalar<fp>(mag), rand_scalar<fp>(mag));
-}
-template <>
-std::complex<float> rand_scalar(int mag) {
-    return rand_complex_scalar<float>(mag);
-}
-template <>
-std::complex<double> rand_scalar(int mag) {
-    return rand_complex_scalar<double>(mag);
-}
-
-template <typename fp>
-void rand_vector(fp *v, int n, int inc) {
-    int abs_inc = std::abs(inc);
-    for (int i = 0; i < n; i++)
-        v[i * abs_inc] = rand_scalar<fp>();
-}
-
-template <typename vec>
-void rand_vector(vec &v, int n, int inc) {
-    using fp = typename vec::value_type;
-    int abs_inc = std::abs(inc);
-
-    v.resize(n * abs_inc);
-
-    for (int i = 0; i < n; i++)
-        v[i * abs_inc] = rand_scalar<fp>();
-}
-
-template <typename fp>
-oneapi::mkl::transpose rand_trans() {
-    std::int64_t tmp;
-    oneapi::mkl::transpose trans;
-    if ((std::is_same<fp, float>::value) || (std::is_same<fp, double>::value)) {
-        trans = (oneapi::mkl::transpose)(std::rand() % 2);
-    }
-    else {
-        tmp = std::rand() % 3;
-        if (tmp == 2)
-            trans = oneapi::mkl::transpose::conjtrans;
-        else
-            trans = (oneapi::mkl::transpose)tmp;
-    }
-    return trans;
-}
-
-template <typename vec>
-void print_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld, char *name) {
-    std::cout << "Matrix " << name << ":\n";
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            if (trans == oneapi::mkl::transpose::nontrans)
-                std::cout << (double)M[i + j * ld] << " ";
-            else
-                std::cout << (double)M[j + i * ld] << " ";
-        }
-        std::cout << std::endl;
-    }
-}
-
-template <typename fp>
-void copy_vector(fp *src, int n, int inc, fp *dest) {
-    int abs_inc = std::abs(inc);
-    for (int i = 0; i < n; i++)
-        dest[i * abs_inc] = src[i * abs_inc];
-}
-
-template <typename vec_src, typename vec_dest>
-void copy_matrix(vec_src &src, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m,
-                 int n, int ld, vec_dest &dest) {
-    using T_data = typename vec_dest::value_type;
-    dest.resize(matrix_size(layout, trans, m, n, ld));
-    if (((trans == oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::col_major)) ||
-        ((trans != oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::row_major))) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++)
-                dest[i + j * ld] = (T_data)src[i + j * ld];
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++)
-                dest[j + i * ld] = (T_data)src[j + i * ld];
-    }
-}
-
-template <typename fp_src, typename fp_dst>
-void copy_matrix(fp_src *src, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m,
-                 int n, int ld, fp_dst *dest) {
-    if (((trans == oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::col_major)) ||
-        ((trans != oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::row_major))) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++)
-                dest[i + j * ld] = (fp_dst)src[i + j * ld];
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++)
-                dest[j + i * ld] = (fp_dst)src[j + i * ld];
-    }
-}
-
-template <typename vec>
-void rand_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld) {
-    using fp = typename vec::value_type;
-
-    M.resize(matrix_size(trans, m, n, ld));
-
-    if (trans == oneapi::mkl::transpose::nontrans) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++)
-                M[i + j * ld] = rand_scalar<fp>();
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++)
-                M[j + i * ld] = rand_scalar<fp>();
-    }
-}
-
-template <typename vec>
-void rand_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n,
-                 int ld) {
-    using fp = typename vec::value_type;
-
-    M.resize(matrix_size(layout, trans, m, n, ld));
-
-    if (((trans == oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::col_major)) ||
-        ((trans != oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::row_major))) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++)
-                M[i + j * ld] = rand_scalar<fp>();
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++)
-                M[j + i * ld] = rand_scalar<fp>();
-    }
-}
-
-template <typename fp>
-void rand_matrix(fp *M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n,
-                 int ld) {
-    if (((trans == oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::col_major)) ||
-        ((trans != oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::row_major))) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++)
-                M[i + j * ld] = rand_scalar<fp>();
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++)
-                M[j + i * ld] = rand_scalar<fp>();
-    }
-}
-
-template <typename vec>
-void rand_trsm_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m,
-                      int n, int ld) {
-    using fp = typename vec::value_type;
-
-    M.resize(matrix_size(layout, trans, m, n, ld));
-
-    if (((trans == oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::col_major)) ||
-        ((trans != oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::row_major))) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++) {
-                if (i == j)
-                    M[i + j * ld] = rand_scalar<fp>(10);
-                else
-                    M[i + j * ld] = rand_scalar<fp>();
-            }
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++) {
-                if (i == j)
-                    M[j + i * ld] = rand_scalar<fp>(10);
-                else
-                    M[j + i * ld] = rand_scalar<fp>();
-            }
-    }
-}
-
-template <typename fp>
-void rand_trsm_matrix(fp *M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n,
-                      int ld) {
-    if (((trans == oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::col_major)) ||
-        ((trans != oneapi::mkl::transpose::nontrans) &&
-         (layout == oneapi::mkl::layout::row_major))) {
-        for (int j = 0; j < n; j++)
-            for (int i = 0; i < m; i++) {
-                if (i == j)
-                    M[i + j * ld] = rand_scalar<fp>(10);
-                else
-                    M[i + j * ld] = rand_scalar<fp>();
-            }
-    }
-    else {
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < n; j++) {
-                if (i == j)
-                    M[j + i * ld] = rand_scalar<fp>(10);
-                else
-                    M[j + i * ld] = rand_scalar<fp>();
-            }
-    }
-}
-
-template <typename vec>
-void rand_tpsv_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-                      oneapi::mkl::transpose trans, int m) {
-    using fp = typename vec::value_type;
-    std::vector<fp> tmp;
-    int start, end, i, j, k = 0;
-
-    rand_trsm_matrix(tmp, layout, trans, m, m, m);
-    M.resize((m * (m + 1)) / 2);
-
-    for (j = 0; j < m; j++) {
-        if (layout == oneapi::mkl::layout::col_major) {
-            start = (upper_lower == oneapi::mkl::uplo::U) ? 0 : j;
-            end = (upper_lower == oneapi::mkl::uplo::U) ? j : m - 1;
-        }
-        else {
-            start = (upper_lower == oneapi::mkl::uplo::U) ? j : 0;
-            end = (upper_lower == oneapi::mkl::uplo::U) ? m - 1 : j;
-        }
-        for (i = start; i <= end; i++) {
-            M[k] = tmp[i + j * m];
-            k++;
-        }
-    }
-}
-
-template <typename vec>
-void rand_tbsv_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-                      oneapi::mkl::transpose trans, int m, int k, int ld) {
-    using fp = typename vec::value_type;
-    std::vector<fp> tmp;
-    int i, j, n;
-
-    rand_trsm_matrix(tmp, layout, trans, m, m, ld);
-    M.resize(matrix_size(layout, trans, m, m, ld));
-
-    if (((layout == oneapi::mkl::layout::col_major) && (upper_lower == oneapi::mkl::uplo::U)) ||
-        ((layout == oneapi::mkl::layout::row_major) && (upper_lower == oneapi::mkl::uplo::L))) {
-        for (j = 0; j < m; j++) {
-            n = k - j;
-            for (i = std::max(0, j - k); i <= j; i++) {
-                M[(n + i) + j * ld] = tmp[i + j * ld];
-            }
-        }
-    }
-    else {
-        for (j = 0; j < m; j++) {
-            n = -j;
-            for (i = j; i < std::min(m, j + k + 1); i++) {
-                M[(n + i) + j * ld] = tmp[i + j * ld];
-            }
-        }
-    }
-}
-
-// Correctness checking.
-template <typename fp>
-typename std::enable_if<!std::is_integral<fp>::value, bool>::type check_equal(fp x, fp x_ref,
-                                                                              int error_mag) {
-    using fp_real = typename complex_info<fp>::real_type;
-    fp_real bound = (error_mag * num_components<fp>() * std::numeric_limits<fp_real>::epsilon());
-
-    bool ok;
-
-    fp_real aerr = std::abs(x - x_ref);
-    fp_real rerr = aerr / std::abs(x_ref);
-    ok = (rerr <= bound) || (aerr <= bound);
-    if (!ok)
-        std::cout << "relative error = " << rerr << " absolute error = " << aerr
-                  << " limit = " << bound << std::endl;
-    return ok;
-}
-
-template <typename fp>
-typename std::enable_if<std::is_integral<fp>::value, bool>::type check_equal(fp x, fp x_ref,
-                                                                             int error_mag) {
-    return (x == x_ref);
-}
-
-template <typename fp>
-bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag) {
-    fp x_host;
-    queue.memcpy(&x_host, x, sizeof(fp)).wait();
-    return check_equal(x_host, x_ref, error_mag);
-}
-
-template <typename fp>
-bool check_equal_trsm(fp x, fp x_ref, int error_mag) {
-    using fp_real = typename complex_info<fp>::real_type;
-    fp_real bound = std::max(fp_real(5e-5), (error_mag * num_components<fp>() *
-                                             std::numeric_limits<fp_real>::epsilon()));
-    fp zero = fp(0);
-    bool ok, check_rerr = (x_ref != zero);
-
-    fp_real aerr = std::abs(x - x_ref);
-    fp_real rerr = check_rerr ? aerr / std::abs(x_ref) : 0.0;
-    ok = check_rerr ? ((rerr <= bound) || (aerr <= bound)) : (aerr <= bound);
-    if (!ok)
-        std::cout << "relative error = " << rerr << " absolute error = " << aerr
-                  << " limit = " << bound << std::endl;
-    return ok;
-}
-
-template <typename fp>
-bool check_equal(fp x, fp x_ref, int error_mag, std::ostream &out) {
-    bool good = check_equal(x, x_ref, error_mag);
-
-    if (!good) {
-        out << "Difference in result: DPC++ " << x << " vs. Reference " << x_ref << std::endl;
-    }
-    return good;
-}
-
-template <typename fp>
-bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag, std::ostream &out) {
-    fp x_host;
-    queue.memcpy(&x_host, x, sizeof(fp)).wait();
-    return check_equal(x_host, x_ref, error_mag, out);
-}
-
-template <typename fp>
-bool check_equal_vector(const fp *v, const fp *v_ref, int n, int inc, int error_mag,
-                        std::ostream &out) {
-    int abs_inc = std::abs(inc), count = 0;
-    bool good = true;
-
-    for (int i = 0; i < n; i++) {
-        if (!check_equal(v[i * abs_inc], v_ref[i * abs_inc], error_mag)) {
-            int i_actual = (inc > 0) ? i : n - i;
-            std::cout << "Difference in entry " << i_actual << ": DPC++ " << v[i * abs_inc]
-                      << " vs. Reference " << v_ref[i * abs_inc] << std::endl;
-            good = false;
-            count++;
-            if (count > MAX_NUM_PRINT)
-                return good;
-        }
-    }
-
-    return good;
-}
-
-template <typename vec1, typename vec2>
-bool check_equal_vector(vec1 &v, vec2 &v_ref, int n, int inc, int error_mag, std::ostream &out) {
-    int abs_inc = std::abs(inc), count = 0;
-    bool good = true;
-
-    for (int i = 0; i < n; i++) {
-        if (!check_equal(v[i * abs_inc], v_ref[i * abs_inc], error_mag)) {
-            int i_actual = (inc > 0) ? i : n - i;
-            std::cout << "Difference in entry " << i_actual << ": DPC++ " << v[i * abs_inc]
-                      << " vs. Reference " << v_ref[i * abs_inc] << std::endl;
-            good = false;
-            count++;
-            if (count > MAX_NUM_PRINT)
-                return good;
-        }
-    }
-
-    return good;
-}
-
-template <typename vec1, typename vec2>
-bool check_equal_trsv_vector(vec1 &v, vec2 &v_ref, int n, int inc, int error_mag,
-                             std::ostream &out) {
-    int abs_inc = std::abs(inc), count = 0;
-    bool good = true;
-
-    for (int i = 0; i < n; i++) {
-        if (!check_equal_trsm(v[i * abs_inc], v_ref[i * abs_inc], error_mag)) {
-            int i_actual = (inc > 0) ? i : n - i;
-            std::cout << "Difference in entry " << i_actual << ": DPC++ " << v[i * abs_inc]
-                      << " vs. Reference " << v_ref[i * abs_inc] << std::endl;
-            good = false;
-            count++;
-            if (count > MAX_NUM_PRINT)
-                return good;
-        }
-    }
-
-    return good;
-}
-
-template <typename acc1, typename acc2>
-bool check_equal_matrix(acc1 &M, acc2 &M_ref, oneapi::mkl::layout layout, int m, int n, int ld,
-                        int error_mag, std::ostream &out) {
-    bool good = true;
-    int idx, count = 0;
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < m; i++) {
-            idx = (layout == oneapi::mkl::layout::col_major) ? i + j * ld : j + i * ld;
-            if (!check_equal(M[idx], M_ref[idx], error_mag)) {
-                out << "Difference in entry (" << i << ',' << j << "): DPC++ " << M[idx]
-                    << " vs. Reference " << M_ref[idx] << std::endl;
-                good = false;
-                count++;
-                if (count > MAX_NUM_PRINT)
-                    return good;
-            }
-        }
-    }
-
-    return good;
-}
-
-template <typename fp>
-bool check_equal_matrix(const fp *M, const fp *M_ref, oneapi::mkl::layout layout, int m, int n,
-                        int ld, int error_mag, std::ostream &out) {
-    bool good = true;
-    int idx, count = 0;
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < m; i++) {
-            idx = (layout == oneapi::mkl::layout::col_major) ? i + j * ld : j + i * ld;
-            if (!check_equal(M[idx], M_ref[idx], error_mag)) {
-                out << "Difference in entry (" << i << ',' << j << "): DPC++ " << M[idx]
-                    << " vs. Reference " << M_ref[idx] << std::endl;
-                good = false;
-                count++;
-                if (count > MAX_NUM_PRINT)
-                    return good;
-            }
-        }
-    }
-
-    return good;
-}
-
-template <typename acc1, typename acc2>
-bool check_equal_matrix(acc1 &M, acc2 &M_ref, oneapi::mkl::layout layout,
-                        oneapi::mkl::uplo upper_lower, int m, int n, int ld, int error_mag,
-                        std::ostream &out) {
-    bool good = true;
-    int idx, count = 0;
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < m; i++) {
-            idx = (layout == oneapi::mkl::layout::col_major) ? i + j * ld : j + i * ld;
-            if (((upper_lower == oneapi::mkl::uplo::upper) && (j >= i)) ||
-                ((upper_lower == oneapi::mkl::uplo::lower) && (j <= i))) {
-                if (!check_equal(M[idx], M_ref[idx], error_mag)) {
-                    out << "Difference in entry (" << i << ',' << j << "): DPC++ " << M[idx]
-                        << " vs. Reference " << M_ref[idx] << std::endl;
-                    good = false;
-                    count++;
-                    if (count > MAX_NUM_PRINT)
-                        return good;
-                }
-            }
-        }
-    }
-
-    return good;
-}
-
-template <typename acc1, typename acc2>
-bool check_equal_trsm_matrix(acc1 &M, acc2 &M_ref, oneapi::mkl::layout layout, int m, int n, int ld,
-                             int error_mag, std::ostream &out) {
-    bool good = true;
-    int idx, count = 0;
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < m; i++) {
-            idx = (layout == oneapi::mkl::layout::col_major) ? i + j * ld : j + i * ld;
-            if (!check_equal_trsm(M[idx], M_ref[idx], error_mag)) {
-                out << "Difference in entry (" << i << ',' << j << "): DPC++ " << M[idx]
-                    << " vs. Reference " << M_ref[idx] << std::endl;
-                good = false;
-                count++;
-                if (count > MAX_NUM_PRINT)
-                    return good;
-            }
-        }
-    }
-
-    return good;
-}
-
-// Helper for using std::result_of for evalutation operator[] return type
-template <typename T>
-struct access_index {
-    auto operator()(T M) {
-        return M[0];
-    }
-};
-
-// Helper for checking if a matrix/vector/accessor structure returns an integral type
-template <typename T>
-constexpr bool is_matrix_type_integral() {
-    return std::is_integral_v<
-        std::remove_reference_t<typename std::result_of<access_index<T>(T)>::type>>;
-}
-
-template <typename fp>
-typename std::enable_if<std::is_integral<fp>::value, bool>::type check_almost_equal_int(
-    fp x, fp x_ref, int error_mag) {
-    return (std::abs(x - x_ref) <= error_mag);
-}
-
-template <typename Ta, typename Tb>
-bool check_almost_equal_matrix_int(Ta &M, Tb &M_ref, oneapi::mkl::layout layout, int m, int n,
-                                   int ld, int error_mag, std::ostream &out) {
-    static_assert(is_matrix_type_integral<Ta>() && is_matrix_type_integral<Tb>());
-    bool good = true;
-    int idx, count = 0;
-    for (int j = 0; j < n; j++) {
-        for (int i = 0; i < m; i++) {
-            idx = (layout == oneapi::mkl::layout::col_major) ? i + j * ld : j + i * ld;
-            if (!check_almost_equal_int(M[idx], M_ref[idx], error_mag)) {
-                out << "Difference in entry (" << i << ',' << j << "): DPC++ " << M[idx]
-                    << " vs. Reference " << M_ref[idx] << std::endl;
-                good = false;
-                count++;
-                if (count > MAX_NUM_PRINT)
-                    return good;
-            }
-        }
-    }
-
-    return good;
-}
-
-template <typename Ta, typename Tb>
-bool check_almost_equal_matrix(Ta &M, Tb &M_ref, oneapi::mkl::layout layout, int m, int n, int ld,
-                               int error_mag, std::ostream &out) {
-    // Only call if returned dtype is integral
-    if constexpr (is_matrix_type_integral<Ta>() && is_matrix_type_integral<Tb>())
-        return check_almost_equal_matrix_int(M, M_ref, layout, m, n, ld, error_mag, out);
-    return check_equal_matrix(M, M_ref, layout, m, n, ld, error_mag, out);
-}
-
-#endif /* header guard */
diff --git a/tests/unit_tests/blas/level1/CMakeLists.txt b/tests/unit_tests/blas/level1/CMakeLists.txt
deleted file mode 100644
index f02adbde3..000000000
--- a/tests/unit_tests/blas/level1/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(L1_SOURCES "nrm2.cpp" "iamin.cpp" "iamax.cpp" "dotu.cpp" "dot.cpp" "dotc.cpp" "copy.cpp" "axpy.cpp" "axpby.cpp" "asum.cpp" "swap.cpp" "sdsdot.cpp" "scal.cpp" "rotmg.cpp" "rotm.cpp" "rotg.cpp" "rot.cpp" "nrm2_usm.cpp" "iamin_usm.cpp" "iamax_usm.cpp" "dotu_usm.cpp" "dot_usm.cpp" "dotc_usm.cpp" "copy_usm.cpp" "axpy_usm.cpp" "axpby_usm.cpp" "asum_usm.cpp" "swap_usm.cpp" "sdsdot_usm.cpp" "scal_usm.cpp" "rotmg_usm.cpp" "rotm_usm.cpp" "rotg_usm.cpp" "rot_usm.cpp")
-
-if(BUILD_SHARED_LIBS)
-  add_library(blas_level1_rt OBJECT ${L1_SOURCES})
-  target_compile_options(blas_level1_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(blas_level1_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-      PUBLIC ${CBLAS_INCLUDE}
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET blas_level1_rt SOURCES ${L1_SOURCES})
-  else()
-    target_link_libraries(blas_level1_rt PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-endif()
-
-add_library(blas_level1_ct OBJECT ${L1_SOURCES})
-target_compile_options(blas_level1_ct PRIVATE -DNOMINMAX)
-target_include_directories(blas_level1_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-    PUBLIC ${CBLAS_INCLUDE}
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET blas_level1_ct SOURCES ${L1_SOURCES})
-else()
-  target_link_libraries(blas_level1_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
diff --git a/tests/unit_tests/blas/level1/asum.cpp b/tests/unit_tests/blas/level1/asum.cpp
deleted file mode 100644
index 6969789e3..000000000
--- a/tests/unit_tests/blas/level1/asum.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_res>
-int test(device* dev, oneapi::mkl::layout layout, int64_t N, int64_t incx) {
-    // Prepare data.
-    vector<fp> x;
-    fp_res result = fp_res(-1), result_ref = fp_res(-1);
-
-    rand_vector(x, N, incx);
-
-    // Call Reference ASUM.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = std::abs(incx);
-
-    result_ref = ::asum<fp_ref, fp_res>(&N_ref, (fp_ref*)x.data(), &incx_ref);
-    // Call DPC++ ASUM.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during ASUM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp_res, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::asum(main_queue, N, x_buffer, incx, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::asum(main_queue, N, x_buffer, incx, result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::asum, N,
-                                        x_buffer, incx, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::asum, N, x_buffer,
-                                        incx, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during ASUM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of ASUM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_ref, N, std::cout);
-
-    return (int)good;
-}
-
-class AsumTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(AsumTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (::test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (::test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP(
-        (::test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-
-TEST_P(AsumTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (::test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (::test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP(
-        (::test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-
-TEST_P(AsumTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, -3)));
-}
-
-TEST_P(AsumTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, -3)));
-}
-
-INSTANTIATE_TEST_SUITE_P(AsumTestSuite, AsumTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/asum_usm.cpp b/tests/unit_tests/blas/level1/asum_usm.cpp
deleted file mode 100644
index b42799abd..000000000
--- a/tests/unit_tests/blas/level1/asum_usm.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_res, usm::alloc alloc_type = usm::alloc::shared>
-int test(device* dev, oneapi::mkl::layout layout, int64_t N, int64_t incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during ASUM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua);
-    fp_res result_ref = fp_res(-1);
-
-    rand_vector(x, N, incx);
-
-    // Call Reference ASUM.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = std::abs(incx);
-
-    result_ref = ::asum<fp_ref, fp_res>(&N_ref, (fp_ref*)x.data(), &incx_ref);
-
-    // Call DPC++ ASUM.
-
-    fp_res* result_p;
-    if constexpr (alloc_type == usm::alloc::shared) {
-        result_p = (fp_res*)oneapi::mkl::malloc_shared(64, sizeof(fp_res), *dev, cxt);
-    }
-    else if constexpr (alloc_type == usm::alloc::device) {
-        result_p = (fp_res*)oneapi::mkl::malloc_device(64, sizeof(fp_res), *dev, cxt);
-    }
-    else {
-        throw std::runtime_error("Bad alloc_type");
-    }
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::asum(main_queue, N, x.data(), incx,
-                                                             result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::asum(main_queue, N, x.data(), incx, result_p,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::asum, N,
-                                        x.data(), incx, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::asum, N, x.data(),
-                                        incx, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during ASUM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of ASUM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_ptr(main_queue, result_p, result_ref, N, std::cout);
-
-    oneapi::mkl::free_usm(result_p, cxt);
-
-    return (int)good;
-}
-
-class AsumUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(AsumUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (::test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (::test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((::test<float, float, usm::alloc::device>(std::get<0>(GetParam()),
-                                                                std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        (::test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-
-TEST_P(AsumUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (::test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (::test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((::test<double, double, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        (::test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-
-TEST_P(AsumUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((::test<std::complex<float>, float, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP((::test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, -3)));
-}
-
-TEST_P(AsumUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((::test<std::complex<double>, double, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, -3)));
-}
-
-INSTANTIATE_TEST_SUITE_P(AsumUsmTestSuite, AsumUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/axpby.cpp b/tests/unit_tests/blas/level1/axpby.cpp
deleted file mode 100644
index d43f9beda..000000000
--- a/tests/unit_tests/blas/level1/axpby.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha, fp beta) {
-    // Prepare data.
-    vector<fp> x, y, y_ref;
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-    y_ref = y;
-
-    // Call Reference AXPBY.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::axpby(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta,
-            (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ AXPBY.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during AXPBY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::axpby(main_queue, N, alpha, x_buffer, incx, beta,
-                                                       y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::axpby(main_queue, N, alpha, x_buffer, incx, beta,
-                                                    y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpby, N,
-                                        alpha, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpby, N, alpha,
-                                        x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during AXPBY:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of AXPBY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
-
-    return (int)good;
-}
-
-class AxpbyTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(AxpbyTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha, beta));
-}
-TEST_P(AxpbyTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha, beta));
-}
-TEST_P(AxpbyTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, -3, -2, alpha, beta));
-}
-TEST_P(AxpbyTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, -3, -2, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(AxpbyTestSuite, AxpbyTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/axpby_usm.cpp b/tests/unit_tests/blas/level1/axpby_usm.cpp
deleted file mode 100644
index ae85ca8f1..000000000
--- a/tests/unit_tests/blas/level1/axpby_usm.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha, fp beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during AXPBY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    auto y_ref = y;
-
-    // Call Reference AXPBY.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::axpby(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta,
-            (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ AXPBY.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::axpby(main_queue, N, alpha, x.data(), incx,
-                                                              beta, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::axpby(main_queue, N, alpha, x.data(), incx,
-                                                           beta, y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpby, N,
-                                        alpha, x.data(), incx, beta, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpby, N, alpha,
-                                        x.data(), incx, beta, y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during AXPBY:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of AXPBY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, N, incy, N, std::cout);
-
-    return (int)good;
-}
-
-class AxpbyUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(AxpbyUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha, beta));
-}
-TEST_P(AxpbyUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha, beta));
-}
-TEST_P(AxpbyUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, -3, -2, alpha, beta));
-}
-TEST_P(AxpbyUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 2, 3, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 1, 1, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, -3, -2, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(AxpbyUsmTestSuite, AxpbyUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/axpy.cpp b/tests/unit_tests/blas/level1/axpy.cpp
deleted file mode 100644
index c81f2902d..000000000
--- a/tests/unit_tests/blas/level1/axpy.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha) {
-    // Prepare data.
-    vector<fp> x, y, y_ref;
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-    y_ref = y;
-
-    // Call Reference AXPY.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::axpy(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y_ref.data(),
-           &incy_ref);
-
-    // Call DPC++ AXPY.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during AXPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::axpy(main_queue, N, alpha, x_buffer, incx,
-                                                      y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::axpy(main_queue, N, alpha, x_buffer, incx, y_buffer,
-                                                   incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpy, N, alpha,
-                                        x_buffer, incx, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpy, N, alpha,
-                                        x_buffer, incx, y_buffer, incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during AXPY:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of AXPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
-
-    return (int)good;
-}
-
-class AxpyTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(AxpyTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha));
-}
-TEST_P(AxpyTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha));
-}
-TEST_P(AxpyTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, -3, -2, alpha));
-}
-TEST_P(AxpyTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, -3, -2, alpha));
-}
-
-INSTANTIATE_TEST_SUITE_P(AxpyTestSuite, AxpyTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/axpy_usm.cpp b/tests/unit_tests/blas/level1/axpy_usm.cpp
deleted file mode 100644
index da68f173c..000000000
--- a/tests/unit_tests/blas/level1/axpy_usm.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during AXPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    auto y_ref = y;
-
-    // Call Reference AXPY.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::axpy(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y_ref.data(),
-           &incy_ref);
-
-    // Call DPC++ AXPY.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::axpy(main_queue, N, alpha, x.data(), incx,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::axpy(main_queue, N, alpha, x.data(), incx,
-                                                          y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpy, N, alpha,
-                                        x.data(), incx, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpy, N, alpha,
-                                        x.data(), incx, y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during AXPY:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of AXPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, N, incy, N, std::cout);
-
-    return (int)good;
-}
-
-class AxpyUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(AxpyUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha));
-}
-TEST_P(AxpyUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2, alpha));
-}
-TEST_P(AxpyUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                1357, -3, -2, alpha));
-}
-TEST_P(AxpyUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 2, 3, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, 1, 1, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 1357, -3, -2, alpha));
-}
-
-INSTANTIATE_TEST_SUITE_P(AxpyUsmTestSuite, AxpyUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/copy.cpp b/tests/unit_tests/blas/level1/copy.cpp
deleted file mode 100644
index 87a1c2f1b..000000000
--- a/tests/unit_tests/blas/level1/copy.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y, y_ref;
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-    y_ref = y;
-
-    // Call Reference COPY.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::copy(&N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref);
-
-    // Call DPC++ COPY.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during COPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::copy(main_queue, N, x_buffer, incx, y_buffer,
-                                                      incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::copy(main_queue, N, x_buffer, incx, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::copy, N,
-                                        x_buffer, incx, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::copy, N, x_buffer,
-                                        incx, y_buffer, incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during COPY:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of COPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
-
-    return (int)good;
-}
-
-class CopyTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(CopyTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(CopyTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(CopyTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(CopyTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-
-INSTANTIATE_TEST_SUITE_P(CopyTestSuite, CopyTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/copy_usm.cpp b/tests/unit_tests/blas/level1/copy_usm.cpp
deleted file mode 100644
index 0f491015b..000000000
--- a/tests/unit_tests/blas/level1/copy_usm.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during COPY:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    auto y_ref = y;
-
-    // Call Reference COPY.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::copy(&N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref);
-
-    // Call DPC++ COPY.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::copy(main_queue, N, x.data(), incx,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::copy(main_queue, N, x.data(), incx, y.data(),
-                                                          incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::copy, N,
-                                        x.data(), incx, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::copy, N, x.data(),
-                                        incx, y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during COPY:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of COPY:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, N, incy, N, std::cout);
-
-    return (int)good;
-}
-
-class CopyUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(CopyUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(CopyUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(CopyUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(CopyUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-
-INSTANTIATE_TEST_SUITE_P(CopyUsmTestSuite, CopyUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dot.cpp b/tests/unit_tests/blas/level1/dot.cpp
deleted file mode 100644
index 11cb09bcc..000000000
--- a/tests/unit_tests/blas/level1/dot.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_res>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y;
-    fp_res result = fp_res(-1), result_ref = fp_res(-1);
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference DOT.
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    result_ref = ::dot<fp, fp_res>(&N_ref, (fp*)x.data(), &incx_ref, (fp*)y.data(), &incy_ref);
-
-    // Call DPC++ DOT.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during DOT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp_res, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::dot(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                     result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::dot(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                  result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dot, N,
-                                        x_buffer, incx, y_buffer, incy, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dot, N, x_buffer,
-                                        incx, y_buffer, incy, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during DOT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of DOT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_ref, N, std::cout);
-
-    return (int)good;
-}
-
-class DotTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(DotTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2)));
-}
-TEST_P(DotTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2)));
-}
-TEST_P(DotTests, RealDoubleSinglePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-    EXPECT_TRUEORSKIP(
-        (test<float, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3)));
-    EXPECT_TRUEORSKIP(
-        (test<float, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<float, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2)));
-}
-
-INSTANTIATE_TEST_SUITE_P(DotTestSuite, DotTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dot_usm.cpp b/tests/unit_tests/blas/level1/dot_usm.cpp
deleted file mode 100644
index b8780c75d..000000000
--- a/tests/unit_tests/blas/level1/dot_usm.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_res, usm::alloc alloc_type = usm::alloc::shared>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during DOT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-    fp_res result_ref = fp_res(-1);
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference DOT.
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    result_ref = ::dot<fp, fp_res>(&N_ref, (fp*)x.data(), &incx_ref, (fp*)y.data(), &incy_ref);
-
-    // Call DPC++ DOT.
-
-    fp_res* result_p;
-    if constexpr (alloc_type == usm::alloc::shared) {
-        result_p = (fp_res*)oneapi::mkl::malloc_shared(64, sizeof(fp_res), *dev, cxt);
-    }
-    else if constexpr (alloc_type == usm::alloc::device) {
-        result_p = (fp_res*)oneapi::mkl::malloc_device(64, sizeof(fp_res), *dev, cxt);
-    }
-    else {
-        throw std::runtime_error("Bad alloc_type");
-    }
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::dot(main_queue, N, x.data(), incx, y.data(),
-                                                            incy, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::dot(main_queue, N, x.data(), incx, y.data(),
-                                                         incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dot, N,
-                                        x.data(), incx, y.data(), incy, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dot, N, x.data(),
-                                        incx, y.data(), incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during DOT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of DOT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    bool good = check_equal_ptr(main_queue, result_p, result_ref, N, std::cout);
-
-    oneapi::mkl::free_usm(result_p, cxt);
-
-    return (int)good;
-}
-
-class DotUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(DotUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1)));
-    EXPECT_TRUEORSKIP((test<float, float, usm::alloc::device>(std::get<0>(GetParam()),
-                                                              std::get<1>(GetParam()), 101, 1, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2)));
-}
-TEST_P(DotUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1)));
-    EXPECT_TRUEORSKIP((test<double, double, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2)));
-}
-TEST_P(DotUsmTests, RealDoubleSinglePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-    EXPECT_TRUEORSKIP(
-        (test<float, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3)));
-    EXPECT_TRUEORSKIP(
-        (test<float, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1)));
-    EXPECT_TRUEORSKIP((test<float, double, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<float, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2)));
-}
-
-INSTANTIATE_TEST_SUITE_P(DotUsmTestSuite, DotUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dotc.cpp b/tests/unit_tests/blas/level1/dotc.cpp
deleted file mode 100644
index cb8d0fc37..000000000
--- a/tests/unit_tests/blas/level1/dotc.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y;
-    fp result = 0.0, result_reference = 0.0;
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference DOTC.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::dotc((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
-           &incy_ref);
-
-    // Call DPC++ DOTC.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during DOTC:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::dotc(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                      result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::dotc(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                   result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dotc, N,
-                                        x_buffer, incx, y_buffer, incy, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dotc, N, x_buffer,
-                                        incx, y_buffer, incy, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during DOTC:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_reference, N, std::cout);
-
-    return (int)good;
-}
-
-class DotcTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(DotcTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(DotcTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-
-INSTANTIATE_TEST_SUITE_P(DotcTestSuite, DotcTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dotc_usm.cpp b/tests/unit_tests/blas/level1/dotc_usm.cpp
deleted file mode 100644
index ad05c9d3b..000000000
--- a/tests/unit_tests/blas/level1/dotc_usm.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during DOTC:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-    fp result_reference = 0.0;
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference DOTC.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::dotc((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
-           &incy_ref);
-
-    // Call DPC++ DOTC.
-
-    auto result_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::dotc(
-                    main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::dotc(main_queue, N, x.data(), incx, y.data(),
-                                                          incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dotc, N,
-                                        x.data(), incx, y.data(), incy, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dotc, N, x.data(),
-                                        incx, y.data(), incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during DOTC:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal(*result_p, result_reference, N, std::cout);
-
-    oneapi::mkl::free_shared(result_p, cxt);
-
-    return (int)good;
-}
-
-class DotcUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(DotcUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(DotcUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-
-INSTANTIATE_TEST_SUITE_P(DotcUsmTestSuite, DotcUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dotu.cpp b/tests/unit_tests/blas/level1/dotu.cpp
deleted file mode 100644
index bbef3ad8c..000000000
--- a/tests/unit_tests/blas/level1/dotu.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y;
-    fp result = 0.0, result_reference = 0.0;
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference DOTU.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::dotu((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
-           &incy_ref);
-
-    // Call DPC++ DOTU.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during DOTU:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::dotu(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                      result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::dotu(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                   result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dotu, N,
-                                        x_buffer, incx, y_buffer, incy, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dotu, N, x_buffer,
-                                        incx, y_buffer, incy, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during DOTU:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_reference, N, std::cout);
-
-    return (int)good;
-}
-
-class DotuTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(DotuTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(DotuTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-
-INSTANTIATE_TEST_SUITE_P(DotuTestSuite, DotuTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/dotu_usm.cpp b/tests/unit_tests/blas/level1/dotu_usm.cpp
deleted file mode 100644
index 3f30bf5ff..000000000
--- a/tests/unit_tests/blas/level1/dotu_usm.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during DOTU:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-    fp result_reference = 0.0;
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference DOTU.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::dotu((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(),
-           &incy_ref);
-
-    // Call DPC++ DOTU.
-
-    auto result_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::dotu(
-                    main_queue, N, x.data(), incx, y.data(), incy, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::dotu(main_queue, N, x.data(), incx, y.data(),
-                                                          incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dotu, N,
-                                        x.data(), incx, y.data(), incy, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dotu, N, x.data(),
-                                        incx, y.data(), incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during DOTU:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal(*result_p, result_reference, N, std::cout);
-    oneapi::mkl::free_shared(result_p, cxt);
-
-    return (int)good;
-}
-
-class DotuUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(DotuUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-TEST_P(DotuUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, -2));
-}
-
-INSTANTIATE_TEST_SUITE_P(DotuUsmTestSuite, DotuUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/iamax.cpp b/tests/unit_tests/blas/level1/iamax.cpp
deleted file mode 100644
index 977f12b5d..000000000
--- a/tests/unit_tests/blas/level1/iamax.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx) {
-    // Prepare data.
-    vector<fp> x;
-    int64_t result = -1, result_ref = -1;
-    rand_vector(x, N, incx);
-
-    // Call Reference IAMAX.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx;
-
-    result_ref = ::iamax(&N_ref, (fp_ref*)x.data(), &incx_ref);
-
-    // Call DPC++ IAMAX.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during IAMAX:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<int64_t, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::iamax(main_queue, N, x_buffer, incx,
-                                                       result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::iamax(main_queue, N, x_buffer, incx, result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::iamax, N,
-                                        x_buffer, incx, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::iamax, N,
-                                        x_buffer, incx, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during IAMAX:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of IAMAX:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_ref, 0, std::cout);
-
-    return (int)good;
-}
-
-class IamaxTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(IamaxTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IamaxTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IamaxTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IamaxTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-
-INSTANTIATE_TEST_SUITE_P(IamaxTestSuite, IamaxTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/iamax_usm.cpp b/tests/unit_tests/blas/level1/iamax_usm.cpp
deleted file mode 100644
index 405a79532..000000000
--- a/tests/unit_tests/blas/level1/iamax_usm.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, usm::alloc alloc_type = usm::alloc::shared>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during IAMAX:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua);
-    int64_t result_ref = -1;
-    rand_vector(x, N, incx);
-
-    // Call Reference IAMAX.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx;
-
-    result_ref = ::iamax(&N_ref, (fp_ref*)x.data(), &incx_ref);
-
-    // Call DPC++ IAMAX.
-
-    int64_t* result_p;
-    if constexpr (alloc_type == usm::alloc::shared) {
-        result_p = (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t), *dev, cxt);
-    }
-    else if constexpr (alloc_type == usm::alloc::device) {
-        result_p = (int64_t*)oneapi::mkl::malloc_device(64, sizeof(int64_t), *dev, cxt);
-    }
-    else {
-        throw std::runtime_error("Bad alloc_type");
-    }
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::iamax(main_queue, N, x.data(), incx,
-                                                              result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::iamax(main_queue, N, x.data(), incx, result_p,
-                                                           dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::iamax, N,
-                                        x.data(), incx, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::iamax, N,
-                                        x.data(), incx, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during IAMAX:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of IAMAX:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_ptr(main_queue, result_p, result_ref, 0, std::cout);
-    oneapi::mkl::free_usm(result_p, cxt);
-
-    return (int)good;
-}
-
-class IamaxUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(IamaxUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((
-        test<float, usm::alloc::device>(std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IamaxUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((test<double, usm::alloc::device>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IamaxUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IamaxUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-
-INSTANTIATE_TEST_SUITE_P(IamaxUsmTestSuite, IamaxUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/iamin.cpp b/tests/unit_tests/blas/level1/iamin.cpp
deleted file mode 100644
index a52862cb6..000000000
--- a/tests/unit_tests/blas/level1/iamin.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx) {
-    // Prepare data.
-    vector<fp> x;
-    int64_t result = -1, result_ref = -1;
-    rand_vector(x, N, incx);
-
-    // Call Reference IAMIN.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx;
-
-    result_ref = ::iamin(&N_ref, (fp_ref*)x.data(), &incx_ref);
-
-    // Call DPC++ IAMIN.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during IAMIN:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<int64_t, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::iamin(main_queue, N, x_buffer, incx,
-                                                       result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::iamin(main_queue, N, x_buffer, incx, result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::iamin, N,
-                                        x_buffer, incx, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::iamin, N,
-                                        x_buffer, incx, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during IAMIN:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of IAMIN:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_ref, 0, std::cout);
-
-    return (int)good;
-}
-
-class IaminTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(IaminTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IaminTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IaminTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IaminTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-
-INSTANTIATE_TEST_SUITE_P(IaminTestSuite, IaminTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/iamin_usm.cpp b/tests/unit_tests/blas/level1/iamin_usm.cpp
deleted file mode 100644
index a3523c8e7..000000000
--- a/tests/unit_tests/blas/level1/iamin_usm.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, usm::alloc alloc_type = usm::alloc::shared>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during IAMIN:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua);
-    int64_t result_ref = -1;
-    rand_vector(x, N, incx);
-
-    // Call Reference IAMIN.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx;
-
-    result_ref = ::iamin(&N_ref, (fp_ref*)x.data(), &incx_ref);
-
-    // Call DPC++ IAMIN.
-
-    int64_t* result_p;
-    if constexpr (alloc_type == usm::alloc::shared) {
-        result_p = (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t), *dev, cxt);
-    }
-    else if constexpr (alloc_type == usm::alloc::device) {
-        result_p = (int64_t*)oneapi::mkl::malloc_device(64, sizeof(int64_t), *dev, cxt);
-    }
-    else {
-        throw std::runtime_error("Bad alloc_type");
-    }
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::iamin(main_queue, N, x.data(), incx,
-                                                              result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::iamin(main_queue, N, x.data(), incx, result_p,
-                                                           dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::iamin, N,
-                                        x.data(), incx, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::iamin, N,
-                                        x.data(), incx, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during IAMIN:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of IAMIN:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_ptr(main_queue, result_p, result_ref, 0, std::cout);
-    oneapi::mkl::free_usm(result_p, cxt);
-
-    return (int)good;
-}
-
-class IaminUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(IaminUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((
-        test<float, usm::alloc::device>(std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IaminUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((test<double, usm::alloc::device>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IaminUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-TEST_P(IaminUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3));
-}
-
-INSTANTIATE_TEST_SUITE_P(IaminUsmTestSuite, IaminUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/nrm2.cpp b/tests/unit_tests/blas/level1/nrm2.cpp
deleted file mode 100644
index 423cecb59..000000000
--- a/tests/unit_tests/blas/level1/nrm2.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_res>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx) {
-    // Prepare data.
-    vector<fp> x;
-    fp_res result = fp_res(-1), result_ref = fp_res(-1);
-
-    rand_vector(x, N, incx);
-
-    // Call Reference NRM2.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = std::abs(incx);
-
-    result_ref = ::nrm2<fp_ref, fp_res>(&N_ref, (fp_ref*)x.data(), &incx_ref);
-
-    // Call DPC++ NRM2.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during NRM2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp_res, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::nrm2(main_queue, N, x_buffer, incx, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::nrm2(main_queue, N, x_buffer, incx, result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::nrm2, N,
-                                        x_buffer, incx, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::nrm2, N, x_buffer,
-                                        incx, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during NRM2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of NRM2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_ref, N, std::cout);
-
-    return (int)good;
-}
-
-class Nrm2Tests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(Nrm2Tests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-TEST_P(Nrm2Tests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-TEST_P(Nrm2Tests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, -3)));
-}
-TEST_P(Nrm2Tests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, -3)));
-}
-
-INSTANTIATE_TEST_SUITE_P(Nrm2TestSuite, Nrm2Tests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/nrm2_usm.cpp b/tests/unit_tests/blas/level1/nrm2_usm.cpp
deleted file mode 100644
index 8628738f4..000000000
--- a/tests/unit_tests/blas/level1/nrm2_usm.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_res, usm::alloc alloc_type = usm::alloc::shared>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during NRM2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua);
-    fp_res result_ref = fp_res(-1);
-
-    rand_vector(x, N, incx);
-
-    // Call Reference NRM2.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = std::abs(incx);
-
-    result_ref = ::nrm2<fp_ref, fp_res>(&N_ref, (fp_ref*)x.data(), &incx_ref);
-
-    // Call DPC++ NRM2.
-
-    fp_res* result_p;
-    if constexpr (alloc_type == usm::alloc::shared) {
-        result_p = (fp_res*)oneapi::mkl::malloc_shared(64, sizeof(fp_res), *dev, cxt);
-    }
-    else if constexpr (alloc_type == usm::alloc::device) {
-        result_p = (fp_res*)oneapi::mkl::malloc_device(64, sizeof(fp_res), *dev, cxt);
-    }
-    else {
-        throw std::runtime_error("Bad alloc_type");
-    }
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::nrm2(main_queue, N, x.data(), incx,
-                                                             result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::nrm2(main_queue, N, x.data(), incx, result_p,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::nrm2, N,
-                                        x.data(), incx, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::nrm2, N, x.data(),
-                                        incx, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during NRM2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of NRM2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_ptr(main_queue, result_p, result_ref, N, std::cout);
-    oneapi::mkl::free_usm(result_p, cxt);
-
-    return (int)good;
-}
-
-class Nrm2UsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(Nrm2UsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((test<float, float, usm::alloc::device>(std::get<0>(GetParam()),
-                                                              std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-TEST_P(Nrm2UsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((test<double, double, usm::alloc::device>(std::get<0>(GetParam()),
-                                                                std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3)));
-}
-TEST_P(Nrm2UsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, -3)));
-}
-TEST_P(Nrm2UsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 101, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(std::get<0>(GetParam()),
-                                                          std::get<1>(GetParam()), 1357, -3)));
-}
-
-INSTANTIATE_TEST_SUITE_P(Nrm2UsmTestSuite, Nrm2UsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rot.cpp b/tests/unit_tests/blas/level1/rot.cpp
deleted file mode 100644
index f65540182..000000000
--- a/tests/unit_tests/blas/level1/rot.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_scalar c,
-         fp_scalar s) {
-    // Prepare data.
-    vector<fp> x, x_ref, y, y_ref;
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-    y_ref = y;
-    x_ref = x;
-
-    // Call Reference ROT.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::rot(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref,
-          (fp_scalar *)&c, (fp_scalar *)&s);
-
-    // Call DPC++ ROT.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during ROT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::rot(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                     c, s);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::rot(main_queue, N, x_buffer, incx, y_buffer, incy, c,
-                                                  s);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rot, N,
-                                        x_buffer, incx, y_buffer, incy, c, s);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rot, N, x_buffer,
-                                        incx, y_buffer, incy, c, s);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during ROT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of ROT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good_x = check_equal_vector(x_accessor, x_ref, N, incx, N, std::cout);
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good_y = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
-
-    bool good = good_x && good_y;
-
-    return (int)good;
-}
-
-class RotTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(RotTests, RealSinglePrecision) {
-    float c(2.0);
-    float s(-0.5);
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, c, s)));
-}
-TEST_P(RotTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double c(2.0);
-    double s(-0.5);
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP((test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357,
-                                            -2, -3, c, s)));
-}
-TEST_P(RotTests, ComplexSinglePrecision) {
-    float c = 2.0;
-    float s = -0.5;
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, c, s)));
-}
-TEST_P(RotTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double c = 2.0;
-    double s = -0.5;
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, c, s)));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotTestSuite, RotTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rot_usm.cpp b/tests/unit_tests/blas/level1/rot_usm.cpp
deleted file mode 100644
index 287ac285b..000000000
--- a/tests/unit_tests/blas/level1/rot_usm.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_scalar c,
-         fp_scalar s) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during ROT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    auto x_ref = x;
-    auto y_ref = y;
-
-    // Call Reference ROT.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::rot(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref,
-          (fp_scalar *)&c, (fp_scalar *)&s);
-
-    // Call DPC++ ROT.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::rot(main_queue, N, x.data(), incx, y.data(),
-                                                            incy, c, s, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::rot(main_queue, N, x.data(), incx, y.data(),
-                                                         incy, c, s, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rot, N,
-                                        x.data(), incx, y.data(), incy, c, s, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rot, N, x.data(),
-                                        incx, y.data(), incy, c, s, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during ROT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of ROT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good_x = check_equal_vector(x, x_ref, N, incx, N, std::cout);
-    bool good_y = check_equal_vector(y, y_ref, N, incy, N, std::cout);
-    bool good = good_x && good_y;
-
-    return (int)good;
-}
-
-class RotUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(RotUsmTests, RealSinglePrecision) {
-    float c(2.0);
-    float s(-0.5);
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, c, s)));
-}
-TEST_P(RotUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double c(2.0);
-    double s(-0.5);
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP((test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357,
-                                            -2, -3, c, s)));
-}
-TEST_P(RotUsmTests, ComplexSinglePrecision) {
-    float c = 2.0;
-    float s = -0.5;
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, c, s)));
-}
-TEST_P(RotUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double c = 2.0;
-    double s = -0.5;
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, c, s)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, c, s)));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotUsmTestSuite, RotUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotg.cpp b/tests/unit_tests/blas/level1/rotg.cpp
deleted file mode 100644
index 1a0d569d8..000000000
--- a/tests/unit_tests/blas/level1/rotg.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Prepare data.
-    fp a, b, s, a_ref, b_ref, s_ref;
-    fp_scalar c, c_ref;
-
-    a = rand_scalar<fp>();
-    b = rand_scalar<fp>();
-    s = rand_scalar<fp>();
-    c = rand_scalar<fp_scalar>();
-
-    a_ref = a;
-    b_ref = b;
-    s_ref = s;
-    c_ref = c;
-
-    // Call Reference ROTG.
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::rotg((fp_ref *)&a_ref, (fp_ref *)&b_ref, (fp_scalar *)&c_ref, (fp_ref *)&s_ref);
-
-    // Call DPC++ ROTG.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during ROTG:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> a_buffer(&a, range<1>(1));
-    buffer<fp, 1> b_buffer(&b, range<1>(1));
-    buffer<fp_scalar, 1> c_buffer(&c, range<1>(1));
-    buffer<fp, 1> s_buffer(&s, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::rotg(main_queue, a_buffer, b_buffer, c_buffer,
-                                                      s_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::rotg(main_queue, a_buffer, b_buffer, c_buffer,
-                                                   s_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rotg, a_buffer,
-                                        b_buffer, c_buffer, s_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rotg, a_buffer,
-                                        b_buffer, c_buffer, s_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during ROTG:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto a_accessor = a_buffer.get_host_access(read_only);
-    bool good_a = check_equal(a_accessor[0], a_ref, 4, std::cout);
-    auto b_accessor = b_buffer.get_host_access(read_only);
-    bool good_b = check_equal(b_accessor[0], b_ref, 4, std::cout);
-    auto s_accessor = s_buffer.get_host_access(read_only);
-    bool good_s = check_equal(s_accessor[0], s_ref, 4, std::cout);
-    auto c_accessor = c_buffer.get_host_access(read_only);
-    bool good_c = check_equal(c_accessor[0], c_ref, 4, std::cout);
-
-    bool good = good_a && good_b && good_c && good_s;
-
-    return (int)good;
-}
-
-class RotgTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(RotgTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP((test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP((test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP((test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-TEST_P(RotgTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP((test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP((test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-TEST_P(RotgTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-TEST_P(RotgTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotgTestSuite, RotgTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotg_usm.cpp b/tests/unit_tests/blas/level1/rotg_usm.cpp
deleted file mode 100644
index de71a793d..000000000
--- a/tests/unit_tests/blas/level1/rotg_usm.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar, usm::alloc alloc_type = usm::alloc::shared>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during ROTG:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-
-    fp a, b, s, a_ref, b_ref, s_ref;
-    fp_scalar c, c_ref;
-
-    a = rand_scalar<fp>();
-    b = rand_scalar<fp>();
-    s = rand_scalar<fp>();
-    c = rand_scalar<fp_scalar>();
-    a_ref = a;
-    b_ref = b;
-    s_ref = s;
-    c_ref = c;
-
-    // Call Reference ROTG.
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::rotg((fp_ref *)&a_ref, (fp_ref *)&b_ref, (fp_scalar *)&c_ref, (fp_ref *)&s_ref);
-
-    // Call DPC++ ROTG.
-    fp *a_p, *b_p, *s_p;
-    fp_scalar *c_p;
-    if constexpr (alloc_type == usm::alloc::shared) {
-        a_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-        b_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-        s_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-        c_p = (fp_scalar *)oneapi::mkl::malloc_shared(64, sizeof(fp_scalar), *dev, cxt);
-    }
-    else if constexpr (alloc_type == usm::alloc::device) {
-        a_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt);
-        b_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt);
-        s_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt);
-        c_p = (fp_scalar *)oneapi::mkl::malloc_device(64, sizeof(fp_scalar), *dev, cxt);
-    }
-    else {
-        throw std::runtime_error("Bad alloc_type");
-    }
-
-    main_queue.memcpy(a_p, &a, sizeof(fp));
-    main_queue.memcpy(b_p, &b, sizeof(fp));
-    main_queue.memcpy(s_p, &s, sizeof(fp));
-    main_queue.memcpy(c_p, &c, sizeof(fp_scalar));
-    main_queue.wait();
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::rotg(main_queue, a_p, b_p, c_p, s_p,
-                                                             dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::rotg(main_queue, a_p, b_p, c_p, s_p,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rotg, a_p, b_p,
-                                        c_p, s_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rotg, a_p, b_p,
-                                        c_p, s_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during ROTG:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good_a = check_equal_ptr(main_queue, a_p, a_ref, 4, std::cout);
-    bool good_b = check_equal_ptr(main_queue, b_p, b_ref, 4, std::cout);
-    bool good_s = check_equal_ptr(main_queue, s_p, s_ref, 4, std::cout);
-    bool good_c = check_equal_ptr(main_queue, c_p, c_ref, 4, std::cout);
-
-    bool good = good_a && good_b && good_c && good_s;
-
-    oneapi::mkl::free_usm(a_p, cxt);
-    oneapi::mkl::free_usm(b_p, cxt);
-    oneapi::mkl::free_usm(s_p, cxt);
-    oneapi::mkl::free_usm(c_p, cxt);
-
-    return (int)good;
-}
-
-class RotgUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(RotgUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP((test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP(
-        (test<float, float, usm::alloc::device>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-TEST_P(RotgUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP((test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP((test<double, double, usm::alloc::device>(std::get<0>(GetParam()),
-                                                                std::get<1>(GetParam()))));
-}
-TEST_P(RotgUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-TEST_P(RotgUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double, usm::alloc::device>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotgUsmTestSuite, RotgUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotm.cpp b/tests/unit_tests/blas/level1/rotm.cpp
deleted file mode 100644
index ab2c599bf..000000000
--- a/tests/unit_tests/blas/level1/rotm.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp flag) {
-    // Prepare data.
-    vector<fp> x, x_ref, y, y_ref;
-    vector<fp> param;
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-    rand_vector(param, 5, 1);
-    param[0] = flag;
-    y_ref = y;
-    x_ref = x;
-
-    // Call Reference ROTM.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::rotm(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref,
-           (fp_ref *)param.data());
-
-    // Call DPC++ ROTM.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during ROTM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> param_buffer = make_buffer(param);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::rotm(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                      param_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::rotm(main_queue, N, x_buffer, incx, y_buffer, incy,
-                                                   param_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rotm, N,
-                                        x_buffer, incx, y_buffer, incy, param_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rotm, N, x_buffer,
-                                        incx, y_buffer, incy, param_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during ROTM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of ROTM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good_x = check_equal_vector(x_accessor, x_ref, N, incx, N, std::cout);
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good_y = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
-    bool good = good_x && good_y;
-
-    return (int)good;
-}
-
-class RotmTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(RotmTests, RealSinglePrecision) {
-    float flag(-1.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 0.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 1.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = -2.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-}
-TEST_P(RotmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double flag(-1.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 0.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 1.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = -2.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotmTestSuite, RotmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotm_usm.cpp b/tests/unit_tests/blas/level1/rotm_usm.cpp
deleted file mode 100644
index 7723e096c..000000000
--- a/tests/unit_tests/blas/level1/rotm_usm.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp flag) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during ROTM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), param(ua);
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-    rand_vector(param, 5, 1);
-    param[0] = flag;
-
-    auto x_ref = x;
-    auto y_ref = y;
-
-    // Call Reference ROTM.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::rotm(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref,
-           (fp_ref *)param.data());
-
-    // Call DPC++ ROTM.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::rotm(
-                    main_queue, N, x.data(), incx, y.data(), incy, param.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::rotm(main_queue, N, x.data(), incx, y.data(),
-                                                          incy, param.data(), dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rotm, N,
-                                        x.data(), incx, y.data(), incy, param.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rotm, N, x.data(),
-                                        incx, y.data(), incy, param.data(), dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during ROTM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of ROTM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good_x = check_equal_vector(x, x_ref, N, incx, N, std::cout);
-    bool good_y = check_equal_vector(y, y_ref, N, incy, N, std::cout);
-    bool good = good_x && good_y;
-
-    return (int)good;
-}
-
-class RotmUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(RotmUsmTests, RealSinglePrecision) {
-    float flag(-1.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 0.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 1.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = -2.0;
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-}
-TEST_P(RotmUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double flag(-1.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 0.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = 1.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-    flag = -2.0;
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, flag));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, flag));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotmUsmTestSuite, RotmUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotmg.cpp b/tests/unit_tests/blas/level1/rotmg.cpp
deleted file mode 100644
index f62bd1cf9..000000000
--- a/tests/unit_tests/blas/level1/rotmg.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout) {
-    // Prepare data.
-    fp d1, d2, x1, y1, d1_ref, d2_ref, x1_ref;
-    vector<fp> param(5, fp(0)), param_ref(5, fp(0));
-
-    d1 = rand_scalar<fp>();
-    d1 = abs(d1);
-    d2 = rand_scalar<fp>();
-    x1 = rand_scalar<fp>();
-    y1 = rand_scalar<fp>();
-    d1_ref = d1;
-    d2_ref = d2;
-    x1_ref = x1;
-
-    // Call Reference ROTMG.
-
-    ::rotmg(&d1_ref, &d2_ref, &x1_ref, &y1, (fp*)param_ref.data());
-
-    // Call DPC++ ROTMG.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during ROTMG:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> d1_buffer(&d1, range<1>(1));
-    buffer<fp, 1> d2_buffer(&d2, range<1>(1));
-    buffer<fp, 1> x1_buffer(&x1, range<1>(1));
-    buffer<fp, 1> param_buffer = make_buffer(param);
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::rotmg(main_queue, d1_buffer, d2_buffer, x1_buffer,
-                                                       y1, param_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::rotmg(main_queue, d1_buffer, d2_buffer, x1_buffer, y1,
-                                                    param_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rotmg,
-                                        d1_buffer, d2_buffer, x1_buffer, y1, param_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rotmg, d1_buffer,
-                                        d2_buffer, x1_buffer, y1, param_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during ROTMG:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of ROTMG:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    int error_mag = 50;
-
-    auto d1_accessor = d1_buffer.get_host_access(read_only);
-    bool good_d1 = check_equal(d1_accessor[0], d1_ref, error_mag, std::cout);
-    auto d2_accessor = d2_buffer.get_host_access(read_only);
-    bool good_d2 = check_equal(d2_accessor[0], d2_ref, error_mag, std::cout);
-    auto x1_accessor = x1_buffer.get_host_access(read_only);
-    bool good_x1 = check_equal(x1_accessor[0], x1_ref, error_mag, std::cout);
-    auto param_accessor = param_buffer.get_host_access(read_only);
-
-    constexpr fp unit_matrix = -2;
-    constexpr fp rescaled_matrix = -1;
-    constexpr fp sltc_matrix = 0;
-    constexpr fp clts_matrix = 1;
-
-    fp flag = param_accessor[0];
-    fp h11 = param_accessor[1];
-    fp h12 = param_accessor[3];
-    fp h21 = param_accessor[2];
-    fp h22 = param_accessor[4];
-
-    fp flag_ref = param_ref[0];
-    fp h11_ref = param_ref[1];
-    fp h12_ref = param_ref[3];
-    fp h21_ref = param_ref[2];
-    fp h22_ref = param_ref[4];
-
-    bool flag_good = (flag_ref == flag);
-    bool h11_good = true;
-    bool h12_good = true;
-    bool h21_good = true;
-    bool h22_good = true;
-
-    /* Some values of param have to be ignored depending on the flag value since they are
-     * implementation defined */
-    if (flag_ref != unit_matrix) {
-        if (flag_ref == sltc_matrix) {
-            h12_good = check_equal(h12, h12_ref, error_mag, std::cout);
-            h21_good = check_equal(h21, h21_ref, error_mag, std::cout);
-        }
-        else if (flag_ref == clts_matrix) {
-            h11_good = check_equal(h11, h11_ref, error_mag, std::cout);
-            h22_good = check_equal(h22, h22_ref, error_mag, std::cout);
-        }
-        else {
-            flag_good = flag_good && (flag == rescaled_matrix);
-            h11_good = check_equal(h11, h11_ref, error_mag, std::cout);
-            h12_good = check_equal(h12, h12_ref, error_mag, std::cout);
-            h21_good = check_equal(h21, h21_ref, error_mag, std::cout);
-            h22_good = check_equal(h22, h22_ref, error_mag, std::cout);
-        }
-    }
-
-    bool good =
-        good_d1 && good_d2 && good_x1 && flag_good && h11_good && h12_good && h21_good && h22_good;
-
-    return (int)good;
-}
-
-class RotmgTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(RotmgTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-TEST_P(RotmgTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotmgTestSuite, RotmgTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/rotmg_usm.cpp b/tests/unit_tests/blas/level1/rotmg_usm.cpp
deleted file mode 100644
index 92eeee491..000000000
--- a/tests/unit_tests/blas/level1/rotmg_usm.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, usm::alloc alloc_type = usm::alloc::shared>
-int test(device *dev, oneapi::mkl::layout layout) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during ROTMG:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> param(5, fp(0), ua), param_ref(5, fp(0), ua);
-    fp d1, d2, x1, y1, d1_ref, d2_ref, x1_ref;
-
-    d1 = rand_scalar<fp>();
-    d1 = abs(d1);
-    d2 = rand_scalar<fp>();
-    x1 = rand_scalar<fp>();
-    y1 = rand_scalar<fp>();
-    d1_ref = d1;
-    d2_ref = d2;
-    x1_ref = x1;
-
-    fp *d1_p, *d2_p, *x1_p;
-    if constexpr (alloc_type == usm::alloc::device) {
-        d1_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt);
-        d2_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt);
-        x1_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt);
-    }
-    else if constexpr (alloc_type == usm::alloc::shared) {
-        d1_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-        d2_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-        x1_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt);
-    }
-    else {
-        throw std::runtime_error("Bad alloc_type");
-    }
-    main_queue.memcpy(d1_p, &d1, sizeof(fp));
-    main_queue.memcpy(d2_p, &d2, sizeof(fp));
-    main_queue.memcpy(x1_p, &x1, sizeof(fp));
-    main_queue.wait();
-
-    // Call Reference ROTMG.
-
-    ::rotmg(&d1_ref, &d2_ref, &x1_ref, &y1, (fp *)param_ref.data());
-
-    // Call DPC++ ROTMG.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::rotmg(main_queue, d1_p, d2_p, x1_p, y1,
-                                                              param.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::rotmg(main_queue, d1_p, d2_p, x1_p, y1,
-                                                           param.data(), dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::rotmg, d1_p,
-                                        d2_p, x1_p, y1, param.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::rotmg, d1_p, d2_p,
-                                        x1_p, y1, param.data(), dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during ROTMG:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of ROTMG:\n" << error.what() << std::endl;
-    }
-
-    int error_mag = 50;
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good_d1 = check_equal_ptr(main_queue, d1_p, d1_ref, error_mag, std::cout);
-    bool good_d2 = check_equal_ptr(main_queue, d2_p, d2_ref, error_mag, std::cout);
-    bool good_x1 = check_equal_ptr(main_queue, x1_p, x1_ref, error_mag, std::cout);
-
-    constexpr fp unit_matrix = -2;
-    constexpr fp rescaled_matrix = -1;
-    constexpr fp sltc_matrix = 0;
-    constexpr fp clts_matrix = 1;
-
-    fp param_host[5];
-    main_queue.memcpy(&param_host, param.data(), sizeof(fp) * 5);
-    main_queue.wait();
-
-    fp flag = param_host[0];
-    fp h11 = param_host[1];
-    fp h12 = param_host[3];
-    fp h21 = param_host[2];
-    fp h22 = param_host[4];
-
-    fp flag_ref = param_ref[0];
-    fp h11_ref = param_ref[1];
-    fp h12_ref = param_ref[3];
-    fp h21_ref = param_ref[2];
-    fp h22_ref = param_ref[4];
-
-    bool flag_good = (flag_ref == flag);
-    bool h11_good = true;
-    bool h12_good = true;
-    bool h21_good = true;
-    bool h22_good = true;
-
-    /* Some values of param have to be ignored depending on the flag value since they are
-     * implementation defined */
-    if (flag_ref != unit_matrix) {
-        if (flag_ref == sltc_matrix) {
-            h12_good = check_equal(h12, h12_ref, error_mag, std::cout);
-            h21_good = check_equal(h21, h21_ref, error_mag, std::cout);
-        }
-        else if (flag_ref == clts_matrix) {
-            h11_good = check_equal(h11, h11_ref, error_mag, std::cout);
-            h22_good = check_equal(h22, h22_ref, error_mag, std::cout);
-        }
-        else {
-            flag_good = flag_good && (flag == rescaled_matrix);
-            h11_good = check_equal(h11, h11_ref, error_mag, std::cout);
-            h12_good = check_equal(h12, h12_ref, error_mag, std::cout);
-            h21_good = check_equal(h21, h21_ref, error_mag, std::cout);
-            h22_good = check_equal(h22, h22_ref, error_mag, std::cout);
-        }
-    }
-
-    bool good =
-        good_d1 && good_d2 && good_x1 && flag_good && h11_good && h12_good && h21_good && h22_good;
-
-    oneapi::mkl::free_usm(d1_p, cxt);
-    oneapi::mkl::free_usm(d2_p, cxt);
-    oneapi::mkl::free_usm(x1_p, cxt);
-
-    return (int)good;
-}
-
-class RotmgUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(RotmgUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-    EXPECT_TRUEORSKIP(
-        (test<float, usm::alloc::device>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-TEST_P(RotmgUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam())));
-    EXPECT_TRUEORSKIP(
-        (test<double, usm::alloc::device>(std::get<0>(GetParam()), std::get<1>(GetParam()))));
-}
-
-INSTANTIATE_TEST_SUITE_P(RotmgUsmTestSuite, RotmgUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/scal.cpp b/tests/unit_tests/blas/level1/scal.cpp
deleted file mode 100644
index 8901bb424..000000000
--- a/tests/unit_tests/blas/level1/scal.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, fp_scalar alpha) {
-    // Prepare data.
-    vector<fp> x, x_ref;
-
-    rand_vector(x, N, incx);
-    x_ref = x;
-
-    // Call Reference SCAL.
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    const int N_ref = N, incx_ref = std::abs(incx);
-
-    ::scal(&N_ref, (fp_scalar_mkl*)&alpha, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ SCAL.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SCAL:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::scal(main_queue, N, alpha, x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::scal(main_queue, N, alpha, x_buffer, incx);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::scal, N, alpha,
-                                        x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::scal, N, alpha,
-                                        x_buffer, incx);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SCAL:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SCAL:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(x_accessor, x_ref, N, incx, N, std::cout);
-
-    return (int)good;
-}
-
-class ScalTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(ScalTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalTests, ComplexRealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalTests, ComplexRealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-
-INSTANTIATE_TEST_SUITE_P(ScalTestSuite, ScalTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/scal_usm.cpp b/tests/unit_tests/blas/level1/scal_usm.cpp
deleted file mode 100644
index e669deb2d..000000000
--- a/tests/unit_tests/blas/level1/scal_usm.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, fp_scalar alpha) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SCAL:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua);
-
-    rand_vector(x, N, incx);
-
-    auto x_ref = x;
-
-    // Call Reference SCAL.
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    const int N_ref = N, incx_ref = std::abs(incx);
-
-    ::scal(&N_ref, (fp_scalar_mkl*)&alpha, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ SCAL.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::scal(main_queue, N, alpha, x.data(), incx,
-                                                             dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::scal(main_queue, N, alpha, x.data(), incx,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::scal, N, alpha,
-                                        x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::scal, N, alpha,
-                                        x.data(), incx, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SCAL:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SCAL:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(x, x_ref, N, incx, N, std::cout);
-
-    return (int)good;
-}
-
-class ScalUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(ScalUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP(
-        (test<float, float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP(
-        (test<double, double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalUsmTests, ComplexRealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(std::get<0>(GetParam()),
-                                                        std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-TEST_P(ScalUsmTests, ComplexRealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, alpha)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -3, alpha)));
-}
-
-INSTANTIATE_TEST_SUITE_P(ScalUsmTestSuite, ScalUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/sdsdot.cpp b/tests/unit_tests/blas/level1/sdsdot.cpp
deleted file mode 100644
index 7293a3699..000000000
--- a/tests/unit_tests/blas/level1/sdsdot.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, float alpha) {
-    // Prepare data.
-    vector<float> x, y;
-    float result = float(-1), result_ref = float(-1);
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference SDSDOT.
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    result_ref = ::sdsdot(&N_ref, (float *)&alpha, (float *)x.data(), &incx_ref, (float *)y.data(),
-                          &incy_ref);
-
-    // Call DPC++ SDSDOT.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SDSDOT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<float, 1> x_buffer = make_buffer(x);
-    buffer<float, 1> y_buffer = make_buffer(y);
-    buffer<float, 1> result_buffer(&result, range<1>(1));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::sdsdot(main_queue, N, alpha, x_buffer, incx,
-                                                        y_buffer, incy, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::sdsdot(main_queue, N, alpha, x_buffer, incx, y_buffer,
-                                                     incy, result_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::sdsdot, N,
-                                        alpha, x_buffer, incx, y_buffer, incy, result_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::sdsdot, N, alpha,
-                                        x_buffer, incx, y_buffer, incy, result_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SDSDOT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto result_accessor = result_buffer.get_host_access(read_only);
-    bool good = check_equal(result_accessor[0], result_ref, N, std::cout);
-
-    return (int)good;
-}
-
-class SdsdotTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SdsdotTests, RealSinglePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-    EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, 2.0));
-    EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, 2.0));
-    EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, 2.0));
-}
-
-INSTANTIATE_TEST_SUITE_P(SdsdotTestSuite, SdsdotTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/sdsdot_usm.cpp b/tests/unit_tests/blas/level1/sdsdot_usm.cpp
deleted file mode 100644
index a5740516c..000000000
--- a/tests/unit_tests/blas/level1/sdsdot_usm.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, float alpha) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SDSDOT:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<float, usm::alloc::shared, 64>(cxt, *dev);
-    vector<float, decltype(ua)> x(ua), y(ua);
-    float result_ref = float(-1);
-
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    // Call Reference SDSDOT.
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    result_ref = ::sdsdot(&N_ref, (float *)&alpha, (float *)x.data(), &incx_ref, (float *)y.data(),
-                          &incy_ref);
-
-    // Call DPC++ SDSDOT.
-
-    auto result_p = (float *)oneapi::mkl::malloc_shared(64, sizeof(float), *dev, cxt);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::sdsdot(
-                    main_queue, N, alpha, x.data(), incx, y.data(), incy, result_p, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::sdsdot(main_queue, N, alpha, x.data(), incx,
-                                                            y.data(), incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::sdsdot, N,
-                                        alpha, x.data(), incx, y.data(), incy, result_p,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::sdsdot, N, alpha,
-                                        x.data(), incx, y.data(), incy, result_p, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SDSDOT:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal(*result_p, result_ref, N, std::cout);
-    oneapi::mkl::free_shared(result_p, cxt);
-
-    return (int)good;
-}
-
-class SdsdotUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SdsdotUsmTests, RealSinglePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-    EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3, 2.0));
-    EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3, 2.0));
-    EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1, 2.0));
-}
-
-INSTANTIATE_TEST_SUITE_P(SdsdotUsmTestSuite, SdsdotUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/swap.cpp b/tests/unit_tests/blas/level1/swap.cpp
deleted file mode 100644
index 6c6721537..000000000
--- a/tests/unit_tests/blas/level1/swap.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, x_ref, y, y_ref;
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-    y_ref = y;
-    x_ref = x;
-
-    // Call Reference SWAP.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::swap(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SWAP.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SWAP:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::swap(main_queue, N, x_buffer, incx, y_buffer,
-                                                      incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::swap(main_queue, N, x_buffer, incx, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::swap, N,
-                                        x_buffer, incx, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::swap, N, x_buffer,
-                                        incx, y_buffer, incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SWAP:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SWAP:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good_y = check_equal_vector(y_accessor, y_ref, N, incy, N, std::cout);
-    bool good_x = check_equal_vector(x_accessor, x_ref, N, incx, N, std::cout);
-    bool good = good_x && good_y;
-
-    return (int)good;
-}
-
-class SwapTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(SwapTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-TEST_P(SwapTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-TEST_P(SwapTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-TEST_P(SwapTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(SwapTestSuite, SwapTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level1/swap_usm.cpp b/tests/unit_tests/blas/level1/swap_usm.cpp
deleted file mode 100644
index de20f3eb7..000000000
--- a/tests/unit_tests/blas/level1/swap_usm.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SWAP:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua);
-    rand_vector(x, N, incx);
-    rand_vector(y, N, incy);
-
-    auto x_ref = x;
-    auto y_ref = y;
-
-    // Call Reference SWAP.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int N_ref = N, incx_ref = incx, incy_ref = incy;
-
-    ::swap(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SWAP.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::swap(main_queue, N, x.data(), incx,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::swap(main_queue, N, x.data(), incx, y.data(),
-                                                          incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::swap, N,
-                                        x.data(), incx, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::swap, N, x.data(),
-                                        incx, y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SWAP:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SWAP:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good_y = check_equal_vector(y, y_ref, N, incy, N, std::cout);
-    bool good_x = check_equal_vector(x, x_ref, N, incx, N, std::cout);
-    bool good = good_x && good_y;
-
-    return (int)good;
-}
-
-class SwapUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(SwapUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-TEST_P(SwapUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-TEST_P(SwapUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-TEST_P(SwapUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 2, 3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, -2, -3));
-    EXPECT_TRUEORSKIP(
-        test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()), 1357, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(SwapUsmTestSuite, SwapUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/CMakeLists.txt b/tests/unit_tests/blas/level2/CMakeLists.txt
deleted file mode 100644
index a61bcfeef..000000000
--- a/tests/unit_tests/blas/level2/CMakeLists.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(L2_SOURCES "hpr2.cpp" "hpmv.cpp" "her.cpp" "her2.cpp" "hemv.cpp" "hbmv.cpp" "geru.cpp" "ger.cpp" "gerc.cpp" "gemv.cpp" "gbmv.cpp" "trsv.cpp" "trmv.cpp" "tpsv.cpp" "tpmv.cpp" "tbsv.cpp" "tbmv.cpp" "syr.cpp" "syr2.cpp" "symv.cpp" "spr.cpp" "spr2.cpp" "spmv.cpp" "sbmv.cpp" "hpr.cpp" "hpr2_usm.cpp" "hpmv_usm.cpp" "her_usm.cpp" "her2_usm.cpp" "hemv_usm.cpp" "hbmv_usm.cpp" "geru_usm.cpp" "ger_usm.cpp" "gerc_usm.cpp" "gemv_usm.cpp" "gbmv_usm.cpp" "trsv_usm.cpp" "trmv_usm.cpp" "tpsv_usm.cpp" "tpmv_usm.cpp" "tbsv_usm.cpp" "tbmv_usm.cpp" "syr_usm.cpp" "syr2_usm.cpp" "symv_usm.cpp" "spr_usm.cpp" "spr2_usm.cpp" "spmv_usm.cpp" "sbmv_usm.cpp" "hpr_usm.cpp")
-
-if(BUILD_SHARED_LIBS)
-  add_library(blas_level2_rt OBJECT ${L2_SOURCES})
-  target_compile_options(blas_level2_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(blas_level2_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-      PUBLIC ${CBLAS_INCLUDE}
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET blas_level2_rt SOURCES ${L2_SOURCES})
-  else()
-    target_link_libraries(blas_level2_rt PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-endif()
-
-add_library(blas_level2_ct OBJECT ${L2_SOURCES})
-target_compile_options(blas_level2_ct PRIVATE -DNOMINMAX)
-target_include_directories(blas_level2_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-    PUBLIC ${CBLAS_INCLUDE}
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET blas_level2_ct SOURCES ${L2_SOURCES})
-else()
-  target_link_libraries(blas_level2_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
diff --git a/tests/unit_tests/blas/level2/gbmv.cpp b/tests/unit_tests/blas/level2/gbmv.cpp
deleted file mode 100644
index 94fcbc906..000000000
--- a/tests/unit_tests/blas/level2/gbmv.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n,
-         int kl, int ku, fp alpha, fp beta, int incx, int incy, int lda) {
-    // Prepare data.
-    int x_len = outer_dimension(transa, m, n);
-    int y_len = inner_dimension(transa, m, n);
-
-    vector<fp> x, y, y_ref, A;
-
-    rand_vector(x, x_len, incx);
-    rand_vector(y, y_len, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-
-    // Call Reference GBMV.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    int kl_ref = kl, ku_ref = ku;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gbmv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref, &kl_ref,
-           &ku_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ GBMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gbmv(main_queue, transa, m, n, kl, ku, alpha,
-                                                      A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                                      incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gbmv(main_queue, transa, m, n, kl, ku, alpha,
-                                                   A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                                   incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gbmv, transa,
-                                        m, n, kl, ku, alpha, A_buffer, lda, x_buffer, incx, beta,
-                                        y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gbmv, transa, m,
-                                        n, kl, ku, alpha, A_buffer, lda, x_buffer, incx, beta,
-                                        y_buffer, incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GbmvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(GbmvTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2,
-                                  -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1,
-                                  42));
-}
-TEST_P(GbmvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2,
-                                   3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2,
-                                   -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1,
-                                   1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1,
-                                   42));
-}
-TEST_P(GbmvTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                alpha, beta, 1, 1, 42));
-}
-TEST_P(GbmvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                 alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                 alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                 alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                 beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                 beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                 beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                 alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                 alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                 alpha, beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GbmvTestSuite, GbmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/gbmv_usm.cpp b/tests/unit_tests/blas/level2/gbmv_usm.cpp
deleted file mode 100644
index 9d92fcf7e..000000000
--- a/tests/unit_tests/blas/level2/gbmv_usm.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n,
-         int kl, int ku, fp alpha, fp beta, int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    int x_len = outer_dimension(transa, m, n);
-    int y_len = inner_dimension(transa, m, n);
-
-    rand_vector(x, x_len, incx);
-    rand_vector(y, y_len, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-
-    auto y_ref = y;
-
-    // Call Reference GBMV.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    int kl_ref = kl, ku_ref = ku;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gbmv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref, &kl_ref,
-           &ku_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ GBMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gbmv(main_queue, transa, m, n, kl, ku,
-                                                             alpha, A.data(), lda, x.data(), incx,
-                                                             beta, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gbmv(main_queue, transa, m, n, kl, ku, alpha,
-                                                          A.data(), lda, x.data(), incx, beta,
-                                                          y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gbmv, transa,
-                                        m, n, kl, ku, alpha, A.data(), lda, x.data(), incx, beta,
-                                        y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gbmv, transa, m,
-                                        n, kl, ku, alpha, A.data(), lda, x.data(), incx, beta,
-                                        y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GbmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GbmvUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2, 3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2,
-                                  -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1, 1,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1,
-                                  42));
-}
-TEST_P(GbmvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 2,
-                                   3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, -2,
-                                   -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, 5, 7, alpha, beta, 1,
-                                   1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 2, 3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, -2, -3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha, beta, 1, 1,
-                                   42));
-}
-TEST_P(GbmvUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                alpha, beta, 1, 1, 42));
-}
-TEST_P(GbmvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                 alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                 alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, 5, 7,
-                                                 alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                 beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                 beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, 5, 7, alpha,
-                                                 beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                 alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                 alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, 5, 7,
-                                                 alpha, beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GbmvUsmTestSuite, GbmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/gemv.cpp b/tests/unit_tests/blas/level2/gemv.cpp
deleted file mode 100644
index 3bfff4324..000000000
--- a/tests/unit_tests/blas/level2/gemv.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n,
-         fp alpha, fp beta, int incx, int incy, int lda) {
-    // Prepare data.
-    int x_len = outer_dimension(transa, m, n);
-    int y_len = inner_dimension(transa, m, n);
-
-    vector<fp> x, y, y_ref, A;
-
-    rand_vector(x, x_len, incx);
-    rand_vector(y, y_len, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-
-    // Call Reference GEMV.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ GEMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gemv(main_queue, transa, m, n, alpha, A_buffer,
-                                                      lda, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gemv(main_queue, transa, m, n, alpha, A_buffer, lda,
-                                                   x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemv, transa,
-                                        m, n, alpha, A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                        incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemv, transa, m,
-                                        n, alpha, A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                        incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GemvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(GemvTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
-}
-TEST_P(GemvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
-}
-
-TEST_P(GemvTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 2, 3, alpha, beta,
-                                                2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 2, 3, alpha, beta,
-                                                -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 2, 3, alpha, beta,
-                                                1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 2, 3, alpha, beta, 2,
-                                                3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 2, 3, alpha, beta,
-                                                -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 2, 3, alpha, beta, 1,
-                                                1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 2, 3, alpha,
-                                                beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 2, 3, alpha,
-                                                beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 2, 3, alpha,
-                                                beta, 1, 1, 42));
-}
-
-TEST_P(GemvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                 beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                 beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                 beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                 -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                 beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                 beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                 beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemvTestSuite, GemvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/gemv_usm.cpp b/tests/unit_tests/blas/level2/gemv_usm.cpp
deleted file mode 100644
index d1e726e38..000000000
--- a/tests/unit_tests/blas/level2/gemv_usm.cpp
+++ /dev/null
@@ -1,258 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n,
-         fp alpha, fp beta, int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    int x_len = outer_dimension(transa, m, n);
-    int y_len = inner_dimension(transa, m, n);
-
-    rand_vector(x, x_len, incx);
-    rand_vector(y, y_len, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-
-    auto y_ref = y;
-
-    // Call Reference GEMV.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ GEMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemv(main_queue, transa, m, n, alpha,
-                                                             A.data(), lda, x.data(), incx, beta,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemv(main_queue, transa, m, n, alpha, A.data(),
-                                                          lda, x.data(), incx, beta, y.data(), incy,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemv, transa,
-                                        m, n, alpha, A.data(), lda, x.data(), incx, beta, y.data(),
-                                        incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemv, transa, m,
-                                        n, alpha, A.data(), lda, x.data(), incx, beta, y.data(),
-                                        incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GEMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, y_len, incy, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GemvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GemvUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3,
-                                  42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
-}
-TEST_P(GemvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 2, 3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, -2, -3,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::nontrans, 25, 30, alpha, beta, 1, 1,
-                                   42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::transpose::trans, 25, 30, alpha, beta, 1, 1, 42));
-}
-TEST_P(GemvUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                beta, 1, 1, 42));
-}
-TEST_P(GemvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                 beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                 beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::nontrans, 25, 30, alpha,
-                                                 beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                 -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::trans, 25, 30, alpha, beta,
-                                                 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                 beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                 beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::transpose::conjtrans, 25, 30, alpha,
-                                                 beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemvUsmTestSuite, GemvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/ger.cpp b/tests/unit_tests/blas/level2/ger.cpp
deleted file mode 100644
index 3b32d2827..000000000
--- a/tests/unit_tests/blas/level2/ger.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy,
-         int lda) {
-    // Prepare data.
-
-    vector<fp> x, y, A_ref, A;
-
-    rand_vector(x, m, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-    A_ref = A;
-
-    // Call Reference GER.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::ger(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
-          &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ GER.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GER:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::ger(main_queue, m, n, alpha, x_buffer, incx,
-                                                     y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::ger(main_queue, m, n, alpha, x_buffer, incx, y_buffer,
-                                                  incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::ger, m, n,
-                                        alpha, x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::ger, m, n, alpha,
-                                        x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GER:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(A_accessor, A_ref, layout, m, n, lda, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GerTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(GerTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 1, 1, 42));
-}
-TEST_P(GerTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GerTestSuite, GerTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/ger_usm.cpp b/tests/unit_tests/blas/level2/ger_usm.cpp
deleted file mode 100644
index 87087f026..000000000
--- a/tests/unit_tests/blas/level2/ger_usm.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy,
-         int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GER:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-
-    rand_vector(x, m, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-
-    auto A_ref = A;
-
-    // Call Reference GER.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::ger(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
-          &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ GER.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::ger(main_queue, m, n, alpha, x.data(), incx,
-                                                            y.data(), incy, A.data(), lda,
-                                                            dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done =
-                    oneapi::mkl::blas::row_major::ger(main_queue, m, n, alpha, x.data(), incx,
-                                                      y.data(), incy, A.data(), lda, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::ger, m, n,
-                                        alpha, x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::ger, m, n, alpha,
-                                        x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GER:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, m, n, lda, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GerUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GerUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(
-        test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 1, 1, 42));
-}
-TEST_P(GerUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(
-        test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()), 25, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GerUsmTestSuite, GerUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/gerc.cpp b/tests/unit_tests/blas/level2/gerc.cpp
deleted file mode 100644
index c19c9f029..000000000
--- a/tests/unit_tests/blas/level2/gerc.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy,
-         int lda) {
-    // Prepare data.
-
-    vector<fp> x, y, A_ref, A;
-
-    rand_vector(x, m, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-    A_ref = A;
-
-    // Call Reference GERC.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gerc(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
-           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ GERC.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GERC:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gerc(main_queue, m, n, alpha, x_buffer, incx,
-                                                      y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gerc(main_queue, m, n, alpha, x_buffer, incx,
-                                                   y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gerc, m, n,
-                                        alpha, x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gerc, m, n, alpha,
-                                        x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GERC:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(A_accessor, A_ref, layout, m, n, lda, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GercTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(GercTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 1, 1, 42));
-}
-TEST_P(GercTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GercTestSuite, GercTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/gerc_usm.cpp b/tests/unit_tests/blas/level2/gerc_usm.cpp
deleted file mode 100644
index b6473484d..000000000
--- a/tests/unit_tests/blas/level2/gerc_usm.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy,
-         int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GERC:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-
-    rand_vector(x, m, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-
-    auto A_ref = A;
-
-    // Call Reference GERC.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::gerc(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
-           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ GERC.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gerc(main_queue, m, n, alpha, x.data(),
-                                                             incx, y.data(), incy, A.data(), lda,
-                                                             dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done =
-                    oneapi::mkl::blas::row_major::gerc(main_queue, m, n, alpha, x.data(), incx,
-                                                       y.data(), incy, A.data(), lda, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gerc, m, n,
-                                        alpha, x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gerc, m, n, alpha,
-                                        x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GERC:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, m, n, lda, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GercUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GercUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 1, 1, 42));
-}
-TEST_P(GercUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GercUsmTestSuite, GercUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/geru.cpp b/tests/unit_tests/blas/level2/geru.cpp
deleted file mode 100644
index e0cb7c45d..000000000
--- a/tests/unit_tests/blas/level2/geru.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy,
-         int lda) {
-    // Prepare data.
-
-    vector<fp> x, y, A_ref, A;
-
-    rand_vector(x, m, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-    A_ref = A;
-
-    // Call Reference GERU.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::geru(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
-           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ GERU.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GERU:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::geru(main_queue, m, n, alpha, x_buffer, incx,
-                                                      y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::geru(main_queue, m, n, alpha, x_buffer, incx,
-                                                   y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::geru, m, n,
-                                        alpha, x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::geru, m, n, alpha,
-                                        x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GERU:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(A_accessor, A_ref, layout, m, n, lda, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GeruTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(GeruTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 1, 1, 42));
-}
-TEST_P(GeruTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GeruTestSuite, GeruTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/geru_usm.cpp b/tests/unit_tests/blas/level2/geru_usm.cpp
deleted file mode 100644
index 1e882bd97..000000000
--- a/tests/unit_tests/blas/level2/geru_usm.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy,
-         int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during GERU:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-
-    rand_vector(x, m, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, n, lda);
-
-    auto A_ref = A;
-
-    // Call Reference GERU.
-    const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::geru(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(),
-           &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ GERU.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::geru(main_queue, m, n, alpha, x.data(),
-                                                             incx, y.data(), incy, A.data(), lda,
-                                                             dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done =
-                    oneapi::mkl::blas::row_major::geru(main_queue, m, n, alpha, x.data(), incx,
-                                                       y.data(), incy, A.data(), lda, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::geru, m, n,
-                                        alpha, x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::geru, m, n, alpha,
-                                        x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during GERU:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, m, n, lda, std::max<int>(m, n), std::cout);
-
-    return (int)good;
-}
-
-class GeruUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(GeruUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                25, 30, alpha, 1, 1, 42));
-}
-TEST_P(GeruUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 25, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(GeruUsmTestSuite, GeruUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hbmv.cpp b/tests/unit_tests/blas/level2/hbmv.cpp
deleted file mode 100644
index 119aef32a..000000000
--- a/tests/unit_tests/blas/level2/hbmv.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k,
-         fp alpha, fp beta, int incx, int incy, int lda) {
-    // Prepare data.
-    vector<fp> x, y, y_ref, A;
-
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    // Call Reference HBMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ HBMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::hbmv(main_queue, upper_lower, n, k, alpha,
-                                                      A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                                      incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::hbmv(main_queue, upper_lower, n, k, alpha, A_buffer,
-                                                   lda, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hbmv,
-                                        upper_lower, n, k, alpha, A_buffer, lda, x_buffer, incx,
-                                        beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hbmv, upper_lower,
-                                        n, k, alpha, A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                        incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class HbmvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(HbmvTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2,
-                                                -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2,
-                                                -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1,
-                                                42));
-}
-TEST_P(HbmvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2,
-                                                 -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2,
-                                                 -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1,
-                                                 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(HbmvTestSuite, HbmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hbmv_usm.cpp b/tests/unit_tests/blas/level2/hbmv_usm.cpp
deleted file mode 100644
index 60305cb93..000000000
--- a/tests/unit_tests/blas/level2/hbmv_usm.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k,
-         fp alpha, fp beta, int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto y_ref = y;
-
-    // Call Reference HBMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ HBMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::hbmv(main_queue, upper_lower, n, k, alpha,
-                                                             A.data(), lda, x.data(), incx, beta,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::hbmv(main_queue, upper_lower, n, k, alpha,
-                                                          A.data(), lda, x.data(), incx, beta,
-                                                          y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hbmv,
-                                        upper_lower, n, k, alpha, A.data(), lda, x.data(), incx,
-                                        beta, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hbmv, upper_lower,
-                                        n, k, alpha, A.data(), lda, x.data(), incx, beta, y.data(),
-                                        incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class HbmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(HbmvUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2,
-                                                -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2,
-                                                -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1,
-                                                42));
-}
-TEST_P(HbmvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2,
-                                                 -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2,
-                                                 -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1,
-                                                 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(HbmvUsmTestSuite, HbmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hemv.cpp b/tests/unit_tests/blas/level2/hemv.cpp
deleted file mode 100644
index 3636e3774..000000000
--- a/tests/unit_tests/blas/level2/hemv.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy, int lda) {
-    // Prepare data.
-    vector<fp> x, y, y_ref, A;
-
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    // Call Reference HEMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hemv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ HEMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HEMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::hemv(main_queue, upper_lower, n, alpha, A_buffer,
-                                                      lda, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::hemv(main_queue, upper_lower, n, alpha, A_buffer, lda,
-                                                   x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hemv,
-                                        upper_lower, n, alpha, A_buffer, lda, x_buffer, incx, beta,
-                                        y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hemv, upper_lower,
-                                        n, alpha, A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                        incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HEMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class HemvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(HemvTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1,
-                                                42));
-}
-TEST_P(HemvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1,
-                                                 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(HemvTestSuite, HemvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hemv_usm.cpp b/tests/unit_tests/blas/level2/hemv_usm.cpp
deleted file mode 100644
index a1b8093fc..000000000
--- a/tests/unit_tests/blas/level2/hemv_usm.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HEMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto y_ref = y;
-
-    // Call Reference HEMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hemv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ HEMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::hemv(main_queue, upper_lower, n, alpha,
-                                                             A.data(), lda, x.data(), incx, beta,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::hemv(main_queue, upper_lower, n, alpha,
-                                                          A.data(), lda, x.data(), incx, beta,
-                                                          y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hemv,
-                                        upper_lower, n, alpha, A.data(), lda, x.data(), incx, beta,
-                                        y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hemv, upper_lower,
-                                        n, alpha, A.data(), lda, x.data(), incx, beta, y.data(),
-                                        incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HEMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class HemvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(HemvUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1,
-                                                42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1,
-                                                42));
-}
-TEST_P(HemvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1,
-                                                 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1,
-                                                 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(HemvUsmTestSuite, HemvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/her.cpp b/tests/unit_tests/blas/level2/her.cpp
deleted file mode 100644
index 46ae9a879..000000000
--- a/tests/unit_tests/blas/level2/her.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n,
-         fp_scalar alpha, int incx, int lda) {
-    // Prepare data.
-    vector<fp> x, A_ref, A;
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-    A_ref = A;
-
-    // Call Reference HER.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    ::her(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ HER.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HER:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::her(main_queue, upper_lower, n, alpha, x_buffer,
-                                                     incx, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::her(main_queue, upper_lower, n, alpha, x_buffer, incx,
-                                                  A_buffer, lda);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::her,
-                                        upper_lower, n, alpha, x_buffer, incx, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::her, upper_lower,
-                                        n, alpha, x_buffer, incx, A_buffer, lda);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HER:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class HerTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(HerTests, ComplexSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, 1, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, 1, 42)));
-}
-TEST_P(HerTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, 1, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, 1, 42)));
-}
-
-INSTANTIATE_TEST_SUITE_P(HerTestSuite, HerTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/her2.cpp b/tests/unit_tests/blas/level2/her2.cpp
deleted file mode 100644
index e98c5cc8b..000000000
--- a/tests/unit_tests/blas/level2/her2.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy, int lda) {
-    // Prepare data.
-    vector<fp> x, y, A_ref, A;
-
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-    A_ref = A;
-
-    // Call Reference HER2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::her2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ HER2.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HER2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::her2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                      incx, y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::her2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                   incx, y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::her2,
-                                        upper_lower, n, alpha, x_buffer, incx, y_buffer, incy,
-                                        A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::her2, upper_lower,
-                                        n, alpha, x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HER2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class Her2Tests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(Her2Tests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-TEST_P(Her2Tests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(Her2TestSuite, Her2Tests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/her2_usm.cpp b/tests/unit_tests/blas/level2/her2_usm.cpp
deleted file mode 100644
index c732331ee..000000000
--- a/tests/unit_tests/blas/level2/her2_usm.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HER2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto A_ref = A;
-
-    // Call Reference HER2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::her2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ HER2.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::her2(main_queue, upper_lower, n, alpha,
-                                                             x.data(), incx, y.data(), incy,
-                                                             A.data(), lda, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::her2(main_queue, upper_lower, n, alpha,
-                                                          x.data(), incx, y.data(), incy, A.data(),
-                                                          lda, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::her2,
-                                        upper_lower, n, alpha, x.data(), incx, y.data(), incy,
-                                        A.data(), lda, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::her2, upper_lower,
-                                        n, alpha, x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HER2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class Her2UsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(Her2UsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-TEST_P(Her2UsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(Her2UsmTestSuite, Her2UsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/her_usm.cpp b/tests/unit_tests/blas/level2/her_usm.cpp
deleted file mode 100644
index 9e1f5099e..000000000
--- a/tests/unit_tests/blas/level2/her_usm.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n,
-         fp_scalar alpha, int incx, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HER:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto A_ref = A;
-
-    // Call Reference HER.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    ::her(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ HER.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::her(
-                    main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::her(
-                    main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::her,
-                                        upper_lower, n, alpha, x.data(), incx, A.data(), lda,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::her, upper_lower,
-                                        n, alpha, x.data(), incx, A.data(), lda, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HER:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class HerUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(HerUsmTests, ComplexSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, 1, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, 1, 42)));
-}
-TEST_P(HerUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, 2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, -2, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, 1, 42)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, 1, 42)));
-}
-
-INSTANTIATE_TEST_SUITE_P(HerUsmTestSuite, HerUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpmv.cpp b/tests/unit_tests/blas/level2/hpmv.cpp
deleted file mode 100644
index 69e6ea9b2..000000000
--- a/tests/unit_tests/blas/level2/hpmv.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y, y_ref, A;
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    // Call Reference HPMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hpmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta,
-           (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ HPMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HPMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::hpmv(main_queue, upper_lower, n, alpha, A_buffer,
-                                                      x_buffer, incx, beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::hpmv(main_queue, upper_lower, n, alpha, A_buffer,
-                                                   x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hpmv,
-                                        upper_lower, n, alpha, A_buffer, x_buffer, incx, beta,
-                                        y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hpmv, upper_lower,
-                                        n, alpha, A_buffer, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HPMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class HpmvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(HpmvTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-TEST_P(HpmvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, -2,
-                                                 -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, -2,
-                                                 -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(HpmvTestSuite, HpmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpmv_usm.cpp b/tests/unit_tests/blas/level2/hpmv_usm.cpp
deleted file mode 100644
index 743194b18..000000000
--- a/tests/unit_tests/blas/level2/hpmv_usm.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HPMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    auto y_ref = y;
-
-    // Call Reference HPMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hpmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta,
-           (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ HPMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::hpmv(main_queue, upper_lower, n, alpha,
-                                                             A.data(), x.data(), incx, beta,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::hpmv(main_queue, upper_lower, n, alpha,
-                                                          A.data(), x.data(), incx, beta, y.data(),
-                                                          incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hpmv,
-                                        upper_lower, n, alpha, A.data(), x.data(), incx, beta,
-                                        y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hpmv, upper_lower,
-                                        n, alpha, A.data(), x.data(), incx, beta, y.data(), incy,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HPMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class HpmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(HpmvUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-TEST_P(HpmvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, -2,
-                                                 -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, -2,
-                                                 -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(HpmvUsmTestSuite, HpmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpr.cpp b/tests/unit_tests/blas/level2/hpr.cpp
deleted file mode 100644
index b2e5548bd..000000000
--- a/tests/unit_tests/blas/level2/hpr.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n,
-         fp_scalar alpha, int incx) {
-    // Prepare data.
-    vector<fp> x, A_ref, A;
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-    A_ref = A;
-
-    // Call Reference HPR.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    ::hpr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data());
-
-    // Call DPC++ HPR.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HPR:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::hpr(main_queue, upper_lower, n, alpha, x_buffer,
-                                                     incx, A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::hpr(main_queue, upper_lower, n, alpha, x_buffer, incx,
-                                                  A_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hpr,
-                                        upper_lower, n, alpha, x_buffer, incx, A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hpr, upper_lower,
-                                        n, alpha, x_buffer, incx, A_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HPR:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class HprTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(HprTests, ComplexSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 1)));
-}
-
-TEST_P(HprTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 1)));
-}
-
-INSTANTIATE_TEST_SUITE_P(HprTestSuite, HprTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpr2.cpp b/tests/unit_tests/blas/level2/hpr2.cpp
deleted file mode 100644
index e2b19e2fd..000000000
--- a/tests/unit_tests/blas/level2/hpr2.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y, A_ref, A;
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-    A_ref = A;
-
-    // Call Reference HPR2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hpr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data());
-
-    // Call DPC++ HPR2.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HPR2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::hpr2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                      incx, y_buffer, incy, A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::hpr2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                   incx, y_buffer, incy, A_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hpr2,
-                                        upper_lower, n, alpha, x_buffer, incx, y_buffer, incy,
-                                        A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hpr2, upper_lower,
-                                        n, alpha, x_buffer, incx, y_buffer, incy, A_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HPR2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class Hpr2Tests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(Hpr2Tests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-TEST_P(Hpr2Tests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(Hpr2TestSuite, Hpr2Tests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpr2_usm.cpp b/tests/unit_tests/blas/level2/hpr2_usm.cpp
deleted file mode 100644
index 6dc60dbf6..000000000
--- a/tests/unit_tests/blas/level2/hpr2_usm.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HPR2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    auto A_ref = A;
-
-    // Call Reference HPR2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hpr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data());
-
-    // Call DPC++ HPR2.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::hpr2(main_queue, upper_lower, n, alpha,
-                                                             x.data(), incx, y.data(), incy,
-                                                             A.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::hpr2(main_queue, upper_lower, n, alpha,
-                                                          x.data(), incx, y.data(), incy, A.data(),
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hpr2,
-                                        upper_lower, n, alpha, x.data(), incx, y.data(), incy,
-                                        A.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hpr2, upper_lower,
-                                        n, alpha, x.data(), incx, y.data(), incy, A.data(),
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HPR2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class Hpr2UsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(Hpr2UsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-TEST_P(Hpr2UsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(Hpr2UsmTestSuite, Hpr2UsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/hpr_usm.cpp b/tests/unit_tests/blas/level2/hpr_usm.cpp
deleted file mode 100644
index b90b0ee63..000000000
--- a/tests/unit_tests/blas/level2/hpr_usm.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n,
-         fp_scalar alpha, int incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during HPR:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    auto A_ref = A;
-
-    // Call Reference HPR.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    ::hpr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data());
-
-    // Call DPC++ HPR.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::hpr(main_queue, upper_lower, n, alpha,
-                                                            x.data(), incx, A.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::hpr(main_queue, upper_lower, n, alpha,
-                                                         x.data(), incx, A.data(), dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hpr,
-                                        upper_lower, n, alpha, x.data(), incx, A.data(),
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hpr, upper_lower,
-                                        n, alpha, x.data(), incx, A.data(), dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during HPR:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class HprUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(HprUsmTests, ComplexSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::lower, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<float>, float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                          oneapi::mkl::uplo::upper, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 1)));
-}
-
-TEST_P(HprUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::lower, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP(
-        (test<std::complex<double>, double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                            oneapi::mkl::uplo::upper, 30, alpha, -2)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower, 30, alpha, 1)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper, 30, alpha, 1)));
-}
-
-INSTANTIATE_TEST_SUITE_P(HprUsmTestSuite, HprUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/sbmv.cpp b/tests/unit_tests/blas/level2/sbmv.cpp
deleted file mode 100644
index c0347dfda..000000000
--- a/tests/unit_tests/blas/level2/sbmv.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k,
-         fp alpha, fp beta, int incx, int incy, int lda) {
-    // Prepare data.
-    vector<fp> x, y, y_ref, A;
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    // Call Reference SBMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::sbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SBMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::sbmv(main_queue, upper_lower, n, k, alpha,
-                                                      A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                                      incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::sbmv(main_queue, upper_lower, n, k, alpha, A_buffer,
-                                                   lda, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::sbmv,
-                                        upper_lower, n, k, alpha, A_buffer, lda, x_buffer, incx,
-                                        beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::sbmv, upper_lower,
-                                        n, k, alpha, A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                        incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class SbmvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(SbmvTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
-}
-TEST_P(SbmvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(SbmvTestSuite, SbmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/sbmv_usm.cpp b/tests/unit_tests/blas/level2/sbmv_usm.cpp
deleted file mode 100644
index 4fb7d46ad..000000000
--- a/tests/unit_tests/blas/level2/sbmv_usm.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k,
-         fp alpha, fp beta, int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto y_ref = y;
-
-    // Call Reference SBMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::sbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SBMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::sbmv(main_queue, upper_lower, n, k, alpha,
-                                                             A.data(), lda, x.data(), incx, beta,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::sbmv(main_queue, upper_lower, n, k, alpha,
-                                                          A.data(), lda, x.data(), incx, beta,
-                                                          y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::sbmv,
-                                        upper_lower, n, k, alpha, A.data(), lda, x.data(), incx,
-                                        beta, y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::sbmv, upper_lower,
-                                        n, k, alpha, A.data(), lda, x.data(), incx, beta, y.data(),
-                                        incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class SbmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SbmvUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
-}
-TEST_P(SbmvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, 5, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, 5, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, 5, alpha, beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(SbmvUsmTestSuite, SbmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spmv.cpp b/tests/unit_tests/blas/level2/spmv.cpp
deleted file mode 100644
index 799e7d775..000000000
--- a/tests/unit_tests/blas/level2/spmv.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y, y_ref, A;
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    // Call Reference SPMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::spmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta,
-           (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SPMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SPMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::spmv(main_queue, upper_lower, n, alpha, A_buffer,
-                                                      x_buffer, incx, beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::spmv(main_queue, upper_lower, n, alpha, A_buffer,
-                                                   x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::spmv,
-                                        upper_lower, n, alpha, A_buffer, x_buffer, incx, beta,
-                                        y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::spmv, upper_lower,
-                                        n, alpha, A_buffer, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SPMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class SpmvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(SpmvTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-TEST_P(SpmvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(SpmvTestSuite, SpmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spmv_usm.cpp b/tests/unit_tests/blas/level2/spmv_usm.cpp
deleted file mode 100644
index ae38ada4a..000000000
--- a/tests/unit_tests/blas/level2/spmv_usm.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SPMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    auto y_ref = y;
-
-    // Call Reference SPMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::spmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta,
-           (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SPMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::spmv(main_queue, upper_lower, n, alpha,
-                                                             A.data(), x.data(), incx, beta,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::spmv(main_queue, upper_lower, n, alpha,
-                                                          A.data(), x.data(), incx, beta, y.data(),
-                                                          incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::spmv,
-                                        upper_lower, n, alpha, A.data(), x.data(), incx, beta,
-                                        y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::spmv, upper_lower,
-                                        n, alpha, A.data(), x.data(), incx, beta, y.data(), incy,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SPMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class SpmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SpmvUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-TEST_P(SpmvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(SpmvUsmTestSuite, SpmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spr.cpp b/tests/unit_tests/blas/level2/spr.cpp
deleted file mode 100644
index 4e4b5d8a9..000000000
--- a/tests/unit_tests/blas/level2/spr.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx) {
-    // Prepare data.
-    vector<fp> x, A_ref, A;
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-    A_ref = A;
-
-    // Call Reference SPR.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::spr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data());
-
-    // Call DPC++ SPR.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SPR:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::spr(main_queue, upper_lower, n, alpha, x_buffer,
-                                                     incx, A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::spr(main_queue, upper_lower, n, alpha, x_buffer, incx,
-                                                  A_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::spr,
-                                        upper_lower, n, alpha, x_buffer, incx, A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::spr, upper_lower,
-                                        n, alpha, x_buffer, incx, A_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SPR:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class SprTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(SprTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1));
-}
-TEST_P(SprTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(SprTestSuite, SprTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spr2.cpp b/tests/unit_tests/blas/level2/spr2.cpp
deleted file mode 100644
index d9d00a4e8..000000000
--- a/tests/unit_tests/blas/level2/spr2.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy) {
-    // Prepare data.
-    vector<fp> x, y, A_ref, A;
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-    A_ref = A;
-
-    // Call Reference SPR2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::spr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data());
-
-    // Call DPC++ SPR2.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SPR2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::spr2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                      incx, y_buffer, incy, A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::spr2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                   incx, y_buffer, incy, A_buffer);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::spr2,
-                                        upper_lower, n, alpha, x_buffer, incx, y_buffer, incy,
-                                        A_buffer);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::spr2, upper_lower,
-                                        n, alpha, x_buffer, incx, y_buffer, incy, A_buffer);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SPR2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class Spr2Tests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(Spr2Tests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-TEST_P(Spr2Tests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(Spr2TestSuite, Spr2Tests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spr2_usm.cpp b/tests/unit_tests/blas/level2/spr2_usm.cpp
deleted file mode 100644
index 683288775..000000000
--- a/tests/unit_tests/blas/level2/spr2_usm.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SPR2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    auto A_ref = A;
-
-    // Call Reference SPR2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::spr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data());
-
-    // Call DPC++ SPR2.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::spr2(main_queue, upper_lower, n, alpha,
-                                                             x.data(), incx, y.data(), incy,
-                                                             A.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::spr2(main_queue, upper_lower, n, alpha,
-                                                          x.data(), incx, y.data(), incy, A.data(),
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::spr2,
-                                        upper_lower, n, alpha, x.data(), incx, y.data(), incy,
-                                        A.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::spr2, upper_lower,
-                                        n, alpha, x.data(), incx, y.data(), incy, A.data(),
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SPR2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class Spr2UsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(Spr2UsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-TEST_P(Spr2UsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2, 3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2, -3));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(Spr2UsmTestSuite, Spr2UsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/spr_usm.cpp b/tests/unit_tests/blas/level2/spr_usm.cpp
deleted file mode 100644
index 3a23a33b4..000000000
--- a/tests/unit_tests/blas/level2/spr_usm.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SPR:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, n);
-
-    auto A_ref = A;
-
-    // Call Reference SPR.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::spr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data());
-
-    // Call DPC++ SPR.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::spr(main_queue, upper_lower, n, alpha,
-                                                            x.data(), incx, A.data(), dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::spr(main_queue, upper_lower, n, alpha,
-                                                         x.data(), incx, A.data(), dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::spr,
-                                        upper_lower, n, alpha, x.data(), incx, A.data(),
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::spr, upper_lower,
-                                        n, alpha, x.data(), incx, A.data(), dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SPR:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, n, n, std::cout);
-
-    return (int)good;
-}
-
-class SprUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SprUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1));
-}
-TEST_P(SprUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1));
-}
-
-INSTANTIATE_TEST_SUITE_P(SprUsmTestSuite, SprUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/symv.cpp b/tests/unit_tests/blas/level2/symv.cpp
deleted file mode 100644
index a22e48ff7..000000000
--- a/tests/unit_tests/blas/level2/symv.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy, int lda) {
-    // Prepare data.
-    vector<fp> x, y, y_ref, A;
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    y_ref = y;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    // Call Reference SYMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::symv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SYMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::symv(main_queue, upper_lower, n, alpha, A_buffer,
-                                                      lda, x_buffer, incx, beta, y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::symv(main_queue, upper_lower, n, alpha, A_buffer, lda,
-                                                   x_buffer, incx, beta, y_buffer, incy);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::symv,
-                                        upper_lower, n, alpha, A_buffer, lda, x_buffer, incx, beta,
-                                        y_buffer, incy);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::symv, upper_lower,
-                                        n, alpha, A_buffer, lda, x_buffer, incx, beta, y_buffer,
-                                        incy);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_accessor = y_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(y_accessor, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class SymvTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(SymvTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
-}
-TEST_P(SymvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(SymvTestSuite, SymvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/symv_usm.cpp b/tests/unit_tests/blas/level2/symv_usm.cpp
deleted file mode 100644
index f33c0d25f..000000000
--- a/tests/unit_tests/blas/level2/symv_usm.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         fp beta, int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto y_ref = y;
-
-    // Call Reference SYMV.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::symv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref,
-           (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref);
-
-    // Call DPC++ SYMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::symv(main_queue, upper_lower, n, alpha,
-                                                             A.data(), lda, x.data(), incx, beta,
-                                                             y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::symv(main_queue, upper_lower, n, alpha,
-                                                          A.data(), lda, x.data(), incx, beta,
-                                                          y.data(), incy, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::symv,
-                                        upper_lower, n, alpha, A.data(), lda, x.data(), incx, beta,
-                                        y.data(), incy, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::symv, upper_lower,
-                                        n, alpha, A.data(), lda, x.data(), incx, beta, y.data(),
-                                        incy, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(y, y_ref, n, incy, n, std::cout);
-
-    return (int)good;
-}
-
-class SymvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SymvUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
-}
-TEST_P(SymvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, beta, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, beta, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(SymvUsmTestSuite, SymvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/syr.cpp b/tests/unit_tests/blas/level2/syr.cpp
deleted file mode 100644
index 6b305582b..000000000
--- a/tests/unit_tests/blas/level2/syr.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int lda) {
-    // Prepare data.
-    vector<fp> x, A_ref, A;
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-    A_ref = A;
-
-    // Call Reference SYR.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::syr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ SYR.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYR:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::syr(main_queue, upper_lower, n, alpha, x_buffer,
-                                                     incx, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::syr(main_queue, upper_lower, n, alpha, x_buffer, incx,
-                                                  A_buffer, lda);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syr,
-                                        upper_lower, n, alpha, x_buffer, incx, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syr, upper_lower,
-                                        n, alpha, x_buffer, incx, A_buffer, lda);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYR:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class SyrTests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(SyrTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1, 42));
-}
-TEST_P(SyrTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(SyrTestSuite, SyrTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/syr2.cpp b/tests/unit_tests/blas/level2/syr2.cpp
deleted file mode 100644
index 5da1e0106..000000000
--- a/tests/unit_tests/blas/level2/syr2.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy, int lda) {
-    // Prepare data.
-    vector<fp> x, y, A_ref, A;
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-    A_ref = A;
-
-    // Call Reference SYR2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::syr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ SYR2.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYR2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> y_buffer = make_buffer(y);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::syr2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                      incx, y_buffer, incy, A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::syr2(main_queue, upper_lower, n, alpha, x_buffer,
-                                                   incx, y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syr2,
-                                        upper_lower, n, alpha, x_buffer, incx, y_buffer, incy,
-                                        A_buffer, lda);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syr2, upper_lower,
-                                        n, alpha, x_buffer, incx, y_buffer, incy, A_buffer, lda);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYR2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto A_accessor = A_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(A_accessor, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class Syr2Tests : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {
-};
-
-TEST_P(Syr2Tests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-TEST_P(Syr2Tests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(Syr2TestSuite, Syr2Tests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/syr2_usm.cpp b/tests/unit_tests/blas/level2/syr2_usm.cpp
deleted file mode 100644
index a1e2cba7d..000000000
--- a/tests/unit_tests/blas/level2/syr2_usm.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int incy, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYR2:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), y(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_vector(y, n, incy);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto A_ref = A;
-
-    // Call Reference SYR2.
-    const int n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::syr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-           (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref,
-           (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ SYR2.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::syr2(main_queue, upper_lower, n, alpha,
-                                                             x.data(), incx, y.data(), incy,
-                                                             A.data(), lda, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::syr2(main_queue, upper_lower, n, alpha,
-                                                          x.data(), incx, y.data(), incy, A.data(),
-                                                          lda, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syr2,
-                                        upper_lower, n, alpha, x.data(), incx, y.data(), incy,
-                                        A.data(), lda, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syr2, upper_lower,
-                                        n, alpha, x.data(), incx, y.data(), incy, A.data(), lda,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYR2:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class Syr2UsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(Syr2UsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-TEST_P(Syr2UsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2, 3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2, -3, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(Syr2UsmTestSuite, Syr2UsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/syr_usm.cpp b/tests/unit_tests/blas/level2/syr_usm.cpp
deleted file mode 100644
index 5a9f5034d..000000000
--- a/tests/unit_tests/blas/level2/syr_usm.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fp>
-int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha,
-         int incx, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception during SYR:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-
-    auto A_ref = A;
-
-    // Call Reference SYR.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::syr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref,
-          (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref);
-
-    // Call DPC++ SYR.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::syr(
-                    main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::syr(
-                    main_queue, upper_lower, n, alpha, x.data(), incx, A.data(), lda, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syr,
-                                        upper_lower, n, alpha, x.data(), incx, A.data(), lda,
-                                        dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syr, upper_lower,
-                                        n, alpha, x.data(), incx, A.data(), lda, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const &e) {
-        std::cout << "Caught synchronous SYCL exception during SYR:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented &e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(A, A_ref, layout, n, n, lda, n, std::cout);
-
-    return (int)good;
-}
-
-class SyrUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, oneapi::mkl::layout>> {};
-
-TEST_P(SyrUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, 30, alpha, 1, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, 30, alpha, 1, 42));
-}
-TEST_P(SyrUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, -2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, 30, alpha, 1, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, 30, alpha, 1, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(SyrUsmTestSuite, SyrUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tbmv.cpp b/tests/unit_tests/blas/level2/tbmv.cpp
deleted file mode 100644
index 554082a01..000000000
--- a/tests/unit_tests/blas/level2/tbmv.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int k, int incx,
-         int lda) {
-    // Prepare data.
-    vector<fp> x, x_ref, A;
-    rand_vector(x, n, incx);
-    x_ref = x;
-    rand_matrix(A, layout, transa, n, n, lda);
-
-    // Call Reference TBMV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TBMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::tbmv(main_queue, upper_lower, transa, unit_nonunit,
-                                                      n, k, A_buffer, lda, x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::tbmv(main_queue, upper_lower, transa, unit_nonunit, n,
-                                                   k, A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tbmv,
-                                        upper_lower, transa, unit_nonunit, n, k, A_buffer, lda,
-                                        x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tbmv, upper_lower,
-                                        transa, unit_nonunit, n, k, A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TbmvTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TbmvTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbmvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbmvTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbmvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TbmvTestSuite, TbmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tbmv_usm.cpp b/tests/unit_tests/blas/level2/tbmv_usm.cpp
deleted file mode 100644
index 808c5d1c3..000000000
--- a/tests/unit_tests/blas/level2/tbmv_usm.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int k, int incx,
-         int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, transa, n, n, lda);
-
-    auto x_ref = x;
-
-    // Call Reference TBMV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TBMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::tbmv(main_queue, upper_lower, transa,
-                                                             unit_nonunit, n, k, A.data(), lda,
-                                                             x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::tbmv(main_queue, upper_lower, transa,
-                                                          unit_nonunit, n, k, A.data(), lda,
-                                                          x.data(), incx, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tbmv,
-                                        upper_lower, transa, unit_nonunit, n, k, A.data(), lda,
-                                        x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tbmv, upper_lower,
-                                        transa, unit_nonunit, n, k, A.data(), lda, x.data(), incx,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TBMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TbmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TbmvUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbmvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbmvUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbmvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TbmvUsmTestSuite, TbmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tbsv.cpp b/tests/unit_tests/blas/level2/tbsv.cpp
deleted file mode 100644
index e653105e8..000000000
--- a/tests/unit_tests/blas/level2/tbsv.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int k, int incx,
-         int lda) {
-    // Prepare data.
-    vector<fp> x, x_ref, A;
-    rand_vector(x, n, incx);
-    x_ref = x;
-    rand_tbsv_matrix(A, layout, upper_lower, transa, n, k, lda);
-
-    // Call Reference TBSV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tbsv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TBSV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TBSV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::tbsv(main_queue, upper_lower, transa, unit_nonunit,
-                                                      n, k, A_buffer, lda, x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::tbsv(main_queue, upper_lower, transa, unit_nonunit, n,
-                                                   k, A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tbsv,
-                                        upper_lower, transa, unit_nonunit, n, k, A_buffer, lda,
-                                        x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tbsv, upper_lower,
-                                        transa, unit_nonunit, n, k, A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TBSV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TBSV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TbsvTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TbsvTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbsvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbsvTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbsvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TbsvTestSuite, TbsvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tbsv_usm.cpp b/tests/unit_tests/blas/level2/tbsv_usm.cpp
deleted file mode 100644
index 1b77997eb..000000000
--- a/tests/unit_tests/blas/level2/tbsv_usm.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int k, int incx,
-         int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TBSV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_tbsv_matrix(A, layout, upper_lower, transa, n, k, lda);
-
-    auto x_ref = x;
-
-    // Call Reference TBSV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    const int k_ref = k;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tbsv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref, &k_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TBSV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::tbsv(main_queue, upper_lower, transa,
-                                                             unit_nonunit, n, k, A.data(), lda,
-                                                             x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::tbsv(main_queue, upper_lower, transa,
-                                                          unit_nonunit, n, k, A.data(), lda,
-                                                          x.data(), incx, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tbsv,
-                                        upper_lower, transa, unit_nonunit, n, k, A.data(), lda,
-                                        x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tbsv, upper_lower,
-                                        transa, unit_nonunit, n, k, A.data(), lda, x.data(), incx,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TBSV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TBSV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TbsvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TbsvUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbsvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbsvUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-TEST_P(TbsvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 5, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TbsvUsmTestSuite, TbsvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tpmv.cpp b/tests/unit_tests/blas/level2/tpmv.cpp
deleted file mode 100644
index ce45279bb..000000000
--- a/tests/unit_tests/blas/level2/tpmv.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx) {
-    // Prepare data.
-    vector<fp> x, x_ref, A;
-    rand_vector(x, n, incx);
-    x_ref = x;
-    rand_matrix(A, layout, transa, n, n, n);
-
-    // Call Reference TPMV.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tpmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TPMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TBMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::tpmv(main_queue, upper_lower, transa, unit_nonunit,
-                                                      n, A_buffer, x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::tpmv(main_queue, upper_lower, transa, unit_nonunit, n,
-                                                   A_buffer, x_buffer, incx);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tpmv,
-                                        upper_lower, transa, unit_nonunit, n, A_buffer, x_buffer,
-                                        incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tpmv, upper_lower,
-                                        transa, unit_nonunit, n, A_buffer, x_buffer, incx);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TPMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TpmvTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TpmvTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpmvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpmvTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpmvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-
-INSTANTIATE_TEST_SUITE_P(TpmvTestSuite, TpmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tpmv_usm.cpp b/tests/unit_tests/blas/level2/tpmv_usm.cpp
deleted file mode 100644
index 74ebc2502..000000000
--- a/tests/unit_tests/blas/level2/tpmv_usm.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TPMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, transa, n, n, n);
-
-    auto x_ref = x;
-
-    // Call Reference TPMV.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tpmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TPMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::tpmv(main_queue, upper_lower, transa,
-                                                             unit_nonunit, n, A.data(), x.data(),
-                                                             incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::tpmv(main_queue, upper_lower, transa,
-                                                          unit_nonunit, n, A.data(), x.data(), incx,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tpmv,
-                                        upper_lower, transa, unit_nonunit, n, A.data(), x.data(),
-                                        incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tpmv, upper_lower,
-                                        transa, unit_nonunit, n, A.data(), x.data(), incx,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TBMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TPMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TpmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TpmvUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpmvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpmvUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpmvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-
-INSTANTIATE_TEST_SUITE_P(TpmvUsmTestSuite, TpmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tpsv.cpp b/tests/unit_tests/blas/level2/tpsv.cpp
deleted file mode 100644
index 2a12ab1da..000000000
--- a/tests/unit_tests/blas/level2/tpsv.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx) {
-    // Prepare data.
-    vector<fp> x, x_ref, A;
-    rand_vector(x, n, incx);
-    x_ref = x;
-    rand_tpsv_matrix(A, layout, upper_lower, transa, n);
-
-    // Call Reference TPSV.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tpsv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TPSV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TPSV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::tpsv(main_queue, upper_lower, transa, unit_nonunit,
-                                                      n, A_buffer, x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::tpsv(main_queue, upper_lower, transa, unit_nonunit, n,
-                                                   A_buffer, x_buffer, incx);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tpsv,
-                                        upper_lower, transa, unit_nonunit, n, A_buffer, x_buffer,
-                                        incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tpsv, upper_lower,
-                                        transa, unit_nonunit, n, A_buffer, x_buffer, incx);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TPSV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TPSV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TpsvTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TpsvTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpsvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpsvTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpsvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-
-INSTANTIATE_TEST_SUITE_P(TpsvTestSuite, TpsvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/tpsv_usm.cpp b/tests/unit_tests/blas/level2/tpsv_usm.cpp
deleted file mode 100644
index bcb676843..000000000
--- a/tests/unit_tests/blas/level2/tpsv_usm.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TPSV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_tpsv_matrix(A, layout, upper_lower, transa, n);
-
-    auto x_ref = x;
-
-    // Call Reference TPSV.
-    const int n_ref = n, incx_ref = incx;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::tpsv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TPSV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::tpsv(main_queue, upper_lower, transa,
-                                                             unit_nonunit, n, A.data(), x.data(),
-                                                             incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::tpsv(main_queue, upper_lower, transa,
-                                                          unit_nonunit, n, A.data(), x.data(), incx,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::tpsv,
-                                        upper_lower, transa, unit_nonunit, n, A.data(), x.data(),
-                                        incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::tpsv, upper_lower,
-                                        transa, unit_nonunit, n, A.data(), x.data(), incx,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TPSV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TPSV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TpsvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TpsvUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpsvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpsvUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-TEST_P(TpsvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2));
-}
-
-INSTANTIATE_TEST_SUITE_P(TpsvUsmTestSuite, TpsvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/trmv.cpp b/tests/unit_tests/blas/level2/trmv.cpp
deleted file mode 100644
index 8dfc517eb..000000000
--- a/tests/unit_tests/blas/level2/trmv.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx, int lda) {
-    // Prepare data.
-    vector<fp> x, x_ref, A;
-    rand_vector(x, n, incx);
-    x_ref = x;
-    rand_matrix(A, layout, transa, n, n, lda);
-
-    // Call Reference TRMV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TRMV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::trmv(main_queue, upper_lower, transa, unit_nonunit,
-                                                      n, A_buffer, lda, x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::trmv(main_queue, upper_lower, transa, unit_nonunit, n,
-                                                   A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trmv,
-                                        upper_lower, transa, unit_nonunit, n, A_buffer, lda,
-                                        x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trmv, upper_lower,
-                                        transa, unit_nonunit, n, A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good = check_equal_vector(x_accessor, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TrmvTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TrmvTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrmvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrmvTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrmvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrmvTestSuite, TrmvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/trmv_usm.cpp b/tests/unit_tests/blas/level2/trmv_usm.cpp
deleted file mode 100644
index af3e4b898..000000000
--- a/tests/unit_tests/blas/level2/trmv_usm.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRMV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_matrix(A, layout, transa, n, n, lda);
-
-    auto x_ref = x;
-
-    // Call Reference TRMV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TRMV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::trmv(main_queue, upper_lower, transa,
-                                                             unit_nonunit, n, A.data(), lda,
-                                                             x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::trmv(main_queue, upper_lower, transa,
-                                                          unit_nonunit, n, A.data(), lda, x.data(),
-                                                          incx, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trmv,
-                                        upper_lower, transa, unit_nonunit, n, A.data(), lda,
-                                        x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trmv, upper_lower,
-                                        transa, unit_nonunit, n, A.data(), lda, x.data(), incx,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRMV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRMV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_vector(x, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TrmvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TrmvUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrmvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrmvUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrmvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrmvUsmTestSuite, TrmvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/trsv.cpp b/tests/unit_tests/blas/level2/trsv.cpp
deleted file mode 100644
index fb1e39e06..000000000
--- a/tests/unit_tests/blas/level2/trsv.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx, int lda) {
-    // Prepare data.
-    vector<fp> x, x_ref, A;
-    rand_vector(x, n, incx);
-    x_ref = x;
-    rand_trsm_matrix(A, layout, transa, n, n, lda);
-
-    // Call Reference TRSV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trsv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TRSV.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> x_buffer = make_buffer(x);
-    buffer<fp, 1> A_buffer = make_buffer(A);
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::trsv(main_queue, upper_lower, transa, unit_nonunit,
-                                                      n, A_buffer, lda, x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::trsv(main_queue, upper_lower, transa, unit_nonunit, n,
-                                                   A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsv,
-                                        upper_lower, transa, unit_nonunit, n, A_buffer, lda,
-                                        x_buffer, incx);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsv, upper_lower,
-                                        transa, unit_nonunit, n, A_buffer, lda, x_buffer, incx);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRSV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRSV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto x_accessor = x_buffer.get_host_access(read_only);
-    bool good = check_equal_trsv_vector(x_accessor, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TrsvTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TrsvTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrsvTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrsvTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrsvTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsvTestSuite, TrsvTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level2/trsv_usm.cpp b/tests/unit_tests/blas/level2/trsv_usm.cpp
deleted file mode 100644
index 2e6242d58..000000000
--- a/tests/unit_tests/blas/level2/trsv_usm.cpp
+++ /dev/null
@@ -1,284 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <complex>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose transa, oneapi::mkl::diag unit_nonunit, int n, int incx, int lda) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSV:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> x(ua), A(ua);
-    rand_vector(x, n, incx);
-    rand_trsm_matrix(A, layout, transa, n, n, lda);
-
-    auto x_ref = x;
-
-    // Call Reference TRSV.
-    const int n_ref = n, incx_ref = incx, lda_ref = lda;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trsv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_nonunit), &n_ref,
-           (fp_ref*)A.data(), &lda_ref, (fp_ref*)x_ref.data(), &incx_ref);
-
-    // Call DPC++ TRSV.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::trsv(main_queue, upper_lower, transa,
-                                                             unit_nonunit, n, A.data(), lda,
-                                                             x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::trsv(main_queue, upper_lower, transa,
-                                                          unit_nonunit, n, A.data(), lda, x.data(),
-                                                          incx, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsv,
-                                        upper_lower, transa, unit_nonunit, n, A.data(), lda,
-                                        x.data(), incx, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsv, upper_lower,
-                                        transa, unit_nonunit, n, A.data(), lda, x.data(), incx,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRSV:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRSV:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_trsv_vector(x, x_ref, n, incx, n, std::cout);
-
-    return (int)good;
-}
-
-class TrsvUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TrsvUsmTests, RealSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                  oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrsvUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans,
-                                   oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrsvUsmTests, ComplexSinglePrecision) {
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-TEST_P(TrsvUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::unit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 30, 2, 42));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsvUsmTestSuite, TrsvUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/CMakeLists.txt b/tests/unit_tests/blas/level3/CMakeLists.txt
deleted file mode 100644
index 53cba3f4c..000000000
--- a/tests/unit_tests/blas/level3/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(L3_SOURCES "gemm.cpp" "symm.cpp" "syrk.cpp" "hemm.cpp" "herk.cpp" "syr2k.cpp" "her2k.cpp" "trmm.cpp" "trsm.cpp" "gemm_usm.cpp" "symm_usm.cpp" "syrk_usm.cpp" "hemm_usm.cpp" "herk_usm.cpp" "syr2k_usm.cpp" "her2k_usm.cpp" "trmm_usm.cpp" "trsm_usm.cpp")
-
-if(BUILD_SHARED_LIBS)
-  add_library(blas_level3_rt OBJECT ${L3_SOURCES})
-  target_compile_options(blas_level3_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(blas_level3_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-      PUBLIC ${CBLAS_INCLUDE}
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET blas_level3_rt SOURCES ${L3_SOURCES})
-  else()
-    target_link_libraries(blas_level3_rt PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-endif()
-
-add_library(blas_level3_ct OBJECT ${L3_SOURCES})
-target_compile_options(blas_level3_ct PRIVATE  -DNOMINMAX)
-target_include_directories(blas_level3_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-    PUBLIC ${CBLAS_INCLUDE}
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET blas_level3_ct SOURCES ${L3_SOURCES})
-else()
-  target_link_libraries(blas_level3_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
-
diff --git a/tests/unit_tests/blas/level3/gemm.cpp b/tests/unit_tests/blas/level3/gemm.cpp
deleted file mode 100644
index 564700b16..000000000
--- a/tests/unit_tests/blas/level3/gemm.cpp
+++ /dev/null
@@ -1,313 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename Ta, typename Tc>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa,
-         oneapi::mkl::transpose transb, int m, int n, int k, int lda, int ldb, int ldc, Tc alpha,
-         Tc beta) {
-    // Prepare data.
-    vector<Ta, allocator_helper<Ta, 64>> A, B;
-    vector<Tc, allocator_helper<Tc, 64>> C, C_ref;
-
-    rand_matrix(A, layout, transa, m, k, lda);
-    rand_matrix(B, layout, transb, k, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    C_ref = C;
-
-    // Call Reference GEMM.
-    const int m_ref = m, n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using Ta_ref = typename ref_type_info<Ta>::type;
-    using Tc_ref = typename ref_type_info<Tc>::type;
-
-    ::gemm(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa),
-           convert_to_cblas_trans(transb), &m_ref, &n_ref, &k_ref, (Tc_ref*)&alpha,
-           (Ta_ref*)A.data(), &lda_ref, (Ta_ref*)B.data(), &ldb_ref, (Tc_ref*)&beta,
-           (Tc_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ GEMM.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<Ta, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<Ta, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<Tc, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::gemm(main_queue, transa, transb, m, n, k, alpha,
-                                                      A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                      ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::gemm(main_queue, transa, transb, m, n, k, alpha,
-                                                   A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                   ldc);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm, transa,
-                                        transb, m, n, k, alpha, A_buffer, lda, B_buffer, ldb, beta,
-                                        C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm, transa,
-                                        transb, m, n, k, alpha, A_buffer, lda, B_buffer, ldb, beta,
-                                        C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of GEMM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good = check_equal_matrix(C_accessor, C_ref, layout, m, n, ldc, 10 * k, std::cout);
-
-    return (int)good;
-}
-
-class GemmTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(GemmTests, Bfloat16Bfloat16FloatPrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmTests, HalfHalfFloatPrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmTests, RealHalfPrecision) {
-    sycl::half alpha(2.0);
-    sycl::half beta(3.0);
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 3, 8, 9, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmTestSuite, GemmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/gemm_usm.cpp b/tests/unit_tests/blas/level3/gemm_usm.cpp
deleted file mode 100644
index 9d5d8d048..000000000
--- a/tests/unit_tests/blas/level3/gemm_usm.cpp
+++ /dev/null
@@ -1,312 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename Ta, typename Tc>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa,
-         oneapi::mkl::transpose transb, int m, int n, int k, int lda, int ldb, int ldc, Tc alpha,
-         Tc beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during GEMM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<Ta, usm::alloc::shared, 64>(cxt, *dev);
-    auto uc = usm_allocator<Tc, usm::alloc::shared, 64>(cxt, *dev);
-    vector<Ta, decltype(ua)> A(ua), B(ua);
-    vector<Tc, decltype(uc)> C(ua);
-    rand_matrix(A, layout, transa, m, k, lda);
-    rand_matrix(B, layout, transb, k, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-
-    auto C_ref = C;
-
-    // Call Reference GEMM.
-    const int m_ref = m, n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using ta_ref = typename ref_type_info<Ta>::type;
-    using tc_ref = typename ref_type_info<Tc>::type;
-
-    ::gemm(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa),
-           convert_to_cblas_trans(transb), &m_ref, &n_ref, &k_ref, (tc_ref*)&alpha,
-           (ta_ref*)A.data(), &lda_ref, (ta_ref*)B.data(), &ldb_ref, (tc_ref*)&beta,
-           (tc_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ GEMM.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::gemm(main_queue, transa, transb, m, n, k,
-                                                             alpha, A.data(), lda, B.data(), ldb,
-                                                             beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::gemm(main_queue, transa, transb, m, n, k,
-                                                          alpha, A.data(), lda, B.data(), ldb, beta,
-                                                          C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm, transa,
-                                        transb, m, n, k, alpha, A.data(), lda, B.data(), ldb, beta,
-                                        C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm, transa,
-                                        transb, m, n, k, alpha, A.data(), lda, B.data(), ldb, beta,
-                                        C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during GEMM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of GEMM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(C, C_ref, layout, m, n, ldc, 10 * k, std::cout);
-
-    return (int)good;
-}
-
-class GemmUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(GemmUsmTests, Bfloat16Bfloat16FloatPrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<oneapi::mkl::bfloat16, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmUsmTests, HalfHalfFloatPrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmUsmTests, RealHalfPrecision) {
-    sycl::half alpha(2.0);
-    sycl::half beta(3.0);
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<sycl::half, sycl::half>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<float, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<double, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-TEST_P(GemmUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::nontrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::trans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::nontrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::trans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::transpose::conjtrans,
-        oneapi::mkl::transpose::conjtrans, 79, 83, 91, 103, 105, 106, alpha, beta)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GemmUsmTestSuite, GemmUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/hemm.cpp b/tests/unit_tests/blas/level3/hemm.cpp
deleted file mode 100644
index ce050e97d..000000000
--- a/tests/unit_tests/blas/level3/hemm.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-// Runs one buffer-API HEMM case on `dev` and compares it with the reference HEMM.
-//
-// A is generated as m x m when left_right is side::left and n x n otherwise; B and C
-// are m x n. All matrices use the requested layout and the given leading dimensions.
-//
-// Returns test_skipped if the call throws oneapi::mkl::unimplemented; otherwise 1 when
-// the device result matches the reference and 0 when it does not. SYCL exceptions and
-// std::runtime_error are only logged, so the final comparison still decides the result.
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, int m, int n, int lda, int ldb, int ldc, fp alpha,
-         fp beta) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
-    // A has to be generated with the same layout as B and C on both sides; otherwise
-    // the right-side cases would not follow the layout under test.
-    if (left_right == oneapi::mkl::side::left)
-        rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, m, lda);
-    else
-        rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    // Keep a copy of the initial C for the reference computation.
-    C_ref = C;
-
-    // Call Reference HEMM.
-    const int m_ref = m, n_ref = n;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::hemm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ HEMM.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during HEMM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    // Buffers wrap the host vectors; the result is read back through a host accessor.
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        // Run-time dispatch: the backend is chosen from the queue.
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::hemm(main_queue, left_right, upper_lower, m, n,
-                                                      alpha, A_buffer, lda, B_buffer, ldb, beta,
-                                                      C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::hemm(main_queue, left_right, upper_lower, m, n, alpha,
-                                                   A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                   ldc);
-                break;
-            default: break;
-        }
-#else
-        // Compile-time dispatch through the test helper macro.
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hemm,
-                                        left_right, upper_lower, m, n, alpha, A_buffer, lda,
-                                        B_buffer, ldb, beta, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hemm, left_right,
-                                        upper_lower, m, n, alpha, A_buffer, lda, B_buffer, ldb,
-                                        beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during HEMM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        // The backend does not provide this routine: report a skip, not a failure.
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of HEMM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, layout, m, n, ldc, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-// Parameterized fixture for the buffer-API HEMM tests; each instance receives a
-// (device pointer, storage layout) tuple through GetParam().
-class HemmTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-// Single-precision complex HEMM: exercises both sides and both triangles of A with
-// one fixed set of sizes and leading dimensions.
-TEST_P(HemmTests, ComplexSinglePrecision) {
-    using scalar_t = std::complex<float>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const scalar_t alpha(2.0, -0.5);
-    const scalar_t beta(3.0, -1.5);
-
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-}
-// Double-precision complex HEMM: same four side/triangle combinations as the
-// single-precision test, preceded by the double-support check on the device.
-TEST_P(HemmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    using scalar_t = std::complex<double>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const scalar_t alpha(2.0, -0.5);
-    const scalar_t beta(3.0, -1.5);
-
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-}
-
-// Instantiate HemmTests for every combination of an entry of `devices` and the two
-// storage layouts (column-major, row-major). LayoutDeviceNamePrint is passed as the
-// generator for the per-instance test names.
-INSTANTIATE_TEST_SUITE_P(HemmTestSuite, HemmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/hemm_usm.cpp b/tests/unit_tests/blas/level3/hemm_usm.cpp
deleted file mode 100644
index eafb06ea5..000000000
--- a/tests/unit_tests/blas/level3/hemm_usm.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-// Runs one USM-API HEMM case on `dev` and compares it with the reference HEMM.
-//
-// A is m x m for side::left and n x n otherwise; B and C are m x n. Returns
-// test_skipped when the call throws oneapi::mkl::unimplemented, otherwise 1 if the
-// device result matches the reference and 0 if it does not.
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, int m, int n, int lda, int ldb, int ldc, fp alpha,
-         fp beta) {
-    // Asynchronous SYCL errors are reported one by one and otherwise ignored.
-    auto async_reporter = [](exception_list pending_errors) {
-        for (std::exception_ptr const& pending : pending_errors) {
-            try {
-                std::rethrow_exception(pending);
-            }
-            catch (exception const& sycl_error) {
-                std::cout << "Caught asynchronous SYCL exception during HEMM:\n"
-                          << sycl_error.what() << std::endl;
-                print_error_code(sycl_error);
-            }
-        }
-    };
-
-    queue main_queue(*dev, async_reporter);
-    event done;
-    std::vector<event> dependencies;
-
-    // All matrices live in shared USM obtained from the queue's context.
-    usm_allocator<fp, usm::alloc::shared, 64> shared_alloc(main_queue.get_context(), *dev);
-    using usm_vector = vector<fp, decltype(shared_alloc)>;
-    usm_vector A(shared_alloc), B(shared_alloc), C(shared_alloc);
-
-    // A is square; its order follows the side it is applied on.
-    const int a_order = (left_right == oneapi::mkl::side::left) ? m : n;
-    rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, a_order, a_order, lda);
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-
-    // The reference result is computed in place on a copy of the initial C.
-    usm_vector C_ref(C);
-
-    // Reference HEMM takes its integer arguments by pointer.
-    using fp_ref = typename ref_type_info<fp>::type;
-    const int m_ref = m;
-    const int n_ref = n;
-    const int lda_ref = lda;
-    const int ldb_ref = ldb;
-    const int ldc_ref = ldc;
-    ::hemm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Device HEMM, dispatched either at run time or through the compile-time macro.
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::hemm(main_queue, left_right, upper_lower, m,
-                                                             n, alpha, A.data(), lda, B.data(), ldb,
-                                                             beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::hemm(main_queue, left_right, upper_lower, m, n,
-                                                          alpha, A.data(), lda, B.data(), ldb, beta,
-                                                          C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::hemm,
-                                        left_right, upper_lower, m, n, alpha, A.data(), lda,
-                                        B.data(), ldb, beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::hemm, left_right,
-                                        upper_lower, m, n, alpha, A.data(), lda, B.data(), ldb,
-                                        beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& sycl_error) {
-        std::cout << "Caught synchronous SYCL exception during HEMM:\n"
-                  << sycl_error.what() << std::endl;
-        print_error_code(sycl_error);
-    }
-
-    catch (const oneapi::mkl::unimplemented&) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& failure) {
-        std::cout << "Error raised during execution of HEMM:\n" << failure.what() << std::endl;
-    }
-
-    // Pass/fail is decided solely by comparing the device output with the reference.
-    const bool matches =
-        check_equal_matrix(C, C_ref, layout, m, n, ldc, 10 * std::max(m, n), std::cout);
-
-    return static_cast<int>(matches);
-}
-
-// Parameterized fixture for the USM-API HEMM tests; each instance receives a
-// (device pointer, storage layout) tuple through GetParam().
-class HemmUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-// Single-precision complex USM HEMM: exercises both sides and both triangles of A
-// with one fixed set of sizes and leading dimensions.
-TEST_P(HemmUsmTests, ComplexSinglePrecision) {
-    using scalar_t = std::complex<float>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const scalar_t alpha(2.0, -0.5);
-    const scalar_t beta(3.0, -1.5);
-
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-}
-// Double-precision complex USM HEMM: same four side/triangle combinations as the
-// single-precision test, preceded by the double-support check on the device.
-TEST_P(HemmUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    using scalar_t = std::complex<double>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const scalar_t alpha(2.0, -0.5);
-    const scalar_t beta(3.0, -1.5);
-
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::left,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::lower, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-    EXPECT_TRUEORSKIP(test<scalar_t>(dev_ptr, mat_layout, oneapi::mkl::side::right,
-                                     oneapi::mkl::uplo::upper, 72, 27, 101, 102, 103, alpha,
-                                     beta));
-}
-
-// Instantiate HemmUsmTests for every combination of an entry of `devices` and the
-// two storage layouts (column-major, row-major). LayoutDeviceNamePrint is passed as
-// the generator for the per-instance test names.
-INSTANTIATE_TEST_SUITE_P(HemmUsmTestSuite, HemmUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/her2k.cpp b/tests/unit_tests/blas/level3/her2k.cpp
deleted file mode 100644
index ce57041d9..000000000
--- a/tests/unit_tests/blas/level3/her2k.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-// Runs one buffer-API HER2K case on `dev` and compares it with the reference HER2K.
-//
-// A and B are generated with rand_matrix(..., trans, n, k, ld*); C is n x n. `alpha`
-// has the matrix element type `fp`, `beta` has the scalar type `fp_scalar`.
-//
-// Returns test_skipped if the call throws oneapi::mkl::unimplemented; otherwise 1 when
-// the device result matches the reference and 0 when it does not. SYCL exceptions and
-// std::runtime_error are only logged, so the final comparison still decides the result.
-template <typename fp, typename fp_scalar>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldb, int ldc, fp alpha,
-         fp_scalar beta) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(B, layout, trans, n, k, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-    // Keep a copy of the initial C for the reference computation.
-    C_ref = C;
-
-    // Call Reference HER2K.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    ::her2k(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-            convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-            &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_scalar_mkl*)&beta, (fp_ref*)C_ref.data(),
-            &ldc_ref);
-
-    // Call DPC++ HER2K.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during HER2K:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    // Buffers wrap the host vectors; the result is read back through a host accessor.
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        // Run-time dispatch: the backend is chosen from the queue.
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::her2k(main_queue, upper_lower, trans, n, k, alpha,
-                                                       A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                       ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::her2k(main_queue, upper_lower, trans, n, k, alpha,
-                                                    A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                    ldc);
-                break;
-            default: break;
-        }
-#else
-        // Compile-time dispatch through the test helper macro.
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::her2k,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, B_buffer,
-                                        ldb, beta, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::her2k,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, B_buffer,
-                                        ldb, beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during HER2K:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        // The backend does not provide this routine: report a skip, not a failure.
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of HER2K:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-// Parameterized fixture for the buffer-API HER2K tests; each instance receives a
-// (device pointer, storage layout) tuple through GetParam().
-class Her2kTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-// Single-precision complex HER2K: exercises both triangles of C with nontrans and
-// conjtrans, using one fixed set of sizes and leading dimensions.
-TEST_P(Her2kTests, ComplexSinglePrecision) {
-    using real_t = float;
-    using complex_t = std::complex<real_t>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const complex_t alpha(2.0, -0.5);
-    const real_t beta(1.0);
-
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-}
-// Double-precision complex HER2K: same four triangle/transpose combinations as the
-// single-precision test, preceded by the double-support check on the device.
-TEST_P(Her2kTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    using real_t = double;
-    using complex_t = std::complex<real_t>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const complex_t alpha(2.0, -0.5);
-    const real_t beta(1.0);
-
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-}
-
-// Instantiate Her2kTests for every combination of an entry of `devices` and the two
-// storage layouts (column-major, row-major). LayoutDeviceNamePrint is passed as the
-// generator for the per-instance test names.
-INSTANTIATE_TEST_SUITE_P(Her2kTestSuite, Her2kTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/her2k_usm.cpp b/tests/unit_tests/blas/level3/her2k_usm.cpp
deleted file mode 100644
index a4ada6cb2..000000000
--- a/tests/unit_tests/blas/level3/her2k_usm.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-// Runs one USM-API HER2K case on `dev` and compares it with the reference HER2K.
-//
-// A and B are generated with rand_matrix(..., trans, n, k, ld*); C is n x n. `alpha`
-// has the matrix element type `fp`, `beta` has the scalar type `fp_scalar`.
-//
-// Returns test_skipped if the call throws oneapi::mkl::unimplemented; otherwise 1 when
-// the device result matches the reference and 0 when it does not. SYCL exceptions and
-// std::runtime_error are only logged, so the final comparison still decides the result.
-template <typename fp, typename fp_scalar>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldb, int ldc, fp alpha,
-         fp_scalar beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during HER2K:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data. All matrices live in shared USM obtained from the queue's context.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(B, layout, trans, n, k, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-
-    // Keep a copy of the initial C for the reference computation.
-    auto C_ref = C;
-
-    // Call Reference HER2K.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-    using fp_scalar_mkl = typename ref_type_info<fp_scalar>::type;
-
-    ::her2k(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-            convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-            &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_scalar_mkl*)&beta, (fp_ref*)C_ref.data(),
-            &ldc_ref);
-
-    // Call DPC++ HER2K.
-
-    try {
-#ifdef CALL_RT_API
-        // Run-time dispatch: wait on the event returned by the call.
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::her2k(main_queue, upper_lower, trans, n, k,
-                                                              alpha, A.data(), lda, B.data(), ldb,
-                                                              beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::her2k(main_queue, upper_lower, trans, n, k,
-                                                           alpha, A.data(), lda, B.data(), ldb,
-                                                           beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        // Compile-time dispatch through the test helper macro: wait on the whole queue.
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::her2k,
-                                        upper_lower, trans, n, k, alpha, A.data(), lda, B.data(),
-                                        ldb, beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::her2k,
-                                        upper_lower, trans, n, k, alpha, A.data(), lda, B.data(),
-                                        ldb, beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during HER2K:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        // The backend does not provide this routine: report a skip, not a failure.
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of HER2K:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(C, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-// Parameterized fixture for the USM-API HER2K tests; each instance receives a
-// (device pointer, storage layout) tuple through GetParam().
-class Her2kUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-// Single-precision complex USM HER2K: exercises both triangles of C with nontrans
-// and conjtrans, using one fixed set of sizes and leading dimensions.
-TEST_P(Her2kUsmTests, ComplexSinglePrecision) {
-    using real_t = float;
-    using complex_t = std::complex<real_t>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const complex_t alpha(2.0, -0.5);
-    const real_t beta(1.0);
-
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-}
-// Double-precision complex USM HER2K: same four triangle/transpose combinations as
-// the single-precision test, preceded by the double-support check on the device.
-TEST_P(Her2kUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    using real_t = double;
-    using complex_t = std::complex<real_t>;
-    sycl::device* const dev_ptr = std::get<0>(GetParam());
-    const oneapi::mkl::layout mat_layout = std::get<1>(GetParam());
-    const complex_t alpha(2.0, -0.5);
-    const real_t beta(1.0);
-
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::nontrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::lower,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<complex_t, real_t>(dev_ptr, mat_layout, oneapi::mkl::uplo::upper,
-                                               oneapi::mkl::transpose::conjtrans, 72, 27, 101,
-                                               102, 103, alpha, beta)));
-}
-
-// Instantiate Her2kUsmTests for every combination of an entry of `devices` and the
-// two storage layouts (column-major, row-major). LayoutDeviceNamePrint is passed as
-// the generator for the per-instance test names.
-INSTANTIATE_TEST_SUITE_P(Her2kUsmTestSuite, Her2kUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/herk.cpp b/tests/unit_tests/blas/level3/herk.cpp
deleted file mode 100644
index f908a77b7..000000000
--- a/tests/unit_tests/blas/level3/herk.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldc, fp_scalar alpha,
-         fp_scalar beta) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, C, C_ref;
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-    C_ref = C;
-
-    // Call Reference HERK.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::herk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_scalar*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_scalar*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ HERK.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during HERK:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::herk(main_queue, upper_lower, trans, n, k, alpha,
-                                                      A_buffer, lda, beta, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::herk(main_queue, upper_lower, trans, n, k, alpha,
-                                                   A_buffer, lda, beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::herk,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, beta,
-                                        C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::herk, upper_lower,
-                                        trans, n, k, alpha, A_buffer, lda, beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during HERK:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of HERK:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-class HerkTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(HerkTests, ComplexSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-}
-TEST_P(HerkTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-}
-
-INSTANTIATE_TEST_SUITE_P(HerkTestSuite, HerkTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/herk_usm.cpp b/tests/unit_tests/blas/level3/herk_usm.cpp
deleted file mode 100644
index 470159c63..000000000
--- a/tests/unit_tests/blas/level3/herk_usm.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp, typename fp_scalar>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldc, fp_scalar alpha,
-         fp_scalar beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during HERK:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), C(ua);
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-
-    auto C_ref = C;
-
-    // Call Reference HERK.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::herk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_scalar*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_scalar*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ HERK.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::herk(main_queue, upper_lower, trans, n, k,
-                                                             alpha, A.data(), lda, beta, C.data(),
-                                                             ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::herk(main_queue, upper_lower, trans, n, k,
-                                                          alpha, A.data(), lda, beta, C.data(), ldc,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::herk,
-                                        upper_lower, trans, n, k, alpha, A.data(), lda, beta,
-                                        C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::herk, upper_lower,
-                                        trans, n, k, alpha, A.data(), lda, beta, C.data(), ldc,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during HERK:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of HERK:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(C, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-class HerkUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(HerkUsmTests, ComplexSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<float>, float>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-}
-TEST_P(HerkUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-    EXPECT_TRUEORSKIP((test<std::complex<double>, double>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::conjtrans, 72, 27, 101, 103, alpha, beta)));
-}
-
-INSTANTIATE_TEST_SUITE_P(HerkUsmTestSuite, HerkUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/symm.cpp b/tests/unit_tests/blas/level3/symm.cpp
deleted file mode 100644
index 3f6920370..000000000
--- a/tests/unit_tests/blas/level3/symm.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, int m, int n, int lda, int ldb, int ldc, fp alpha,
-         fp beta) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
-    if (left_right == oneapi::mkl::side::left)
-        rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, m, lda);
-    else
-        rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-    C_ref = C;
-
-    // Call Reference SYMM.
-    const int m_ref = m, n_ref = n;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::symm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ SYMM.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SYMM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::symm(main_queue, left_right, upper_lower, m, n,
-                                                      alpha, A_buffer, lda, B_buffer, ldb, beta,
-                                                      C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::symm(main_queue, left_right, upper_lower, m, n, alpha,
-                                                   A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                   ldc);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::symm,
-                                        left_right, upper_lower, m, n, alpha, A_buffer, lda,
-                                        B_buffer, ldb, beta, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::symm, left_right,
-                                        upper_lower, m, n, alpha, A_buffer, lda, B_buffer, ldb,
-                                        beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SYMM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SYMM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, layout, m, n, ldc, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-class SymmTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(SymmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                  102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                  102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                  102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                  102, 103, alpha, beta));
-}
-TEST_P(SymmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                   102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                   102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                   102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                   102, 103, alpha, beta));
-}
-TEST_P(SymmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                72, 27, 101, 102, 103, alpha, beta));
-}
-TEST_P(SymmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(SymmTestSuite, SymmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/symm_usm.cpp b/tests/unit_tests/blas/level3/symm_usm.cpp
deleted file mode 100644
index f774e82e3..000000000
--- a/tests/unit_tests/blas/level3/symm_usm.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, int m, int n, int lda, int ldb, int ldc, fp alpha,
-         fp beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SYMM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
-    if (left_right == oneapi::mkl::side::left)
-        rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, m, m, lda);
-    else
-        rand_matrix(A, layout, oneapi::mkl::transpose::nontrans, n, n, lda);
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, m, n, ldc);
-
-    auto C_ref = C;
-
-    // Call Reference SYMM.
-    const int m_ref = m, n_ref = n;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::symm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ SYMM.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::symm(main_queue, left_right, upper_lower, m,
-                                                             n, alpha, A.data(), lda, B.data(), ldb,
-                                                             beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::symm(main_queue, left_right, upper_lower, m, n,
-                                                          alpha, A.data(), lda, B.data(), ldb, beta,
-                                                          C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::symm,
-                                        left_right, upper_lower, m, n, alpha, A.data(), lda,
-                                        B.data(), ldb, beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::symm, left_right,
-                                        upper_lower, m, n, alpha, A.data(), lda, B.data(), ldb,
-                                        beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SYMM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SYMM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(C, C_ref, layout, m, n, ldc, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-class SymmUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(SymmUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                  102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                  102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                  102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                  102, 103, alpha, beta));
-}
-TEST_P(SymmUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                   102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower, 72, 27, 101,
-                                   102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                   102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper, 72, 27, 101,
-                                   102, 103, alpha, beta));
-}
-TEST_P(SymmUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                72, 27, 101, 102, 103, alpha, beta));
-}
-TEST_P(SymmUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 72, 27, 101, 102, 103, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(SymmUsmTestSuite, SymmUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/syr2k.cpp b/tests/unit_tests/blas/level3/syr2k.cpp
deleted file mode 100644
index 0153e9ec0..000000000
--- a/tests/unit_tests/blas/level3/syr2k.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, B, C, C_ref;
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(B, layout, trans, n, k, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-    C_ref = C;
-
-    // Call Reference SYR2K.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::syr2k(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-            convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-            &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ SYR2K.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SYR2K:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::syr2k(main_queue, upper_lower, trans, n, k, alpha,
-                                                       A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                       ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::syr2k(main_queue, upper_lower, trans, n, k, alpha,
-                                                    A_buffer, lda, B_buffer, ldb, beta, C_buffer,
-                                                    ldc);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syr2k,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, B_buffer,
-                                        ldb, beta, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syr2k,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, B_buffer,
-                                        ldb, beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SYR2K:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SYR2K:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-class Syr2kTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(Syr2kTests, RealSinglePrecision) {
-    float alpha(3.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 102, 103, alpha, beta));
-}
-TEST_P(Syr2kTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(3.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 102, 103, alpha, beta));
-}
-TEST_P(Syr2kTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(3.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-}
-TEST_P(Syr2kTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(3.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(Syr2kTestSuite, Syr2kTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/syr2k_usm.cpp b/tests/unit_tests/blas/level3/syr2k_usm.cpp
deleted file mode 100644
index efa3f07d3..000000000
--- a/tests/unit_tests/blas/level3/syr2k_usm.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldb, int ldc, fp alpha, fp beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SYR2K:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua), C(ua);
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(B, layout, trans, n, k, ldb);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-
-    auto C_ref = C;
-
-    // Call Reference SYR2K.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldb_ref = ldb, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::syr2k(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-            convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-            &lda_ref, (fp_ref*)B.data(), &ldb_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ SYR2K.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::syr2k(main_queue, upper_lower, trans, n, k,
-                                                              alpha, A.data(), lda, B.data(), ldb,
-                                                              beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::syr2k(main_queue, upper_lower, trans, n, k,
-                                                           alpha, A.data(), lda, B.data(), ldb,
-                                                           beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syr2k,
-                                        upper_lower, trans, n, k, alpha, A.data(), lda, B.data(),
-                                        ldb, beta, C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syr2k,
-                                        upper_lower, trans, n, k, alpha, A.data(), lda, B.data(),
-                                        ldb, beta, C.data(), ldc, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SYR2K:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SYR2K:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(C, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-class Syr2kUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(Syr2kUsmTests, RealSinglePrecision) {
-    float alpha(3.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 102, 103, alpha, beta));
-}
-TEST_P(Syr2kUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(3.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 102, 103, alpha, beta));
-}
-TEST_P(Syr2kUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(3.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-}
-TEST_P(Syr2kUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(3.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 102, 103, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(Syr2kUsmTestSuite, Syr2kUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/syrk.cpp b/tests/unit_tests/blas/level3/syrk.cpp
deleted file mode 100644
index a6b28735d..000000000
--- a/tests/unit_tests/blas/level3/syrk.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldc, fp alpha, fp beta) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, C, C_ref;
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-    C_ref = C;
-
-    // Call Reference SYRK.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::syrk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ SYRK.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SYRK:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> C_buffer(C.data(), range<1>(C.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::syrk(main_queue, upper_lower, trans, n, k, alpha,
-                                                      A_buffer, lda, beta, C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::syrk(main_queue, upper_lower, trans, n, k, alpha,
-                                                   A_buffer, lda, beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syrk,
-                                        upper_lower, trans, n, k, alpha, A_buffer, lda, beta,
-                                        C_buffer, ldc);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syrk, upper_lower,
-                                        trans, n, k, alpha, A_buffer, lda, beta, C_buffer, ldc);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SYRK:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SYRK:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto C_accessor = C_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(C_accessor, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-class SyrkTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(SyrkTests, RealSinglePrecision) {
-    float alpha(3.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 103, alpha, beta));
-}
-TEST_P(SyrkTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(3.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 103, alpha, beta));
-}
-TEST_P(SyrkTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(3.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-}
-TEST_P(SyrkTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(3.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(SyrkTestSuite, SyrkTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/syrk_usm.cpp b/tests/unit_tests/blas/level3/syrk_usm.cpp
deleted file mode 100644
index e5569eb78..000000000
--- a/tests/unit_tests/blas/level3/syrk_usm.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower,
-         oneapi::mkl::transpose trans, int n, int k, int lda, int ldc, fp alpha, fp beta) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during SYRK:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), C(ua);
-    rand_matrix(A, layout, trans, n, k, lda);
-    rand_matrix(C, layout, oneapi::mkl::transpose::nontrans, n, n, ldc);
-
-    auto C_ref = C;
-
-    // Call Reference SYRK.
-    const int n_ref = n, k_ref = k;
-    const int lda_ref = lda, ldc_ref = ldc;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-    ::syrk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower),
-           convert_to_cblas_trans(trans), &n_ref, &k_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)&beta, (fp_ref*)C_ref.data(), &ldc_ref);
-
-    // Call DPC++ SYRK.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::syrk(main_queue, upper_lower, trans, n, k,
-                                                             alpha, A.data(), lda, beta, C.data(),
-                                                             ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::syrk(main_queue, upper_lower, trans, n, k,
-                                                          alpha, A.data(), lda, beta, C.data(), ldc,
-                                                          dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syrk,
-                                        upper_lower, trans, n, k, alpha, A.data(), lda, beta,
-                                        C.data(), ldc, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syrk, upper_lower,
-                                        trans, n, k, alpha, A.data(), lda, beta, C.data(), ldc,
-                                        dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during SYRK:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of SYRK:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(C, C_ref, layout, n, n, ldc, 10 * std::max(n, k), std::cout);
-
-    return (int)good;
-}
-
-class SyrkUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(SyrkUsmTests, RealSinglePrecision) {
-    float alpha(3.0);
-    float beta(3.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                  27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                  101, 103, alpha, beta));
-}
-TEST_P(SyrkUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(3.0);
-    double beta(3.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, 73,
-                                   27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, 73, 27,
-                                   101, 103, alpha, beta));
-}
-TEST_P(SyrkUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(3.0, -0.5);
-    std::complex<float> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-}
-TEST_P(SyrkUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(3.0, -0.5);
-    std::complex<double> beta(3.0, -1.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::nontrans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::lower,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::uplo::upper,
-        oneapi::mkl::transpose::trans, 73, 27, 101, 103, alpha, beta));
-}
-
-INSTANTIATE_TEST_SUITE_P(SyrkUsmTestSuite, SyrkUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/trmm.cpp b/tests/unit_tests/blas/level3/trmm.cpp
deleted file mode 100644
index 2a02aa0d1..000000000
--- a/tests/unit_tests/blas/level3/trmm.cpp
+++ /dev/null
@@ -1,366 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-         oneapi::mkl::diag unit_nonunit, int m, int n, int lda, int ldb, fp alpha) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, B, B_ref;
-    if (left_right == oneapi::mkl::side::right)
-        rand_matrix(A, layout, transa, n, n, lda);
-    else
-        rand_matrix(A, layout, transa, m, m, lda);
-
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    B_ref = B;
-
-    // Call Reference TRMM.
-    const int m_ref = m, n_ref = n;
-    const int lda_ref = lda, ldb_ref = ldb;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trmm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-           convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref);
-
-    // Call DPC++ TRMM.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRMM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::trmm(main_queue, left_right, upper_lower, transa,
-                                                      unit_nonunit, m, n, alpha, A_buffer, lda,
-                                                      B_buffer, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::trmm(main_queue, left_right, upper_lower, transa,
-                                                   unit_nonunit, m, n, alpha, A_buffer, lda,
-                                                   B_buffer, ldb);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trmm,
-                                        left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                                        A_buffer, lda, B_buffer, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trmm, left_right,
-                                        upper_lower, transa, unit_nonunit, m, n, alpha, A_buffer,
-                                        lda, B_buffer, ldb);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRMM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRMM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto B_accessor = B_buffer.get_host_access(read_only);
-    bool good =
-        check_equal_matrix(B_accessor, B_ref, layout, m, n, ldb, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-class TrmmTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TrmmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-}
-TEST_P(TrmmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-}
-TEST_P(TrmmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-TEST_P(TrmmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrmmTestSuite, TrmmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/trmm_usm.cpp b/tests/unit_tests/blas/level3/trmm_usm.cpp
deleted file mode 100644
index 1fa9bbdb0..000000000
--- a/tests/unit_tests/blas/level3/trmm_usm.cpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-         oneapi::mkl::diag unit_nonunit, int m, int n, int lda, int ldb, fp alpha) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRMM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua);
-    if (left_right == oneapi::mkl::side::right)
-        rand_matrix(A, layout, transa, n, n, lda);
-    else
-        rand_matrix(A, layout, transa, m, m, lda);
-
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-
-    auto B_ref = B;
-
-    // Call Reference TRMM.
-    const int m_ref = m, n_ref = n;
-    const int lda_ref = lda, ldb_ref = ldb;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trmm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-           convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref);
-
-    // Call DPC++ TRMM.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::trmm(
-                    main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                    A.data(), lda, B.data(), ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::trmm(
-                    main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                    A.data(), lda, B.data(), ldb, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trmm,
-                                        left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                                        A.data(), lda, B.data(), ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trmm, left_right,
-                                        upper_lower, transa, unit_nonunit, m, n, alpha, A.data(),
-                                        lda, B.data(), ldb, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRMM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRMM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good = check_equal_matrix(B, B_ref, layout, m, n, ldb, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-class TrmmUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TrmmUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-}
-TEST_P(TrmmUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-}
-TEST_P(TrmmUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-TEST_P(TrmmUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrmmUsmTestSuite, TrmmUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/trsm.cpp b/tests/unit_tests/blas/level3/trsm.cpp
deleted file mode 100644
index 90b8d5c93..000000000
--- a/tests/unit_tests/blas/level3/trsm.cpp
+++ /dev/null
@@ -1,494 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "allocator_helper.hpp"
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-         oneapi::mkl::diag unit_nonunit, int m, int n, int lda, int ldb, fp alpha) {
-    // Prepare data.
-    vector<fp, allocator_helper<fp, 64>> A, B, B_ref;
-    if (left_right == oneapi::mkl::side::right)
-        rand_trsm_matrix(A, layout, transa, n, n, lda);
-    else
-        rand_trsm_matrix(A, layout, transa, m, m, lda);
-
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-    B_ref = B;
-
-    // Call Reference TRSM.
-    const int m_ref = m, n_ref = n;
-    const int lda_ref = lda, ldb_ref = ldb;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trsm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-           convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref);
-
-    // Call DPC++ TRSM.
-
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-
-    buffer<fp, 1> A_buffer(A.data(), range<1>(A.size()));
-    buffer<fp, 1> B_buffer(B.data(), range<1>(B.size()));
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                oneapi::mkl::blas::column_major::trsm(main_queue, left_right, upper_lower, transa,
-                                                      unit_nonunit, m, n, alpha, A_buffer, lda,
-                                                      B_buffer, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                oneapi::mkl::blas::row_major::trsm(main_queue, left_right, upper_lower, transa,
-                                                   unit_nonunit, m, n, alpha, A_buffer, lda,
-                                                   B_buffer, ldb);
-                break;
-            default: break;
-        }
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsm,
-                                        left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                                        A_buffer, lda, B_buffer, ldb);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsm, left_right,
-                                        upper_lower, transa, unit_nonunit, m, n, alpha, A_buffer,
-                                        lda, B_buffer, ldb);
-                break;
-            default: break;
-        }
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRSM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRSM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto B_accessor = B_buffer.get_host_access(read_only);
-    bool good = check_equal_trsm_matrix(B_accessor, B_ref, layout, m, n, ldb, 10 * std::max(m, n),
-                                        std::cout);
-
-    return (int)good;
-}
-
-class TrsmTests : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {
-};
-
-TEST_P(TrsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-}
-TEST_P(TrsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-}
-TEST_P(TrsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-TEST_P(TrsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsmTestSuite, TrsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/blas/level3/trsm_usm.cpp b/tests/unit_tests/blas/level3/trsm_usm.cpp
deleted file mode 100644
index f84b0ed61..000000000
--- a/tests/unit_tests/blas/level3/trsm_usm.cpp
+++ /dev/null
@@ -1,497 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "cblas.h"
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "onemkl_blas_helper.hpp"
-#include "reference_blas_templates.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-using namespace sycl;
-using std::vector;
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-template <typename fp>
-int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right,
-         oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa,
-         oneapi::mkl::diag unit_nonunit, int m, int n, int lda, int ldb, fp alpha) {
-    // Catch asynchronous exceptions.
-    auto exception_handler = [](exception_list exceptions) {
-        for (std::exception_ptr const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (exception const& e) {
-                std::cout << "Caught asynchronous SYCL exception during TRSM:\n"
-                          << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    };
-
-    queue main_queue(*dev, exception_handler);
-    context cxt = main_queue.get_context();
-    event done;
-    std::vector<event> dependencies;
-
-    // Prepare data.
-    auto ua = usm_allocator<fp, usm::alloc::shared, 64>(cxt, *dev);
-    vector<fp, decltype(ua)> A(ua), B(ua);
-    if (left_right == oneapi::mkl::side::right)
-        rand_trsm_matrix(A, layout, transa, n, n, lda);
-    else
-        rand_trsm_matrix(A, layout, transa, m, m, lda);
-
-    rand_matrix(B, layout, oneapi::mkl::transpose::nontrans, m, n, ldb);
-
-    auto B_ref = B;
-
-    // Call Reference TRSM.
-    const int m_ref = m, n_ref = n;
-    const int lda_ref = lda, ldb_ref = ldb;
-
-    using fp_ref = typename ref_type_info<fp>::type;
-
-    ::trsm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right),
-           convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa),
-           convert_to_cblas_diag(unit_nonunit), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)A.data(),
-           &lda_ref, (fp_ref*)B_ref.data(), &ldb_ref);
-
-    // Call DPC++ TRSM.
-
-    try {
-#ifdef CALL_RT_API
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                done = oneapi::mkl::blas::column_major::trsm(
-                    main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                    A.data(), lda, B.data(), ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                done = oneapi::mkl::blas::row_major::trsm(
-                    main_queue, left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                    A.data(), lda, B.data(), ldb, dependencies);
-                break;
-            default: break;
-        }
-        done.wait();
-#else
-        switch (layout) {
-            case oneapi::mkl::layout::col_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsm,
-                                        left_right, upper_lower, transa, unit_nonunit, m, n, alpha,
-                                        A.data(), lda, B.data(), ldb, dependencies);
-                break;
-            case oneapi::mkl::layout::row_major:
-                TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsm, left_right,
-                                        upper_lower, transa, unit_nonunit, m, n, alpha, A.data(),
-                                        lda, B.data(), ldb, dependencies);
-                break;
-            default: break;
-        }
-        main_queue.wait();
-#endif
-    }
-    catch (exception const& e) {
-        std::cout << "Caught synchronous SYCL exception during TRSM:\n" << e.what() << std::endl;
-        print_error_code(e);
-    }
-
-    catch (const oneapi::mkl::unimplemented& e) {
-        return test_skipped;
-    }
-
-    catch (const std::runtime_error& error) {
-        std::cout << "Error raised during execution of TRSM:\n" << error.what() << std::endl;
-    }
-
-    // Compare the results of reference implementation and DPC++ implementation.
-
-    bool good =
-        check_equal_trsm_matrix(B, B_ref, layout, m, n, ldb, 10 * std::max(m, n), std::cout);
-
-    return (int)good;
-}
-
-class TrsmUsmTests
-        : public ::testing::TestWithParam<std::tuple<sycl::device*, oneapi::mkl::layout>> {};
-
-TEST_P(TrsmUsmTests, RealSinglePrecision) {
-    float alpha(2.0);
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                  27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<float>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                  oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                  oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-                                  101, 102, alpha));
-}
-TEST_P(TrsmUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    double alpha(2.0);
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::unit, 72, 27,
-                                   101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<double>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                   oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                   oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72,
-                                   27, 101, 102, alpha));
-}
-TEST_P(TrsmUsmTests, ComplexSinglePrecision) {
-    std::complex<float> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::nontrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::trans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::transpose::conjtrans,
-                                                oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<float>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-TEST_P(TrsmUsmTests, ComplexDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam()));
-
-    std::complex<double> alpha(2.0, -0.5);
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::nontrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::trans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::lower,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::left, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                                                 oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                                                 oneapi::mkl::transpose::conjtrans,
-                                                 oneapi::mkl::diag::unit, 72, 27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::trans, oneapi::mkl::diag::nonunit, 72, 27,
-        101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::lower, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::left,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-    EXPECT_TRUEORSKIP(test<std::complex<double>>(
-        std::get<0>(GetParam()), std::get<1>(GetParam()), oneapi::mkl::side::right,
-        oneapi::mkl::uplo::upper, oneapi::mkl::transpose::conjtrans, oneapi::mkl::diag::nonunit, 72,
-        27, 101, 102, alpha));
-}
-
-INSTANTIATE_TEST_SUITE_P(TrsmUsmTestSuite, TrsmUsmTests,
-                         ::testing::Combine(testing::ValuesIn(devices),
-                                            testing::Values(oneapi::mkl::layout::col_major,
-                                                            oneapi::mkl::layout::row_major)),
-                         ::LayoutDeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/dft/CMakeLists.txt b/tests/unit_tests/dft/CMakeLists.txt
deleted file mode 100644
index 2c46cd38c..000000000
--- a/tests/unit_tests/dft/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(source)
diff --git a/tests/unit_tests/dft/include/compute_inplace.hpp b/tests/unit_tests/dft/include/compute_inplace.hpp
deleted file mode 100644
index 9cc161c34..000000000
--- a/tests/unit_tests/dft/include/compute_inplace.hpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_COMPUTE_INPLACE_HPP
-#define ONEMKL_COMPUTE_INPLACE_HPP
-
-#include "compute_tester.hpp"
-#include <oneapi/mkl/exceptions.hpp>
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_in_place_buffer() {
-    if (!init(MemoryAccessModel::buffer)) {
-        return test_skipped;
-    }
-
-    auto modified_strides_fwd = this->strides_fwd;
-    auto modified_strides_bwd = this->strides_bwd;
-    if (domain == oneapi::mkl::dft::domain::REAL) {
-        // both input and output strides must be set
-        auto default_conjuate_strides = get_conjugate_even_complex_strides(sizes);
-        std::ptrdiff_t rank = static_cast<std::ptrdiff_t>(sizes.size());
-
-        if (modified_strides_fwd.size() == 0) {
-            modified_strides_fwd = std::vector<std::int64_t>(
-                default_conjuate_strides.begin(), default_conjuate_strides.begin() + rank + 1);
-            std::transform(modified_strides_fwd.begin() + 1, modified_strides_fwd.begin() + rank,
-                           modified_strides_fwd.begin() + 1, [](std::int64_t& s) { return 2 * s; });
-        }
-        if (modified_strides_bwd.size() == 0) {
-            modified_strides_bwd = std::vector<std::int64_t>(
-                default_conjuate_strides.begin(), default_conjuate_strides.begin() + rank + 1);
-        }
-    }
-    else {
-        // General consistency requirements for in-place complex domain transforms require that strides are the same forward and backward.
-        modified_strides_fwd = modified_strides_bwd;
-    }
-
-    auto [forward_distance, backward_distance] =
-        get_default_distances<domain, true>(sizes, modified_strides_fwd, modified_strides_bwd);
-    auto ref_distance = std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies<>());
-
-    descriptor_t descriptor{ sizes };
-    descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                         oneapi::mkl::dft::config_value::INPLACE);
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE,
-                             oneapi::mkl::dft::config_value::COMPLEX_COMPLEX);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PACKED_FORMAT,
-                             oneapi::mkl::dft::config_value::CCE_FORMAT);
-    }
-    descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-    descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_distance);
-    descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, backward_distance);
-    if (modified_strides_fwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES,
-                             modified_strides_fwd.data());
-    }
-    if (modified_strides_bwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES,
-                             modified_strides_bwd.data());
-    }
-    commit_descriptor(descriptor, sycl_queue);
-
-    std::vector<FwdInputType> inout_host(
-        strided_copy(input, sizes, modified_strides_fwd, batches, forward_distance));
-    int real_multiplier = (domain == oneapi::mkl::dft::domain::REAL ? 2 : 1);
-    inout_host.resize(
-        cast_unsigned(std::max(forward_distance, real_multiplier * backward_distance) * batches +
-                      get_default(modified_strides_bwd, 0, 0L) * real_multiplier));
-
-    {
-        sycl::buffer<FwdInputType, 1> inout_buf{ inout_host };
-
-        oneapi::mkl::dft::compute_forward<descriptor_t, FwdInputType>(descriptor, inout_buf);
-
-        {
-            auto acc_host = inout_buf.get_host_access();
-            auto ptr_host = reinterpret_cast<FwdOutputType*>(acc_host.get_pointer());
-            for (std::int64_t i = 0; i < batches; i++) {
-                EXPECT_TRUE(check_equal_strided<domain == oneapi::mkl::dft::domain::REAL>(
-                    ptr_host + backward_distance * i, out_host_ref.data() + ref_distance * i, sizes,
-                    modified_strides_bwd, abs_error_margin, rel_error_margin, std::cout));
-            }
-        }
-
-        oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>,
-                                           FwdInputType>(descriptor, inout_buf);
-    }
-
-    std::vector<FwdInputType> fwd_data_ref = input;
-    // account for scaling that occurs during DFT
-    std::for_each(fwd_data_ref.begin(), fwd_data_ref.end(),
-                  [this](auto& x) { x *= static_cast<PrecisionType>(forward_elements); });
-
-    for (std::int64_t i = 0; i < batches; i++) {
-        EXPECT_TRUE(check_equal_strided<false>(
-            inout_host.data() + forward_distance * i, fwd_data_ref.data() + ref_distance * i, sizes,
-            modified_strides_fwd, abs_error_margin, rel_error_margin, std::cout));
-    }
-
-    return !::testing::Test::HasFailure();
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_in_place_USM() {
-    if (!init(MemoryAccessModel::usm)) {
-        return test_skipped;
-    }
-
-    auto modified_strides_fwd = this->strides_fwd;
-    auto modified_strides_bwd = this->strides_bwd;
-    if (domain == oneapi::mkl::dft::domain::REAL) {
-        // both input and output strides must be set
-        auto default_conjuate_strides = get_conjugate_even_complex_strides(sizes);
-        std::ptrdiff_t rank = static_cast<std::ptrdiff_t>(sizes.size());
-
-        if (modified_strides_fwd.size() == 0) {
-            modified_strides_fwd = std::vector<std::int64_t>(
-                default_conjuate_strides.begin(), default_conjuate_strides.begin() + rank + 1);
-            std::transform(modified_strides_fwd.begin() + 1, modified_strides_fwd.begin() + rank,
-                           modified_strides_fwd.begin() + 1, [](std::int64_t& s) { return 2 * s; });
-        }
-        if (modified_strides_bwd.size() == 0) {
-            modified_strides_bwd = std::vector<std::int64_t>(
-                default_conjuate_strides.begin(), default_conjuate_strides.begin() + rank + 1);
-        }
-    }
-    else {
-        // General consistency requirements for in-place complex domain transforms require that strides are the same forward and backward.
-        modified_strides_fwd = modified_strides_bwd;
-    }
-
-    auto [forward_distance, backward_distance] =
-        get_default_distances<domain, true>(sizes, modified_strides_fwd, modified_strides_bwd);
-    auto ref_distance = std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies<>());
-
-    descriptor_t descriptor = { sizes };
-    descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                         oneapi::mkl::dft::config_value::INPLACE);
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE,
-                             oneapi::mkl::dft::config_value::COMPLEX_COMPLEX);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PACKED_FORMAT,
-                             oneapi::mkl::dft::config_value::CCE_FORMAT);
-    }
-    descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-    descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_distance);
-    descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, backward_distance);
-    if (modified_strides_fwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES,
-                             modified_strides_fwd.data());
-    }
-    if (modified_strides_bwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES,
-                             modified_strides_bwd.data());
-    }
-    commit_descriptor(descriptor, sycl_queue);
-
-    auto ua_input = usm_allocator_t<FwdInputType>(cxt, *dev);
-    std::vector<FwdInputType, decltype(ua_input)> inout(
-        strided_copy(input, sizes, modified_strides_fwd, batches, forward_distance, ua_input),
-        ua_input);
-    int real_multiplier = (domain == oneapi::mkl::dft::domain::REAL ? 2 : 1);
-    inout.resize(
-        cast_unsigned(std::max(forward_distance, real_multiplier * backward_distance) * batches +
-                      real_multiplier * get_default(modified_strides_bwd, 0, 0L)));
-
-    std::vector<sycl::event> no_dependencies;
-    oneapi::mkl::dft::compute_forward<descriptor_t, FwdInputType>(descriptor, inout.data(),
-                                                                  no_dependencies)
-        .wait_and_throw();
-
-    for (std::int64_t i = 0; i < batches; i++) {
-        EXPECT_TRUE(check_equal_strided<domain == oneapi::mkl::dft::domain::REAL>(
-            reinterpret_cast<FwdOutputType*>(inout.data()) + backward_distance * i,
-            out_host_ref.data() + ref_distance * i, sizes, modified_strides_bwd, abs_error_margin,
-            rel_error_margin, std::cout));
-    }
-
-    sycl::event done =
-        oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>,
-                                           FwdInputType>(descriptor, inout.data(), no_dependencies);
-    done.wait_and_throw();
-
-    std::for_each(input.begin(), input.end(),
-                  [this](auto& x) { x *= static_cast<PrecisionType>(forward_elements); });
-
-    for (std::int64_t i = 0; i < batches; i++) {
-        EXPECT_TRUE(check_equal_strided<false>(
-            inout.data() + forward_distance * i, input.data() + ref_distance * i, sizes,
-            modified_strides_fwd, abs_error_margin, rel_error_margin, std::cout));
-    }
-
-    return !::testing::Test::HasFailure();
-}
-
-#endif //ONEMKL_COMPUTE_INPLACE_HPP
diff --git a/tests/unit_tests/dft/include/compute_inplace_real_real.hpp b/tests/unit_tests/dft/include/compute_inplace_real_real.hpp
deleted file mode 100644
index d4af1a44a..000000000
--- a/tests/unit_tests/dft/include/compute_inplace_real_real.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_COMPUTE_INPLACE_REAL_REAL_HPP
-#define ONEMKL_COMPUTE_INPLACE_REAL_REAL_HPP
-
-#include "compute_tester.hpp"
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_in_place_real_real_USM() {
-    if (!init(MemoryAccessModel::usm)) {
-        return test_skipped;
-    }
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        std::cout << "skipping real split tests as they are not supported" << std::endl;
-
-        return test_skipped;
-    }
-    else {
-        descriptor_t descriptor{ sizes };
-        PrecisionType backward_scale = 1.f / static_cast<PrecisionType>(forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                             oneapi::mkl::dft::config_value::INPLACE);
-        descriptor.set_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE,
-                             oneapi::mkl::dft::config_value::REAL_REAL);
-        descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BACKWARD_SCALE, backward_scale);
-
-        commit_descriptor(descriptor, sycl_queue);
-
-        auto ua_input = usm_allocator_t<PrecisionType>(cxt, *dev);
-
-        std::vector<PrecisionType, decltype(ua_input)> inout_re(size_total, ua_input);
-        std::vector<PrecisionType, decltype(ua_input)> inout_im(size_total, ua_input);
-        std::copy(input_re.begin(), input_re.end(), inout_re.begin());
-        std::copy(input_im.begin(), input_im.end(), inout_im.begin());
-
-        std::vector<sycl::event> no_dependencies;
-        oneapi::mkl::dft::compute_forward<descriptor_t, PrecisionType>(
-            descriptor, inout_re.data(), inout_im.data(), no_dependencies)
-            .wait_and_throw();
-
-        std::vector<FwdOutputType> output_data(size_total);
-        for (std::size_t i = 0; i < output_data.size(); ++i) {
-            output_data[i] = { inout_re[i], inout_im[i] };
-        }
-        EXPECT_TRUE(check_equal_vector(output_data.data(), out_host_ref.data(), output_data.size(),
-                                       abs_error_margin, rel_error_margin, std::cout));
-
-        oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>,
-                                           PrecisionType>(descriptor, inout_re.data(),
-                                                          inout_im.data(), no_dependencies)
-            .wait_and_throw();
-
-        for (std::size_t i = 0; i < output_data.size(); ++i) {
-            output_data[i] = { inout_re[i], inout_im[i] };
-        }
-
-        EXPECT_TRUE(check_equal_vector(output_data.data(), input.data(), input.size(),
-                                       abs_error_margin, rel_error_margin, std::cout));
-
-        return !::testing::Test::HasFailure();
-    }
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_in_place_real_real_buffer() {
-    if (!init(MemoryAccessModel::buffer)) {
-        return test_skipped;
-    }
-
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        std::cout << "skipping real split tests as they are not supported" << std::endl;
-
-        return test_skipped;
-    }
-    else {
-        descriptor_t descriptor{ sizes };
-
-        PrecisionType backward_scale = 1.f / static_cast<PrecisionType>(forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                             oneapi::mkl::dft::config_value::INPLACE);
-        descriptor.set_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE,
-                             oneapi::mkl::dft::config_value::REAL_REAL);
-        descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BACKWARD_SCALE, backward_scale);
-
-        commit_descriptor(descriptor, sycl_queue);
-
-        std::vector<PrecisionType> host_inout_re(size_total, static_cast<PrecisionType>(0));
-        std::vector<PrecisionType> host_inout_im(size_total, static_cast<PrecisionType>(0));
-        std::copy(input_re.begin(), input_re.end(), host_inout_re.begin());
-        std::copy(input_im.begin(), input_im.end(), host_inout_im.begin());
-
-        sycl::buffer<PrecisionType, 1> inout_re_buf{ host_inout_re.data(),
-                                                     sycl::range<1>(size_total) };
-        sycl::buffer<PrecisionType, 1> inout_im_buf{ host_inout_im.data(),
-                                                     sycl::range<1>(size_total) };
-
-        oneapi::mkl::dft::compute_forward<descriptor_t, PrecisionType>(descriptor, inout_re_buf,
-                                                                       inout_im_buf);
-
-        {
-            auto acc_inout_re = inout_re_buf.get_host_access();
-            auto acc_inout_im = inout_im_buf.get_host_access();
-            std::vector<FwdOutputType> output_data(size_total, static_cast<FwdOutputType>(0));
-            for (std::size_t i = 0; i < output_data.size(); ++i) {
-                output_data[i] = { acc_inout_re[i], acc_inout_im[i] };
-            }
-            EXPECT_TRUE(check_equal_vector(output_data.data(), out_host_ref.data(),
-                                           output_data.size(), abs_error_margin, rel_error_margin,
-                                           std::cout));
-        }
-
-        oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>,
-                                           PrecisionType>(descriptor, inout_re_buf, inout_im_buf);
-
-        {
-            auto acc_inout_re = inout_re_buf.get_host_access();
-            auto acc_inout_im = inout_im_buf.get_host_access();
-            std::vector<FwdInputType> output_data(size_total, static_cast<FwdInputType>(0));
-            for (std::size_t i = 0; i < output_data.size(); ++i) {
-                output_data[i] = { acc_inout_re[i], acc_inout_im[i] };
-            }
-            EXPECT_TRUE(check_equal_vector(output_data.data(), input.data(), input.size(),
-                                           abs_error_margin, rel_error_margin, std::cout));
-        }
-        return !::testing::Test::HasFailure();
-    }
-}
-
-#endif //ONEMKL_COMPUTE_INPLACE_REAL_REAL_HPP
diff --git a/tests/unit_tests/dft/include/compute_out_of_place.hpp b/tests/unit_tests/dft/include/compute_out_of_place.hpp
deleted file mode 100644
index df5e1e323..000000000
--- a/tests/unit_tests/dft/include/compute_out_of_place.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_COMPUTE_OUT_OF_PLACE_HPP
-#define ONEMKL_COMPUTE_OUT_OF_PLACE_HPP
-
-#include "compute_tester.hpp"
-#include <numeric>
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_out_of_place_buffer() {
-    if (!init(MemoryAccessModel::buffer)) {
-        return test_skipped;
-    }
-
-    auto [forward_distance, backward_distance] =
-        get_default_distances<domain>(sizes, strides_fwd, strides_bwd);
-    auto ref_distance = std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies<>());
-
-    descriptor_t descriptor{ sizes };
-    descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                         oneapi::mkl::dft::config_value::NOT_INPLACE);
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE,
-                             oneapi::mkl::dft::config_value::COMPLEX_COMPLEX);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PACKED_FORMAT,
-                             oneapi::mkl::dft::config_value::CCE_FORMAT);
-    }
-    descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-    descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_distance);
-    descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, backward_distance);
-    if (strides_fwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides_fwd.data());
-    }
-    if (strides_bwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides_bwd.data());
-    }
-    else if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        const auto complex_strides = get_conjugate_even_complex_strides(sizes);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, complex_strides.data());
-    }
-    commit_descriptor(descriptor, sycl_queue);
-    std::vector<FwdInputType> fwd_data(
-        strided_copy(input, sizes, strides_fwd, batches, forward_distance));
-
-    auto tmp = std::vector<FwdOutputType>(
-        cast_unsigned(backward_distance * batches + get_default(strides_bwd, 0, 0L)), 0);
-    {
-        sycl::buffer<FwdInputType, 1> fwd_buf{ fwd_data };
-        sycl::buffer<FwdOutputType, 1> bwd_buf{ tmp };
-
-        oneapi::mkl::dft::compute_forward<descriptor_t, FwdInputType, FwdOutputType>(
-            descriptor, fwd_buf, bwd_buf);
-
-        {
-            auto acc_bwd = bwd_buf.get_host_access();
-            auto bwd_ptr = acc_bwd.get_pointer();
-            for (std::int64_t i = 0; i < batches; i++) {
-                EXPECT_TRUE(check_equal_strided<domain == oneapi::mkl::dft::domain::REAL>(
-                    bwd_ptr + backward_distance * i, out_host_ref.data() + ref_distance * i, sizes,
-                    strides_bwd, abs_error_margin, rel_error_margin, std::cout));
-            }
-        }
-
-        oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>,
-                                           FwdOutputType, FwdInputType>(descriptor, bwd_buf,
-                                                                        fwd_buf);
-    }
-
-    // account for scaling that occurs during DFT
-    std::for_each(input.begin(), input.end(),
-                  [this](auto &x) { x *= static_cast<PrecisionType>(forward_elements); });
-
-    for (std::int64_t i = 0; i < batches; i++) {
-        EXPECT_TRUE(check_equal_strided<false>(fwd_data.data() + forward_distance * i,
-                                               input.data() + ref_distance * i, sizes, strides_fwd,
-                                               abs_error_margin, rel_error_margin, std::cout));
-    }
-
-    return !::testing::Test::HasFailure();
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_out_of_place_USM() {
-    if (!init(MemoryAccessModel::usm)) {
-        return test_skipped;
-    }
-    const std::vector<sycl::event> no_dependencies;
-
-    auto [forward_distance, backward_distance] =
-        get_default_distances<domain>(sizes, strides_fwd, strides_bwd);
-
-    descriptor_t descriptor{ sizes };
-    descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                         oneapi::mkl::dft::config_value::NOT_INPLACE);
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE,
-                             oneapi::mkl::dft::config_value::COMPLEX_COMPLEX);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PACKED_FORMAT,
-                             oneapi::mkl::dft::config_value::CCE_FORMAT);
-    }
-    descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-    descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_distance);
-    descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, backward_distance);
-    if (strides_fwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides_fwd.data());
-    }
-    if (strides_bwd.size()) {
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides_bwd.data());
-    }
-    else if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        const auto complex_strides = get_conjugate_even_complex_strides(sizes);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, complex_strides.data());
-    }
-    commit_descriptor(descriptor, sycl_queue);
-
-    auto ua_input = usm_allocator_t<FwdInputType>(cxt, *dev);
-    auto ua_output = usm_allocator_t<FwdOutputType>(cxt, *dev);
-
-    std::vector<FwdInputType, decltype(ua_input)> fwd(
-        strided_copy(input, sizes, strides_fwd, batches, forward_distance, ua_input), ua_input);
-    std::vector<FwdOutputType, decltype(ua_output)> bwd(
-        cast_unsigned(backward_distance * batches + get_default(strides_bwd, 0, 0L)), ua_output);
-
-    oneapi::mkl::dft::compute_forward<descriptor_t, FwdInputType, FwdOutputType>(
-        descriptor, fwd.data(), bwd.data(), no_dependencies)
-        .wait_and_throw();
-
-    auto bwd_ptr = &bwd[0];
-    auto ref_distance = std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies<>());
-    for (std::int64_t i = 0; i < batches; i++) {
-        EXPECT_TRUE(check_equal_strided<domain == oneapi::mkl::dft::domain::REAL>(
-            bwd_ptr + backward_distance * i, out_host_ref.data() + ref_distance * i, sizes,
-            strides_bwd, abs_error_margin, rel_error_margin, std::cout));
-    }
-
-    oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>, FwdOutputType,
-                                       FwdInputType>(descriptor, bwd.data(), fwd.data(),
-                                                     no_dependencies)
-        .wait_and_throw();
-
-    // account for scaling that occurs during DFT
-    std::for_each(input.begin(), input.end(),
-                  [this](auto &x) { x *= static_cast<PrecisionType>(forward_elements); });
-
-    for (std::int64_t i = 0; i < batches; i++) {
-        EXPECT_TRUE(check_equal_strided<false>(fwd.data() + forward_distance * i,
-                                               input.data() + ref_distance * i, sizes, strides_fwd,
-                                               abs_error_margin, rel_error_margin, std::cout));
-    }
-
-    return !::testing::Test::HasFailure();
-}
-
-#endif //ONEMKL_COMPUTE_OUT_OF_PLACE_HPP
diff --git a/tests/unit_tests/dft/include/compute_out_of_place_real_real.hpp b/tests/unit_tests/dft/include/compute_out_of_place_real_real.hpp
deleted file mode 100644
index fb3ecb4f2..000000000
--- a/tests/unit_tests/dft/include/compute_out_of_place_real_real.hpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_COMPUTE_OUT_OF_PLACE_REAL_REAL_HPP
-#define ONEMKL_COMPUTE_OUT_OF_PLACE_REAL_REAL_HPP
-
-#include "compute_tester.hpp"
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_out_of_place_real_real_USM() {
-    if (!init(MemoryAccessModel::usm)) {
-        return test_skipped;
-    }
-
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        std::cout << "skipping real split tests as they are not supported" << std::endl;
-
-        return test_skipped;
-    }
-    else {
-        descriptor_t descriptor{ sizes };
-
-        PrecisionType backward_scale = 1.f / static_cast<PrecisionType>(forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                             oneapi::mkl::dft::config_value::NOT_INPLACE);
-        descriptor.set_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE,
-                             oneapi::mkl::dft::config_value::REAL_REAL);
-        descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BACKWARD_SCALE, backward_scale);
-
-        commit_descriptor(descriptor, sycl_queue);
-
-        auto ua_input = usm_allocator_t<PrecisionType>(cxt, *dev);
-        auto ua_output = usm_allocator_t<PrecisionType>(cxt, *dev);
-
-        std::vector<PrecisionType, decltype(ua_input)> in_re(size_total, ua_input);
-        std::vector<PrecisionType, decltype(ua_input)> in_im(size_total, ua_input);
-        std::vector<PrecisionType, decltype(ua_output)> out_re(size_total, ua_output);
-        std::vector<PrecisionType, decltype(ua_output)> out_im(size_total, ua_output);
-        std::vector<PrecisionType, decltype(ua_input)> out_back_re(size_total, ua_input);
-        std::vector<PrecisionType, decltype(ua_input)> out_back_im(size_total, ua_input);
-
-        std::copy(input_re.begin(), input_re.end(), in_re.begin());
-        std::copy(input_im.begin(), input_im.end(), in_im.begin());
-
-        std::vector<sycl::event> no_dependencies;
-
-        oneapi::mkl::dft::compute_forward<descriptor_t, PrecisionType, PrecisionType>(
-            descriptor, in_re.data(), in_im.data(), out_re.data(), out_im.data(), no_dependencies)
-            .wait_and_throw();
-        std::vector<FwdOutputType> output_data(size_total);
-        for (std::size_t i = 0; i < output_data.size(); ++i) {
-            output_data[i] = { out_re[i], out_im[i] };
-        }
-        EXPECT_TRUE(check_equal_vector(output_data.data(), out_host_ref.data(), output_data.size(),
-                                       abs_error_margin, rel_error_margin, std::cout));
-
-        oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>,
-                                           PrecisionType, PrecisionType>(
-            descriptor, out_re.data(), out_im.data(), out_back_re.data(), out_back_im.data(),
-            no_dependencies)
-            .wait_and_throw();
-
-        for (std::size_t i = 0; i < output_data.size(); ++i) {
-            output_data[i] = { out_back_re[i], out_back_im[i] };
-        }
-
-        EXPECT_TRUE(check_equal_vector(output_data.data(), input.data(), input.size(),
-                                       abs_error_margin, rel_error_margin, std::cout));
-    }
-
-    return !::testing::Test::HasFailure();
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int DFT_Test<precision, domain>::test_out_of_place_real_real_buffer() {
-    if (!init(MemoryAccessModel::buffer)) {
-        return test_skipped;
-    }
-
-    if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-        std::cout << "skipping real split tests as they are not supported" << std::endl;
-
-        return test_skipped;
-    }
-    else {
-        descriptor_t descriptor{ sizes };
-
-        PrecisionType backward_scale = 1.f / static_cast<PrecisionType>(forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                             oneapi::mkl::dft::config_value::NOT_INPLACE);
-        descriptor.set_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE,
-                             oneapi::mkl::dft::config_value::REAL_REAL);
-        descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batches);
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, forward_elements);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BACKWARD_SCALE, backward_scale);
-
-        commit_descriptor(descriptor, sycl_queue);
-
-        sycl::buffer<PrecisionType, 1> in_dev_re{ input_re.data(), sycl::range<1>(size_total) };
-        sycl::buffer<PrecisionType, 1> in_dev_im{ input_im.data(), sycl::range<1>(size_total) };
-        sycl::buffer<PrecisionType, 1> out_dev_re{ sycl::range<1>(size_total) };
-        sycl::buffer<PrecisionType, 1> out_dev_im{ sycl::range<1>(size_total) };
-        sycl::buffer<PrecisionType, 1> out_back_dev_re{ sycl::range<1>(size_total) };
-        sycl::buffer<PrecisionType, 1> out_back_dev_im{ sycl::range<1>(size_total) };
-
-        oneapi::mkl::dft::compute_forward<descriptor_t, PrecisionType, PrecisionType>(
-            descriptor, in_dev_re, in_dev_im, out_dev_re, out_dev_im);
-
-        {
-            auto acc_out_re = out_dev_re.get_host_access();
-            auto acc_out_im = out_dev_im.get_host_access();
-            std::vector<FwdOutputType> output_data(size_total, static_cast<FwdOutputType>(0));
-            for (std::size_t i = 0; i < output_data.size(); ++i) {
-                output_data[i] = { acc_out_re[i], acc_out_im[i] };
-            }
-            EXPECT_TRUE(check_equal_vector(output_data.data(), out_host_ref.data(),
-                                           output_data.size(), abs_error_margin, rel_error_margin,
-                                           std::cout));
-        }
-
-        oneapi::mkl::dft::compute_backward<std::remove_reference_t<decltype(descriptor)>,
-                                           PrecisionType, PrecisionType>(
-            descriptor, out_dev_re, out_dev_im, out_back_dev_re, out_back_dev_im);
-
-        {
-            auto acc_back_out_re = out_back_dev_re.get_host_access();
-            auto acc_back_out_im = out_back_dev_im.get_host_access();
-            std::vector<FwdInputType> output_data(size_total, static_cast<FwdInputType>(0));
-            for (std::size_t i = 0; i < output_data.size(); ++i) {
-                output_data[i] = { acc_back_out_re[i], acc_back_out_im[i] };
-            }
-            EXPECT_TRUE(check_equal_vector(output_data.data(), input.data(), input.size(),
-                                           abs_error_margin, rel_error_margin, std::cout));
-        }
-    }
-
-    return !::testing::Test::HasFailure();
-}
-
-#endif //ONEMKL_COMPUTE_OUT_OF_PLACE_REAL_REAL_HPP
diff --git a/tests/unit_tests/dft/include/compute_tester.hpp b/tests/unit_tests/dft/include/compute_tester.hpp
deleted file mode 100644
index 17ffac0cb..000000000
--- a/tests/unit_tests/dft/include/compute_tester.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_COMPUTE_TESTER_HPP
-#define ONEMKL_COMPUTE_TESTER_HPP
-
-#include <algorithm>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-#include "test_helper.hpp"
-#include "test_common.hpp"
-#include "reference_dft.hpp"
-
-#include <numeric>
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-struct DFT_Test {
-    using descriptor_t = oneapi::mkl::dft::descriptor<precision, domain>;
-
-    template <typename ElemT>
-    using usm_allocator_t = sycl::usm_allocator<ElemT, sycl::usm::alloc::shared, 64>;
-
-    using PrecisionType =
-        typename std::conditional_t<precision == oneapi::mkl::dft::precision::SINGLE, float,
-                                    double>;
-
-    using FwdInputType = typename std::conditional_t<domain == oneapi::mkl::dft::domain::REAL,
-                                                     PrecisionType, std::complex<PrecisionType>>;
-    using FwdOutputType = std::complex<PrecisionType>;
-
-    enum class MemoryAccessModel { buffer, usm };
-
-    const std::vector<std::int64_t> sizes;
-    const std::vector<std::int64_t> strides_fwd;
-    const std::vector<std::int64_t> strides_bwd;
-    const std::int64_t batches;
-    const std::int64_t forward_elements;
-    const std::size_t size_total;
-    double abs_error_margin{ 0 };
-    double rel_error_margin{ 0 };
-
-    sycl::device* dev;
-    sycl::queue sycl_queue;
-    sycl::context cxt;
-
-    std::vector<FwdInputType> input;
-    std::vector<PrecisionType> input_re;
-    std::vector<PrecisionType> input_im;
-    std::vector<FwdOutputType> out_host_ref;
-
-    DFT_Test(sycl::device* dev, std::vector<std::int64_t> sizes_,
-             std::vector<std::int64_t> strides_fwd, std::vector<std::int64_t> strides_bwd,
-             std::int64_t batches_)
-            : sizes{ std::move(sizes_) },
-              strides_fwd(std::move(strides_fwd)),
-              strides_bwd(std::move(strides_bwd)),
-              batches{ batches_ },
-              forward_elements{ std::accumulate(sizes.begin(), sizes.end(), 1,
-                                                std::multiplies<>{}) },
-              size_total{ cast_unsigned(forward_elements * batches) },
-              dev{ dev },
-              sycl_queue{ *dev, exception_handler },
-              cxt{ sycl_queue.get_context() } {
-        input = std::vector<FwdInputType>(size_total);
-        input_re = std::vector<PrecisionType>(size_total);
-        input_im = std::vector<PrecisionType>(size_total);
-
-        // out_host_ref contains redundant information for domain::REAL
-        // tests. This simplifies the test implementation, but increases
-        // storage and computational requirements. There is scope for
-        // improvement here if test performance becomes an issue.
-        out_host_ref = std::vector<FwdOutputType>(size_total);
-
-        rand_vector(input, size_total);
-        if constexpr (domain == oneapi::mkl::dft::domain::REAL) {
-            for (std::size_t i = 0; i < input.size(); ++i) {
-                input_re[i] = { input[i] };
-                input_im[i] = 0;
-            }
-        }
-        else {
-            for (std::size_t i = 0; i < input.size(); ++i) {
-                input_re[i] = { input[i].real() };
-                input_im[i] = { input[i].imag() };
-            }
-        }
-    }
-
-    bool skip_test(MemoryAccessModel mem_acc) {
-        if constexpr (precision == oneapi::mkl::dft::precision::DOUBLE) {
-            if (!sycl_queue.get_device().has(sycl::aspect::fp64)) {
-                std::cout << "Device does not support double precision." << std::endl;
-                return true;
-            }
-        }
-
-        if (mem_acc == MemoryAccessModel::usm &&
-            !sycl_queue.get_device().has(sycl::aspect::usm_shared_allocations)) {
-            std::cout << "Device does not support usm shared allocations." << std::endl;
-            return true;
-        }
-
-        return false;
-    }
-
-    bool init(MemoryAccessModel mem_acc) {
-        for (int i = 0; i < batches; i += 1) {
-            reference_forward_dft<FwdInputType, FwdOutputType>(
-                sizes, input.data() + i * forward_elements,
-                out_host_ref.data() + i * forward_elements);
-        }
-        auto max_norm_ref = *std::max_element(std::begin(out_host_ref), std::end(out_host_ref),
-                                              [](const FwdOutputType& a, const FwdOutputType& b) {
-                                                  return std::abs(a) < std::abs(b);
-                                              });
-        // Heuristic for the average-case error margins
-        abs_error_margin =
-            10 * std::abs(max_norm_ref) * std::log2(static_cast<double>(forward_elements));
-        rel_error_margin = 200.0 * std::log2(static_cast<double>(forward_elements));
-        return !skip_test(mem_acc);
-    }
-
-    int test_in_place_buffer();
-    int test_in_place_real_real_buffer();
-    int test_out_of_place_buffer();
-    int test_out_of_place_real_real_buffer();
-    int test_in_place_USM();
-    int test_in_place_real_real_USM();
-    int test_out_of_place_USM();
-    int test_out_of_place_real_real_USM();
-};
-
-#endif //ONEMKL_COMPUTE_TESTER_HPP
diff --git a/tests/unit_tests/dft/include/parseval_check.hpp b/tests/unit_tests/dft/include/parseval_check.hpp
deleted file mode 100644
index ece6f7d31..000000000
--- a/tests/unit_tests/dft/include/parseval_check.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_PARSEVAL_CHECK_HPP
-#define ONEMKL_PARSEVAL_CHECK_HPP
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <vector>
-#include <numeric>
-
-#include "test_common.hpp"
-
-/** Use Parseval's theorem to verify the output of DFT. This does not guarantee that the output
- * of the DFT is correct, and is only a sanity check.
- * 
- * Check Sum(|in[i]|^2) == Sum(|out[i]|^2).
- * 
- * @tparam TypeFwd Forward domain type
- * @tparam TypeBwd Backward domain type
- * @param dft_len DFT size
- * @param in forward domain data
- * @param out bwd domain data
- * @param rescale_forward A value to multiply the in data by.
-*/
-template <typename TypeFwd, typename TypeBwd>
-bool parseval_check(std::size_t dft_len, const TypeFwd* in, TypeBwd* out,
-                    TypeFwd rescale_forward = 1) {
-    static_assert(is_complex<TypeBwd>());
-    bool complex_forward = is_complex<TypeFwd>();
-    auto bwd_len = complex_forward ? dft_len : dft_len / 2 + 1;
-
-    float in_sum{ 0 };
-    float out_sum{ 0 };
-    for (std::size_t i{ 0 }; i < dft_len; ++i) {
-        in_sum += static_cast<float>(std::abs(in[i] * rescale_forward) *
-                                     std::abs(in[i] * rescale_forward));
-    }
-    if (complex_forward) {
-        for (std::size_t i{ 0 }; i < bwd_len; ++i) {
-            out_sum += static_cast<float>(std::abs(out[i]) * std::abs(out[i]));
-        }
-    }
-    else {
-        for (std::size_t i{ 0 }; i < bwd_len - 1; ++i) {
-            out_sum += static_cast<float>(std::abs(out[i]) * std::abs(out[i]));
-        }
-        out_sum *= 2;
-        out_sum += static_cast<float>(std::abs(out[bwd_len - 1]) * std::abs(out[bwd_len - 1]));
-    }
-    out_sum /= static_cast<float>(dft_len);
-    auto max_norm_ref = *std::max_element(
-        in, in + dft_len, [](const auto& a, const auto& b) { return std::abs(a) < std::abs(b); });
-    // Heuristic for the average-case error margins
-    auto abs_error_margin = 10 * std::abs(max_norm_ref) * std::log2(static_cast<float>(dft_len));
-    if (std::abs(in_sum - out_sum) > abs_error_margin) {
-        std::cout << "Failed check with Parseval's theorem: Fwd sum = " << in_sum
-                  << ", Bwd sum = " << out_sum << " (tol = " << abs_error_margin << ")"
-                  << std::endl;
-        return false;
-    }
-    return true;
-}
-#endif // ONEMKL_PARSEVAL_CHECK_HPP
diff --git a/tests/unit_tests/dft/include/reference_dft.hpp b/tests/unit_tests/dft/include/reference_dft.hpp
deleted file mode 100644
index 236edc7b0..000000000
--- a/tests/unit_tests/dft/include/reference_dft.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_REFERENCE_DFT_HPP
-#define ONEMKL_REFERENCE_DFT_HPP
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <vector>
-#include <numeric>
-
-#include <oneapi/mkl/exceptions.hpp>
-#include "test_common.hpp"
-
-namespace detail {
-using ref_t = long double; /* Do the calculations using long double */
-template <typename TypeIn, typename TypeOut>
-void reference_forward_dft_impl(const TypeIn *in, TypeOut *out, std::size_t N, std::size_t stride) {
-    static_assert(is_complex<TypeOut>(), "Output type of DFT must be complex");
-
-    constexpr ref_t TWOPI = 2.0L * 3.141592653589793238462643383279502884197L;
-
-    for (std::size_t k = 0; k < N; ++k) {
-        std::complex<ref_t> out_temp = 0;
-        const auto partial_expo = (static_cast<ref_t>(k) * TWOPI) / static_cast<ref_t>(N);
-        for (std::size_t n = 0; n < N; ++n) {
-            const auto expo = static_cast<ref_t>(n) * partial_expo;
-            out_temp += static_cast<std::complex<ref_t>>(in[n * stride]) *
-                        std::complex<ref_t>{ std::cos(expo), -std::sin(expo) };
-        }
-        out[k * stride] = static_cast<TypeOut>(out_temp);
-    }
-}
-
-template <typename TypeIn, typename TypeOut, int dims>
-struct reference {};
-
-template <typename TypeIn, typename TypeOut>
-struct reference<TypeIn, TypeOut, 1> {
-    static void forward_dft(const std::vector<std::size_t> &sizes, const TypeIn *in, TypeOut *out) {
-        reference_forward_dft_impl(in, out, sizes[0], 1);
-    }
-};
-
-template <typename TypeIn, typename TypeOut>
-struct reference<TypeIn, TypeOut, 2> {
-    static void forward_dft(const std::vector<std::size_t> &sizes, const TypeIn *in, TypeOut *out) {
-        const auto elements = std::accumulate(sizes.begin(), sizes.end(), 1U, std::multiplies<>{});
-        std::vector<std::complex<ref_t>> tmp(elements);
-        for (std::size_t i = 0; i < elements; i += sizes[1]) {
-            reference_forward_dft_impl(in + i, tmp.data() + i, sizes[1], 1);
-        }
-        for (std::size_t i = 0; i < sizes[1]; i++) {
-            reference_forward_dft_impl(tmp.data() + i, out + i, sizes[0], sizes[1]);
-        }
-    }
-};
-
-template <typename TypeIn, typename TypeOut>
-struct reference<TypeIn, TypeOut, 3> {
-    static void forward_dft(const std::vector<std::size_t> &sizes, const TypeIn *in, TypeOut *out) {
-        const auto elements = std::accumulate(sizes.begin(), sizes.end(), 1U, std::multiplies<>{});
-        std::vector<std::complex<ref_t>> tmp1(elements);
-        std::vector<std::complex<ref_t>> tmp2(elements);
-        for (std::size_t i = 0; i < elements; i += sizes[2]) {
-            reference_forward_dft_impl(in + i, tmp1.data() + i, sizes[2], 1);
-        }
-        for (std::size_t j = 0; j < elements; j += sizes[1] * sizes[2]) {
-            for (std::size_t i = 0; i < sizes[2]; i++) {
-                reference_forward_dft_impl(tmp1.data() + i + j, tmp2.data() + i + j, sizes[1],
-                                           sizes[2]);
-            }
-        }
-        for (std::size_t i = 0; i < sizes[1] * sizes[2]; i++) {
-            reference_forward_dft_impl(tmp2.data() + i, out + i, sizes[0], sizes[1] * sizes[2]);
-        }
-    }
-};
-} // namespace detail
-
-/** Naive DFT implementation for reference.
- *  
- * Directly compute a single 1D forward DFT of the form:
- * for k in range(0, N):
- *   out[k] = sum( exp(2 pi k n im / N) * in[n] for n in range(0, N) )
- * where N is the size of the input / output arrays. The input may be
- * real or complex, but the output must be complex.
- *  
- * @tparam TypeIn The forward data type. Must be complex or real.
- * @tparam TypeOut The transformed (backward) data type. Written to. Must be 
- * complex.
- * @param in The input forward data.
- * @param out Where to write the output data.
- * @param N The number of elements in the input data set.
- * @param stride the stride between elements in the data set, measured in elements.
-**/
-template <typename TypeIn, typename TypeOut>
-void reference_forward_dft(const std::vector<std::int64_t> &sizes, const TypeIn *in, TypeOut *out) {
-    std::vector<std::size_t> unsigned_sizes(sizes.size());
-    std::transform(sizes.begin(), sizes.end(), unsigned_sizes.begin(),
-                   [](std::int64_t size) { return cast_unsigned(size); });
-    switch (unsigned_sizes.size()) {
-        case 1: detail::reference<TypeIn, TypeOut, 1>::forward_dft(unsigned_sizes, in, out); break;
-        case 2: detail::reference<TypeIn, TypeOut, 2>::forward_dft(unsigned_sizes, in, out); break;
-        case 3: detail::reference<TypeIn, TypeOut, 3>::forward_dft(unsigned_sizes, in, out); break;
-        default:
-            throw oneapi::mkl::unimplemented(
-                "reference_dft", "forward_dft",
-                "dft with size " + std::to_string(unsigned_sizes.size()));
-    }
-}
-
-#endif //ONEMKL_REFERENCE_DFT_HPP
diff --git a/tests/unit_tests/dft/include/test_common.hpp b/tests/unit_tests/dft/include/test_common.hpp
deleted file mode 100644
index b13723105..000000000
--- a/tests/unit_tests/dft/include/test_common.hpp
+++ /dev/null
@@ -1,391 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#ifndef ONEMKL_TEST_COMMON_HPP
-#define ONEMKL_TEST_COMMON_HPP
-
-#include <cstdint>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <stdexcept>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-template <typename T>
-struct complex_info {
-    using real_type = T;
-    static const bool is_complex = false;
-};
-
-template <typename T>
-struct complex_info<std::complex<T>> {
-    using real_type = T;
-    static const bool is_complex = true;
-};
-
-template <typename T>
-constexpr bool is_complex() {
-    return complex_info<T>::is_complex;
-}
-
-inline std::size_t cast_unsigned(std::int64_t i) {
-    if (i < 0) {
-        throw std::runtime_error("Unexpected negative value");
-    }
-    return static_cast<std::size_t>(i);
-}
-
-template <typename fp>
-bool check_equal(fp x, fp x_ref, double abs_error_mag, double rel_error_mag, std::ostream &out) {
-    using fp_real = typename complex_info<fp>::real_type;
-    static_assert(std::is_floating_point_v<fp_real>,
-                  "Expected floating-point real or complex type.");
-
-    const fp_real epsilon = []() {
-        if constexpr (sizeof(double) == sizeof(long double) && std::is_same_v<fp_real, double>) {
-            // The reference DFT uses long double to maintain accuracy
-            // when this isn't possible, lower the accuracy requirements
-            return 1e-12;
-        }
-        else {
-            return std::numeric_limits<fp_real>::epsilon();
-        }
-    }();
-    const auto abs_bound = static_cast<fp_real>(abs_error_mag) * epsilon;
-    const auto rel_bound = static_cast<fp_real>(rel_error_mag) * epsilon;
-
-    const auto aerr = std::abs(x - x_ref);
-    const auto rerr = aerr / std::abs(x_ref);
-    const bool ok = (rerr <= rel_bound) || (aerr <= abs_bound);
-    if (!ok) {
-        out << "Mismatching results: actual = " << x << " vs. reference = " << x_ref << "\n";
-        out << " relative error = " << rerr << " absolute error = " << aerr
-            << " relative bound = " << rel_bound << " absolute bound = " << abs_bound << "\n";
-    }
-    return ok;
-}
-
-template <typename vec1, typename vec2>
-bool check_equal_vector(vec1 &&v, vec2 &&v_ref, std::size_t n, double abs_error_mag,
-                        double rel_error_mag, std::ostream &out) {
-    constexpr int max_print = 20;
-    int count = 0;
-    bool good = true;
-
-    for (std::size_t i = 0; i < n; ++i) {
-        // Allow to convert the unsigned index `i` to a signed one to keep this function generic and allow for `v` and `v_ref` to be a vector, a pointer or a random access iterator.
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wsign-conversion"
-        auto res = v[i];
-        auto ref = v_ref[i];
-#pragma clang diagnostic pop
-        if (!check_equal(res, ref, abs_error_mag, rel_error_mag, out)) {
-            out << " at index i =" << i << "\n";
-            good = false;
-            ++count;
-            if (count > max_print) {
-                return good;
-            }
-        }
-    }
-
-    return good;
-}
-
-// Random initialization.
-template <typename t>
-inline t rand_scalar() {
-    if constexpr (std::is_same_v<t, int32_t>) {
-        return std::rand() % 256 - 128;
-    }
-    else if constexpr (std::is_floating_point_v<t>) {
-        return t(std::rand()) / t(RAND_MAX) - t(0.5);
-    }
-    else {
-        static_assert(complex_info<t>::is_complex, "unexpect type in rand_scalar");
-        using fp = typename complex_info<t>::real_type;
-        return t(rand_scalar<fp>(), rand_scalar<fp>());
-    }
-}
-
-template <typename vec>
-void rand_vector(vec &v, std::size_t n) {
-    using fp = typename vec::value_type;
-    v.resize(n);
-    for (std::size_t i = 0; i < n; i++) {
-        v[i] = rand_scalar<fp>();
-    }
-}
-
-// Catch asynchronous exceptions.
-auto exception_handler = [](sycl::exception_list exceptions) {
-    for (std::exception_ptr const &e : exceptions) {
-        try {
-            std::rethrow_exception(e);
-        }
-        catch (sycl::exception e) {
-            std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << "\n";
-            print_error_code(e);
-        }
-    }
-};
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-void commit_descriptor(oneapi::mkl::dft::descriptor<precision, domain> &descriptor,
-                       sycl::queue queue) {
-#ifdef CALL_RT_API
-    descriptor.commit(queue);
-#else
-    TEST_RUN_CT_SELECT_NO_ARGS(queue, descriptor.commit);
-#endif
-}
-
-// is it assumed that the unused elements of the array are ignored
-inline std::array<std::int64_t, 4> get_conjugate_even_complex_strides(
-    const std::vector<std::int64_t> &sizes) {
-    switch (sizes.size()) {
-        case 1: return { 0, 1 };
-        case 2: return { 0, sizes[1] / 2 + 1, 1 };
-        case 3: return { 0, sizes[1] * (sizes[2] / 2 + 1), (sizes[2] / 2 + 1), 1 };
-        default:
-            throw oneapi::mkl::unimplemented(
-                "dft/test_common", __FUNCTION__,
-                "not implemented for " + std::to_string(sizes.size()) + " dimensions");
-            return {};
-    }
-}
-
-// is it assumed that the unused elements of the array are ignored
-inline std::array<std::int64_t, 4> get_default_strides(const std::vector<std::int64_t> &sizes) {
-    if (sizes.size() > 3) {
-        throw oneapi::mkl::unimplemented(
-            "dft/test_common", __FUNCTION__,
-            "not implemented for " + std::to_string(sizes.size()) + " dimensions");
-    }
-
-    switch (sizes.size()) {
-        case 1: return { 0, 1 };
-        case 2: return { 0, sizes[1], 1 };
-        case 3: return { 0, sizes[1] * sizes[2], sizes[2], 1 };
-        default:
-            throw oneapi::mkl::unimplemented(
-                "dft/test_common", __FUNCTION__,
-                "not implemented for " + std::to_string(sizes.size()) + " dimensions");
-            return {};
-    }
-}
-
-template <typename T>
-T get_default(const std::vector<T> vec, std::size_t idx, T default_) {
-    if (idx >= vec.size()) {
-        return default_;
-    }
-    return vec[idx];
-}
-
-template <oneapi::mkl::dft::domain domain, bool in_place = false>
-std::pair<std::int64_t, std::int64_t> get_default_distances(
-    const std::vector<std::int64_t> &sizes, const std::vector<std::int64_t> &strides_fwd,
-    const std::vector<std::int64_t> &strides_bwd) {
-    std::int64_t size0 = sizes[0];
-    std::int64_t size1 = get_default(sizes, 1, 1l);
-    std::int64_t size2 = get_default(sizes, 2, 1l);
-    std::int64_t size0_real =
-        domain == oneapi::mkl::dft::domain::REAL && sizes.size() == 1 ? size0 / 2 + 1 : size0;
-    std::int64_t size1_real =
-        domain == oneapi::mkl::dft::domain::REAL && sizes.size() == 2 ? size1 / 2 + 1 : size1;
-    std::int64_t size2_real =
-        domain == oneapi::mkl::dft::domain::REAL && sizes.size() == 3 ? size2 / 2 + 1 : size2;
-    std::int64_t backward_distance = size0_real * size1_real * size2_real;
-    std::int64_t forward_distance = size0 * size1 * size2;
-    if (strides_fwd.size() > 1) {
-        forward_distance =
-            std::max({ size0 * strides_fwd[1], size1 * get_default(strides_fwd, 2, 0l),
-                       size2 * get_default(strides_fwd, 3, 0l) });
-    }
-    if (strides_bwd.size() > 1) {
-        backward_distance =
-            std::max({ size0 * strides_bwd[1], size1 * get_default(strides_bwd, 2, 0l),
-                       size2 * get_default(strides_bwd, 3, 0l) });
-    }
-    if (in_place) {
-        forward_distance =
-            std::max(forward_distance,
-                     backward_distance * (domain == oneapi::mkl::dft::domain::REAL ? 2L : 1L));
-    }
-    return { forward_distance, backward_distance };
-}
-
-//up to 3 dimensions, empty strides = default
-template <typename T_vec, typename Allocator = std::allocator<typename T_vec::value_type>>
-std::vector<typename T_vec::value_type, Allocator> strided_copy(
-    const T_vec &contiguous, const std::vector<std::int64_t> &sizes,
-    const std::vector<std::int64_t> &strides, std::int64_t batches, std::int64_t distance,
-    Allocator alloc = {}) {
-    if (strides.size() == 0) {
-        return { contiguous.begin(), contiguous.end(), alloc };
-    }
-    using T = typename T_vec::value_type;
-    std::int64_t size0 = sizes[0];
-    std::int64_t size1 = get_default(sizes, 1, 1l);
-    std::int64_t size2 = get_default(sizes, 2, 1l);
-
-    std::int64_t stride0 = strides[0];
-    std::int64_t stride1 = strides[1];
-    std::int64_t stride2 = get_default(strides, 2, 0l);
-    std::int64_t stride3 = get_default(strides, 3, 0l);
-    std::vector<T, Allocator> res(cast_unsigned(distance * batches + stride0), alloc);
-    for (std::int64_t b = 0; b < batches; b++) {
-        for (std::int64_t i = 0; i < size0; i++) {
-            for (std::int64_t j = 0; j < size1; j++) {
-                for (std::int64_t k = 0; k < size2; k++) {
-                    res[cast_unsigned(b * distance + i * stride1 + j * stride2 + k * stride3 +
-                                      stride0)] =
-                        contiguous[cast_unsigned(((b * size0 + i) * size1 + j) * size2 + k)];
-                }
-            }
-        }
-    }
-    return res;
-}
-
-//up to 3 dimensions, empty strides = default
-template <bool ConjugateEvenStrides, typename vec1, typename vec2>
-bool check_equal_strided(const vec1 &v, const vec2 &v_ref, std::vector<int64_t> sizes,
-                         std::vector<int64_t> strides, double abs_error_mag, double rel_error_mag,
-                         std::ostream &out) {
-    if (strides.size() == 0) {
-        std::array<std::int64_t, 4> strides_arr;
-        if constexpr (ConjugateEvenStrides) {
-            strides_arr = get_conjugate_even_complex_strides(sizes);
-        }
-        else {
-            strides_arr = get_default_strides(sizes);
-        }
-        strides = { &strides_arr[0], &strides_arr[sizes.size() + 1] };
-    }
-    using T = std::decay_t<decltype(v[0])>;
-    std::int64_t size0 = sizes[0];
-    std::int64_t size1 = get_default(sizes, 1, 1l);
-    std::int64_t size2 = get_default(sizes, 2, 1l);
-    std::int64_t size0_real = ConjugateEvenStrides && sizes.size() == 1 ? size0 / 2 + 1 : size0;
-    std::int64_t size1_real = ConjugateEvenStrides && sizes.size() == 2 ? size1 / 2 + 1 : size1;
-    std::int64_t size2_real = ConjugateEvenStrides && sizes.size() == 3 ? size2 / 2 + 1 : size2;
-
-    std::int64_t stride0 = strides[0];
-    std::int64_t stride1 = strides[1];
-    std::int64_t stride2 = get_default(strides, 2, 0l);
-    std::int64_t stride3 = get_default(strides, 3, 0l);
-
-    constexpr int max_print = 20;
-    int count = 0;
-    bool good = true;
-
-    for (std::int64_t i = 0; i < size0_real; i++) {
-        for (std::int64_t j = 0; j < size1_real; j++) {
-            for (std::int64_t k = 0; k < size2_real; k++) {
-                T res = v[cast_unsigned(i * stride1 + j * stride2 + k * stride3 + stride0)];
-                T ref = v_ref[cast_unsigned((i * size1 + j) * size2 + k)];
-                if (!check_equal(res, ref, abs_error_mag, rel_error_mag, out)) {
-                    out << " at position " << i << ", " << j << ", " << k << "\n";
-                    out << " at indices " << i * stride1 + j * stride2 + k * stride3 + stride0
-                        << ", " << (i * size1 + j) * size2 + k << "\n";
-                    good = false;
-                    ++count;
-                    if (count > max_print) {
-                        return good;
-                    }
-                }
-            }
-        }
-    }
-    return good;
-}
-
-struct DFTParams {
-    std::vector<std::int64_t> sizes;
-    std::vector<std::int64_t> strides_fwd;
-    std::vector<std::int64_t> strides_bwd;
-    std::int64_t batches;
-    DFTParams(std::vector<std::int64_t> sizes, std::int64_t batches)
-            : sizes(sizes),
-              strides_fwd({}),
-              strides_bwd({}),
-              batches(batches) {}
-    DFTParams(std::vector<std::int64_t> sizes, std::vector<std::int64_t> strides_fwd,
-              std::vector<std::int64_t> strides_bwd, std::int64_t batches)
-            : sizes(sizes),
-              strides_fwd(strides_fwd),
-              strides_bwd(strides_bwd),
-              batches(batches) {}
-};
-
-class DFTParamsPrint {
-public:
-    std::string operator()(
-        testing::TestParamInfo<std::tuple<sycl::device *, DFTParams>> dev) const {
-        auto [device, params] = dev.param;
-        std::string info_name;
-
-        assert(params.sizes.size() > 0);
-        info_name.append("sizes_");
-
-        // intersperse dimensions with "x"
-        std::for_each(params.sizes.begin(), params.sizes.end() - 1,
-                      [&info_name](auto s) { info_name.append(std::to_string(s)).append("x"); });
-        info_name.append(std::to_string(params.sizes.back()));
-
-        if (params.strides_fwd.size() != 0) {
-            info_name.append("_fwd_strides_");
-            // intersperse strides with "_"
-            std::for_each(
-                params.strides_fwd.begin(), params.strides_fwd.end() - 1,
-                [&info_name](auto s) { info_name.append(std::to_string(s)).append("_"); });
-            info_name.append(std::to_string(params.strides_fwd.back()));
-        }
-        if (params.strides_bwd.size() != 0) {
-            info_name.append("_bwd_strides_");
-            // intersperse strides with "_"
-            std::for_each(
-                params.strides_bwd.begin(), params.strides_bwd.end() - 1,
-                [&info_name](auto s) { info_name.append(std::to_string(s)).append("_"); });
-            info_name.append(std::to_string(params.strides_bwd.back()));
-        }
-
-        info_name.append("_batches_").append(std::to_string(params.batches));
-
-        std::string dev_name = device->get_info<sycl::info::device::name>();
-        std::for_each(dev_name.begin(), dev_name.end(), [](auto &c) {
-            if (!isalnum(c))
-                c = '_';
-        });
-
-        info_name.append("_").append(dev_name);
-
-        return info_name;
-    }
-};
-
-#endif //ONEMKL_TEST_COMMON_HPP
diff --git a/tests/unit_tests/dft/source/CMakeLists.txt b/tests/unit_tests/dft/source/CMakeLists.txt
deleted file mode 100644
index 364ad564f..000000000
--- a/tests/unit_tests/dft/source/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(DFT_SOURCES "compute_tests.cpp" "descriptor_tests.cpp" "workspace_external_tests.cpp")
-
-include(WarningsUtils)
-
-if (BUILD_SHARED_LIBS)
-    add_library(dft_source_rt OBJECT ${DFT_SOURCES})
-    target_compile_options(dft_source_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-    target_include_directories(dft_source_rt
-            PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-            PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-            PUBLIC ${PROJECT_SOURCE_DIR}/include
-            PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-            PUBLIC ${CMAKE_BINARY_DIR}/bin
-            )
-    if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-        add_sycl_to_target(TARGET dft_source_rt SOURCES ${DFT_SOURCES})
-    else ()
-        target_link_libraries(dft_source_rt PUBLIC ONEMKL::SYCL::SYCL)
-    endif ()
-    target_link_libraries(dft_source_rt PRIVATE onemkl_warnings)
-endif ()
-
-add_library(dft_source_ct OBJECT ${DFT_SOURCES})
-target_compile_options(dft_source_ct PRIVATE -DNOMINMAX)
-target_include_directories(dft_source_ct
-        PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-        PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-        PUBLIC ${PROJECT_SOURCE_DIR}/include
-        PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-        PUBLIC ${CMAKE_BINARY_DIR}/bin
-        )
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET dft_source_ct SOURCES ${DFT_SOURCES})
-else ()
-    target_link_libraries(dft_source_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif ()
-target_link_libraries(dft_source_ct PRIVATE onemkl_warnings)
-
diff --git a/tests/unit_tests/dft/source/compute_tests.cpp b/tests/unit_tests/dft/source/compute_tests.cpp
deleted file mode 100644
index 005f833ef..000000000
--- a/tests/unit_tests/dft/source/compute_tests.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "test_helper.hpp"
-#include "test_common.hpp"
-#include <gtest/gtest.h>
-
-#include "compute_inplace.hpp"
-#include "compute_inplace_real_real.hpp"
-#include "compute_out_of_place.hpp"
-#include "compute_out_of_place_real_real.hpp"
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-class ComputeTests_in_place_COMPLEX
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-class ComputeTests_real_real_in_place_COMPLEX
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-class ComputeTests_out_of_place_COMPLEX
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-class ComputeTests_real_real_out_of_place_COMPLEX
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-
-class ComputeTests_in_place_REAL
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-class ComputeTests_real_real_in_place_REAL
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-class ComputeTests_out_of_place_REAL
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-class ComputeTests_real_real_out_of_place_REAL
-        : public ::testing::TestWithParam<std::tuple<sycl::device *, DFTParams>> {};
-
-#define INSTANTIATE_TEST(PRECISION, DOMAIN, PLACE, LAYOUT, STORAGE)                       \
-    TEST_P(ComputeTests##_##LAYOUT##PLACE##_##DOMAIN,                                     \
-           DOMAIN##_##PRECISION##_##PLACE##_##LAYOUT##STORAGE) {                          \
-        try {                                                                             \
-            auto test = DFT_Test<oneapi::mkl::dft::precision::PRECISION,                  \
-                                 oneapi::mkl::dft::domain::DOMAIN>{                       \
-                std::get<0>(GetParam()), std::get<1>(GetParam()).sizes,                   \
-                std::get<1>(GetParam()).strides_fwd, std::get<1>(GetParam()).strides_bwd, \
-                std::get<1>(GetParam()).batches                                           \
-            };                                                                            \
-            EXPECT_TRUEORSKIP(test.test_##PLACE##_##LAYOUT##STORAGE());                   \
-        }                                                                                 \
-        catch (oneapi::mkl::unimplemented & e) {                                          \
-            std::cout << "Skipping test because: \"" << e.what() << "\"" << std::endl;    \
-            GTEST_SKIP();                                                                 \
-        }                                                                                 \
-        catch (std::exception & e) {                                                      \
-            std::string msg = e.what();                                                   \
-            if (msg.find("FFT_UNIMPLEMENTED") != std::string::npos) {                     \
-                std::cout << "Skipping test because: \"" << msg << "\"" << std::endl;     \
-                GTEST_SKIP();                                                             \
-            }                                                                             \
-            throw;                                                                        \
-        }                                                                                 \
-    }
-
-#define INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN(PLACE, LAYOUT, STORAGE) \
-    INSTANTIATE_TEST(SINGLE, COMPLEX, PLACE, LAYOUT, STORAGE)                \
-    INSTANTIATE_TEST(SINGLE, REAL, PLACE, LAYOUT, STORAGE)                   \
-    INSTANTIATE_TEST(DOUBLE, COMPLEX, PLACE, LAYOUT, STORAGE)                \
-    INSTANTIATE_TEST(DOUBLE, REAL, PLACE, LAYOUT, STORAGE)
-
-#define INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN_PLACE_LAYOUT(STORAGE)      \
-    INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN(in_place, , STORAGE)           \
-    INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN(in_place, real_real_, STORAGE) \
-    INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN(out_of_place, , STORAGE)       \
-    INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN(out_of_place, real_real_, STORAGE)
-
-INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN_PLACE_LAYOUT(buffer)
-INSTANTIATE_TEST_DIMENSIONS_PRECISION_DOMAIN_PLACE_LAYOUT(USM)
-
-using shape = std::vector<std::int64_t>;
-using i64 = std::int64_t;
-// Parameter format - { shape of transform, number of transforms } or { shape, forward strides, backward strides, number of transforms }
-// strides need to be chosen in a way that also makes sense for real transforms
-std::vector<DFTParams> test_params{
-    { shape{ 8 }, i64{ 1 } },
-    { shape{ 9 }, i64{ 2 } },
-    { shape{ 8 }, i64{ 27 } },
-    { shape{ 22 }, i64{ 1 } },
-    { shape{ 128 }, i64{ 1 } },
-
-    { shape{ 4, 4 }, i64{ 1 } },
-    { shape{ 4, 4 }, i64{ 2 } },
-    { shape{ 4, 3 }, i64{ 9 } },
-    { shape{ 7, 8 }, i64{ 1 } },
-    { shape{ 64, 5 }, i64{ 1 } },
-
-    { shape{ 2, 2, 2 }, i64{ 1 } },
-    { shape{ 2, 2, 3 }, i64{ 2 } },
-    { shape{ 2, 2, 2 }, i64{ 27 } },
-    { shape{ 3, 7, 2 }, i64{ 1 } },
-    { shape{ 8, 8, 9 }, i64{ 1 } },
-
-    { shape{ 4, 3 }, shape{ 2, 3, 1 }, shape{ 2, 3, 1 }, i64{ 2 } },
-    { shape{ 4, 3 }, shape{ 0, 4, 1 }, shape{ 0, 3, 1 }, i64{ 3 } },
-    { shape{ 4, 3 }, shape{ 4, 6, 2 }, shape{ 2, 6, 2 }, i64{ 2 } },
-    { shape{ 4, 3 }, shape{ 1, 1, 4 }, shape{ 1, 1, 4 }, i64{ 9 } },
-    { shape{ 4, 4 }, shape{ 2, 4, 1 }, shape{ 0, 4, 1 }, i64{ 2 } },
-    { shape{ 4, 4 }, shape{ 0, 1, 5 }, shape{ 0, 1, 4 }, i64{ 2 } },
-    { shape{ 4, 4 }, shape{ 0, 1, 4 }, shape{ 0, 2, 9 }, i64{ 2 } },
-    { shape{ 4, 4 }, shape{ 0, 7, 1 }, shape{ 0, 5, 1 }, i64{ 2 } },
-    { shape{ 4, 4 }, shape{ 0, 8, 2 }, shape{ 0, 8, 2 }, i64{ 2 } },
-    { shape{ 4, 4 }, shape{ 0, 4, 1 }, shape{ 0, 1, 4 }, i64{ 2 } },
-
-    { shape{ 4, 4, 4 }, shape{ 2, 1, 4, 16 }, shape{ 4, 1, 4, 16 }, i64{ 2 } },
-    { shape{ 4, 4, 4 }, shape{ 4, 17, 4, 1 }, shape{ 4, 23, 5, 1 }, i64{ 2 } },
-    { shape{ 4, 4, 4 }, shape{ 0, 32, 8, 2 }, shape{ 0, 32, 8, 2 }, i64{ 2 } },
-    { shape{ 4, 4, 4 }, shape{ 2, 4, 1, 16 }, shape{ 1, 4, 16, 1 }, i64{ 2 } },
-    { shape{ 4, 4, 4 }, shape{ 0, 1, 32, 8 }, shape{ 0, 1, 32, 8 }, i64{ 2 } },
-};
-std::vector<DFTParams> test_params_real_in_place{
-    { shape{ 8 }, i64{ 1 } },
-    { shape{ 9 }, i64{ 2 } },
-    { shape{ 8 }, i64{ 27 } },
-    { shape{ 22 }, i64{ 1 } },
-    { shape{ 128 }, i64{ 1 } },
-
-    { shape{ 4, 4 }, i64{ 1 } },
-    { shape{ 4, 4 }, i64{ 2 } },
-    { shape{ 4, 3 }, i64{ 9 } },
-    { shape{ 7, 8 }, i64{ 1 } },
-    { shape{ 64, 5 }, i64{ 1 } },
-
-    { shape{ 2, 2, 2 }, i64{ 1 } },
-    { shape{ 2, 2, 3 }, i64{ 2 } },
-    { shape{ 2, 2, 2 }, i64{ 27 } },
-    { shape{ 3, 7, 2 }, i64{ 1 } },
-    { shape{ 8, 8, 9 }, i64{ 1 } },
-
-    { shape{ 4, 3 }, shape{ 0, 4, 1 }, shape{ 0, 2, 1 }, i64{ 2 } },
-    { shape{ 4, 3 }, shape{ 0, 6, 1 }, shape{ 0, 3, 1 }, i64{ 2 } },
-    { shape{ 4, 3 }, shape{ 0, 8, 2 }, shape{ 0, 4, 2 }, i64{ 2 } },
-    { shape{ 4, 3 }, shape{ 2, 4, 1 }, shape{ 1, 2, 1 }, i64{ 2 } },
-    { shape{ 4, 3 }, shape{ 6, 1, 4 }, shape{ 3, 1, 4 }, i64{ 9 } },
-    { shape{ 4, 3 }, shape{ 0, 1, 5 }, shape{ 0, 1, 5 }, i64{ 2 } },
-    { shape{ 4, 3 }, shape{ 0, 3, 12 }, shape{ 0, 3, 12 }, i64{ 9 } },
-
-    { shape{ 4, 4, 4 }, shape{ 4, 1, 4, 16 }, shape{ 2, 1, 4, 16 }, i64{ 2 } },
-    { shape{ 4, 4, 4 }, shape{ 0, 48, 12, 2 }, shape{ 0, 24, 6, 2 }, i64{ 2 } },
-    { shape{ 4, 4, 4 }, shape{ 0, 1, 48, 8 }, shape{ 0, 1, 24, 8 }, i64{ 2 } },
-};
-
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_in_place_COMPLEX,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params)),
-                         DFTParamsPrint{});
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_real_real_in_place_COMPLEX,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params)),
-                         DFTParamsPrint{});
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_out_of_place_COMPLEX,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params)),
-                         DFTParamsPrint{});
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_real_real_out_of_place_COMPLEX,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params)),
-                         DFTParamsPrint{});
-
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_in_place_REAL,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params_real_in_place)),
-                         DFTParamsPrint{});
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_real_real_in_place_REAL,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params_real_in_place)),
-                         DFTParamsPrint{});
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_out_of_place_REAL,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params)),
-                         DFTParamsPrint{});
-INSTANTIATE_TEST_SUITE_P(ComputeTestSuite, ComputeTests_real_real_out_of_place_REAL,
-                         testing::Combine(testing::ValuesIn(devices),
-                                          testing::ValuesIn(test_params)),
-                         DFTParamsPrint{});
-
-} // anonymous namespace
diff --git a/tests/unit_tests/dft/source/descriptor_tests.cpp b/tests/unit_tests/dft/source/descriptor_tests.cpp
deleted file mode 100644
index a420eb1e2..000000000
--- a/tests/unit_tests/dft/source/descriptor_tests.cpp
+++ /dev/null
@@ -1,782 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include <iostream>
-#include <vector>
-#include <variant>
-#include <thread>
-#include <chrono>
-#include <condition_variable>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "test_helper.hpp"
-#include "test_common.hpp"
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-constexpr std::int64_t default_1d_lengths = 4;
-const std::vector<std::int64_t> default_3d_lengths{ 124, 5, 3 };
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static void set_and_get_lengths() {
-    /* Negative Testing */
-    {
-        oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_3d_lengths };
-        EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::LENGTHS, nullptr),
-                     oneapi::mkl::invalid_argument);
-    }
-
-    /* 1D */
-    {
-        const std::int64_t dimensions = 1;
-        oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-
-        const std::int64_t new_lengths{ 2345 };
-        std::int64_t lengths_value{ 0 };
-        std::int64_t dimensions_before_set{ 0 };
-        std::int64_t dimensions_after_set{ 0 };
-
-        descriptor.get_value(oneapi::mkl::dft::config_param::LENGTHS, &lengths_value);
-        descriptor.get_value(oneapi::mkl::dft::config_param::DIMENSION, &dimensions_before_set);
-        EXPECT_EQ(default_1d_lengths, lengths_value);
-        EXPECT_EQ(dimensions, dimensions_before_set);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::LENGTHS, new_lengths);
-        descriptor.get_value(oneapi::mkl::dft::config_param::LENGTHS, &lengths_value);
-        descriptor.get_value(oneapi::mkl::dft::config_param::DIMENSION, &dimensions_after_set);
-        EXPECT_EQ(new_lengths, lengths_value);
-        EXPECT_EQ(dimensions, dimensions_after_set);
-    }
-
-    /* >= 2D */
-    {
-        const std::int64_t dimensions = 3;
-
-        oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_3d_lengths };
-
-        std::vector<std::int64_t> lengths_value(3);
-        std::vector<std::int64_t> new_lengths{ 1, 2, 7 };
-        std::int64_t dimensions_before_set{ 0 };
-        std::int64_t dimensions_after_set{ 0 };
-
-        descriptor.get_value(oneapi::mkl::dft::config_param::LENGTHS, lengths_value.data());
-        descriptor.get_value(oneapi::mkl::dft::config_param::DIMENSION, &dimensions_before_set);
-
-        EXPECT_EQ(default_3d_lengths, lengths_value);
-        EXPECT_EQ(dimensions, dimensions_before_set);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::LENGTHS, new_lengths.data());
-        descriptor.get_value(oneapi::mkl::dft::config_param::LENGTHS, lengths_value.data());
-        descriptor.get_value(oneapi::mkl::dft::config_param::DIMENSION, &dimensions_after_set);
-
-        EXPECT_EQ(new_lengths, lengths_value);
-        EXPECT_EQ(dimensions, dimensions_after_set);
-    }
-}
-
-// Test for deprecated functionality
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static void set_and_get_io_strides() {
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_3d_lengths };
-
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, nullptr),
-                 oneapi::mkl::invalid_argument);
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, nullptr),
-                 oneapi::mkl::invalid_argument);
-
-    constexpr std::int64_t strides_size = 4;
-    const std::int64_t default_stride_d1 = default_3d_lengths[2] * default_3d_lengths[1];
-    const std::int64_t default_stride_d2 = default_3d_lengths[2];
-    const std::int64_t default_stride_d3 = 1;
-
-    std::vector<std::int64_t> default_strides_value{ 0, default_stride_d1, default_stride_d2,
-                                                     default_stride_d3 };
-
-    std::vector<std::int64_t> input_strides_value;
-    std::vector<std::int64_t> output_strides_value;
-    if constexpr (domain == oneapi::mkl::dft::domain::COMPLEX) {
-        input_strides_value = { 50, default_stride_d1 * 2, default_stride_d2 * 2,
-                                default_stride_d3 * 2 };
-        output_strides_value = { 50, default_stride_d1 * 2, default_stride_d2 * 2,
-                                 default_stride_d3 * 2 };
-    }
-    else {
-        input_strides_value = { 0, default_3d_lengths[1] * (default_3d_lengths[2] / 2 + 1) * 2,
-                                (default_3d_lengths[2] / 2 + 1) * 2, 1 };
-        output_strides_value = { 0, default_3d_lengths[1] * (default_3d_lengths[2] / 2 + 1),
-                                 (default_3d_lengths[2] / 2 + 1), 1 };
-    }
-
-    std::vector<std::int64_t> input_strides_before_set(strides_size);
-    std::vector<std::int64_t> input_strides_after_set(strides_size);
-    std::vector<std::int64_t> fwd_strides_after_set(strides_size, -1);
-    std::vector<std::int64_t> bwd_strides_after_set(strides_size, -1);
-
-    descriptor.get_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
-                         input_strides_before_set.data());
-    EXPECT_EQ(default_strides_value, input_strides_before_set);
-    descriptor.set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, input_strides_value.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
-                         input_strides_after_set.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::FWD_STRIDES, fwd_strides_after_set.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::BWD_STRIDES, bwd_strides_after_set.data());
-    EXPECT_EQ(input_strides_value, input_strides_after_set);
-    EXPECT_EQ(std::vector<std::int64_t>(strides_size, 0), fwd_strides_after_set);
-    EXPECT_EQ(std::vector<std::int64_t>(strides_size, 0), bwd_strides_after_set);
-
-    std::vector<std::int64_t> output_strides_before_set(strides_size);
-    std::vector<std::int64_t> output_strides_after_set(strides_size);
-    descriptor.get_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
-                         output_strides_before_set.data());
-    EXPECT_EQ(default_strides_value, output_strides_before_set);
-    descriptor.set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
-                         output_strides_value.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
-                         output_strides_after_set.data());
-    EXPECT_EQ(output_strides_value, output_strides_after_set);
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static void set_and_get_fwd_bwd_strides() {
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_3d_lengths };
-
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, nullptr),
-                 oneapi::mkl::invalid_argument);
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, nullptr),
-                 oneapi::mkl::invalid_argument);
-
-    constexpr std::int64_t strides_size = 4;
-    const std::int64_t default_stride_d1 = default_3d_lengths[2] * default_3d_lengths[1];
-    const std::int64_t default_stride_d2 = default_3d_lengths[2];
-    const std::int64_t default_stride_d3 = 1;
-
-    std::vector<std::int64_t> default_strides_value{ 0, default_stride_d1, default_stride_d2,
-                                                     default_stride_d3 };
-
-    std::vector<std::int64_t> fwd_strides_value;
-    std::vector<std::int64_t> bwd_strides_value;
-    if constexpr (domain == oneapi::mkl::dft::domain::COMPLEX) {
-        fwd_strides_value = { 50, default_stride_d1 * 2, default_stride_d2 * 2,
-                              default_stride_d3 * 2 };
-        bwd_strides_value = { 50, default_stride_d1 * 2, default_stride_d2 * 2,
-                              default_stride_d3 * 2 };
-    }
-    else {
-        fwd_strides_value = { 0, default_3d_lengths[1] * (default_3d_lengths[2] / 2 + 1) * 2,
-                              (default_3d_lengths[2] / 2 + 1) * 2, 1 };
-        bwd_strides_value = { 0, default_3d_lengths[1] * (default_3d_lengths[2] / 2 + 1),
-                              (default_3d_lengths[2] / 2 + 1), 1 };
-    }
-
-    std::vector<std::int64_t> fwd_strides_before_set(strides_size);
-    std::vector<std::int64_t> fwd_strides_after_set(strides_size);
-    std::vector<std::int64_t> input_strides_after_set(strides_size, -1);
-    std::vector<std::int64_t> output_strides_after_set(strides_size, -1);
-
-    descriptor.get_value(oneapi::mkl::dft::config_param::FWD_STRIDES,
-                         fwd_strides_before_set.data());
-    EXPECT_EQ(default_strides_value, fwd_strides_before_set);
-    descriptor.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, fwd_strides_value.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::FWD_STRIDES, fwd_strides_after_set.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
-                         input_strides_after_set.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
-                         output_strides_after_set.data());
-    EXPECT_EQ(fwd_strides_value, fwd_strides_after_set);
-    EXPECT_EQ(std::vector<std::int64_t>(strides_size, 0), input_strides_after_set);
-    EXPECT_EQ(std::vector<std::int64_t>(strides_size, 0), output_strides_after_set);
-
-    std::vector<std::int64_t> bwd_strides_before_set(strides_size);
-    std::vector<std::int64_t> bwd_strides_after_set(strides_size);
-    descriptor.get_value(oneapi::mkl::dft::config_param::BWD_STRIDES,
-                         bwd_strides_before_set.data());
-    EXPECT_EQ(default_strides_value, bwd_strides_before_set);
-    descriptor.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, bwd_strides_value.data());
-    descriptor.get_value(oneapi::mkl::dft::config_param::BWD_STRIDES, bwd_strides_after_set.data());
-    EXPECT_EQ(bwd_strides_value, bwd_strides_after_set);
-}
-#pragma clang diagnostic pop
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static void set_and_get_values() {
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-
-    using Precision_Type =
-        typename std::conditional_t<precision == oneapi::mkl::dft::precision::SINGLE, float,
-                                    double>;
-
-    {
-        auto forward_scale_set_value = Precision_Type(143.5);
-        Precision_Type forward_scale_before_set;
-        Precision_Type forward_scale_after_set;
-
-        descriptor.get_value(oneapi::mkl::dft::config_param::FORWARD_SCALE,
-                             &forward_scale_before_set);
-        EXPECT_EQ(1.0, forward_scale_before_set);
-        descriptor.set_value(oneapi::mkl::dft::config_param::FORWARD_SCALE,
-                             forward_scale_set_value);
-        descriptor.get_value(oneapi::mkl::dft::config_param::FORWARD_SCALE,
-                             &forward_scale_after_set);
-        EXPECT_EQ(forward_scale_set_value, forward_scale_after_set);
-    }
-
-    {
-        auto backward_scale_set_value = Precision_Type(143.5);
-        Precision_Type backward_scale_before_set;
-        Precision_Type backward_scale_after_set;
-
-        descriptor.get_value(oneapi::mkl::dft::config_param::BACKWARD_SCALE,
-                             &backward_scale_before_set);
-        EXPECT_EQ(1.0, backward_scale_before_set);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BACKWARD_SCALE,
-                             backward_scale_set_value);
-        descriptor.get_value(oneapi::mkl::dft::config_param::BACKWARD_SCALE,
-                             &backward_scale_after_set);
-        EXPECT_EQ(backward_scale_set_value, backward_scale_after_set);
-    }
-
-    {
-        std::int64_t n_transforms_set_value{ 12 };
-        std::int64_t n_transforms_before_set;
-        std::int64_t n_transforms_after_set;
-
-        descriptor.get_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                             &n_transforms_before_set);
-        EXPECT_EQ(1, n_transforms_before_set);
-        descriptor.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                             n_transforms_set_value);
-        descriptor.get_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                             &n_transforms_after_set);
-        EXPECT_EQ(n_transforms_set_value, n_transforms_after_set);
-    }
-
-    {
-        std::int64_t fwd_distance_set_value{ 12 };
-        std::int64_t fwd_distance_before_set;
-        std::int64_t fwd_distance_after_set;
-
-        descriptor.get_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
-                             &fwd_distance_before_set);
-        EXPECT_EQ(1, fwd_distance_before_set);
-        descriptor.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, fwd_distance_set_value);
-        descriptor.get_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, &fwd_distance_after_set);
-        EXPECT_EQ(fwd_distance_set_value, fwd_distance_after_set);
-
-        std::int64_t bwd_distance_set_value{ domain == oneapi::mkl::dft::domain::REAL
-                                                 ? (fwd_distance_set_value / 2) + 1
-                                                 : fwd_distance_set_value };
-        std::int64_t bwd_distance_before_set;
-        std::int64_t bwd_distance_after_set;
-
-        descriptor.get_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
-                             &bwd_distance_before_set);
-        EXPECT_EQ(1, bwd_distance_before_set);
-        descriptor.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, bwd_distance_set_value);
-        descriptor.get_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, &bwd_distance_after_set);
-        EXPECT_EQ(bwd_distance_set_value, bwd_distance_after_set);
-    }
-
-    {
-        oneapi::mkl::dft::config_value value{
-            oneapi::mkl::dft::config_value::COMMITTED
-        }; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::PLACEMENT, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::INPLACE, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                             oneapi::mkl::dft::config_value::NOT_INPLACE);
-        descriptor.get_value(oneapi::mkl::dft::config_param::PLACEMENT, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::NOT_INPLACE, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                             oneapi::mkl::dft::config_value::INPLACE);
-        descriptor.get_value(oneapi::mkl::dft::config_param::PLACEMENT, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::INPLACE, value);
-    }
-
-    {
-        oneapi::mkl::dft::config_value value{
-            oneapi::mkl::dft::config_value::COMMITTED
-        }; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::COMPLEX_COMPLEX, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE,
-                             oneapi::mkl::dft::config_value::REAL_REAL);
-        descriptor.get_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::REAL_REAL, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE,
-                             oneapi::mkl::dft::config_value::COMPLEX_COMPLEX);
-        descriptor.get_value(oneapi::mkl::dft::config_param::COMPLEX_STORAGE, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::COMPLEX_COMPLEX, value);
-    }
-
-    {
-        oneapi::mkl::dft::config_value value{
-            oneapi::mkl::dft::config_value::COMMITTED
-        }; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::COMPLEX_COMPLEX, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE,
-                             oneapi::mkl::dft::config_value::COMPLEX_COMPLEX);
-
-        value = oneapi::mkl::dft::config_value::COMMITTED; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::CONJUGATE_EVEN_STORAGE, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::COMPLEX_COMPLEX, value);
-    }
-
-    {
-        oneapi::mkl::dft::config_value value{
-            oneapi::mkl::dft::config_value::COMMITTED
-        }; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::REAL_STORAGE, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::REAL_REAL, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::REAL_STORAGE,
-                             oneapi::mkl::dft::config_value::REAL_REAL);
-
-        value = oneapi::mkl::dft::config_value::COMMITTED; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::REAL_STORAGE, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::REAL_REAL, value);
-    }
-
-    {
-        oneapi::mkl::dft::config_value value{
-            oneapi::mkl::dft::config_value::COMMITTED
-        }; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::ORDERING, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::ORDERED, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::ORDERING,
-                             oneapi::mkl::dft::config_value::BACKWARD_SCRAMBLED);
-        descriptor.get_value(oneapi::mkl::dft::config_param::ORDERING, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::BACKWARD_SCRAMBLED, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::ORDERING,
-                             oneapi::mkl::dft::config_value::ORDERED);
-        descriptor.get_value(oneapi::mkl::dft::config_param::ORDERING, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::ORDERED, value);
-    }
-
-    {
-        bool value = true;
-        descriptor.get_value(oneapi::mkl::dft::config_param::TRANSPOSE, &value);
-        EXPECT_EQ(false, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::TRANSPOSE, true);
-        descriptor.get_value(oneapi::mkl::dft::config_param::TRANSPOSE, &value);
-        EXPECT_EQ(true, value);
-        /* Set value to false again because transpose is not implemented and will fail on commit
-         * when using the MKLGPU backend */
-        descriptor.set_value(oneapi::mkl::dft::config_param::TRANSPOSE, false);
-    }
-
-    {
-        /* Only value currently supported for PACKED_FORMAT is the config_value::CCE_FORMAT */
-        oneapi::mkl::dft::config_value value{
-            oneapi::mkl::dft::config_value::COMMITTED
-        }; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::PACKED_FORMAT, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::CCE_FORMAT, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::PACKED_FORMAT,
-                             oneapi::mkl::dft::config_value::CCE_FORMAT);
-
-        value = oneapi::mkl::dft::config_value::COMMITTED; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::PACKED_FORMAT, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::CCE_FORMAT, value);
-    }
-
-    {
-        oneapi::mkl::dft::config_value value{
-            oneapi::mkl::dft::config_value::COMMITTED
-        }; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::WORKSPACE_PLACEMENT, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::WORKSPACE_AUTOMATIC, value);
-
-        descriptor.set_value(oneapi::mkl::dft::config_param::WORKSPACE_PLACEMENT,
-                             oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
-
-        value = oneapi::mkl::dft::config_value::COMMITTED; // Initialize with invalid value
-        descriptor.get_value(oneapi::mkl::dft::config_param::WORKSPACE_PLACEMENT, &value);
-        EXPECT_EQ(oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL, value);
-        descriptor.set_value(oneapi::mkl::dft::config_param::WORKSPACE_PLACEMENT,
-                             oneapi::mkl::dft::config_value::WORKSPACE_AUTOMATIC);
-    }
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static void get_readonly_values() {
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-
-    oneapi::mkl::dft::domain domain_value;
-    descriptor.get_value(oneapi::mkl::dft::config_param::FORWARD_DOMAIN, &domain_value);
-    EXPECT_EQ(domain_value, domain);
-
-    oneapi::mkl::dft::precision precision_value;
-    descriptor.get_value(oneapi::mkl::dft::config_param::PRECISION, &precision_value);
-    EXPECT_EQ(precision_value, precision);
-
-    std::int64_t dimension_value;
-    descriptor.get_value(oneapi::mkl::dft::config_param::DIMENSION, &dimension_value);
-    EXPECT_EQ(dimension_value, 1);
-
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor3D{ default_3d_lengths };
-    descriptor3D.get_value(oneapi::mkl::dft::config_param::DIMENSION, &dimension_value);
-    EXPECT_EQ(dimension_value, 3);
-
-    oneapi::mkl::dft::config_value commit_status;
-    descriptor.get_value(oneapi::mkl::dft::config_param::COMMIT_STATUS, &commit_status);
-    EXPECT_EQ(commit_status, oneapi::mkl::dft::config_value::UNCOMMITTED);
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static void set_readonly_values() {
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::FORWARD_DOMAIN,
-                                      oneapi::mkl::dft::domain::REAL),
-                 oneapi::mkl::invalid_argument);
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::FORWARD_DOMAIN,
-                                      oneapi::mkl::dft::domain::COMPLEX),
-                 oneapi::mkl::invalid_argument);
-
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::PRECISION,
-                                      oneapi::mkl::dft::precision::SINGLE),
-                 oneapi::mkl::invalid_argument);
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::PRECISION,
-                                      oneapi::mkl::dft::precision::DOUBLE),
-                 oneapi::mkl::invalid_argument);
-
-    std::int64_t set_dimension{ 3 };
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::DIMENSION, set_dimension),
-                 oneapi::mkl::invalid_argument);
-
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::COMMIT_STATUS,
-                                      oneapi::mkl::dft::config_value::COMMITTED),
-                 oneapi::mkl::invalid_argument);
-    EXPECT_THROW(descriptor.set_value(oneapi::mkl::dft::config_param::COMMIT_STATUS,
-                                      oneapi::mkl::dft::config_value::UNCOMMITTED),
-                 oneapi::mkl::invalid_argument);
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static void get_commited(sycl::queue& sycl_queue) {
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-    commit_descriptor(descriptor, sycl_queue);
-
-    oneapi::mkl::dft::config_value commit_status;
-    descriptor.get_value(oneapi::mkl::dft::config_param::COMMIT_STATUS, &commit_status);
-    EXPECT_EQ(commit_status, oneapi::mkl::dft::config_value::COMMITTED);
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-inline void recommit_values(sycl::queue& sycl_queue) {
-    using oneapi::mkl::dft::config_param;
-    using oneapi::mkl::dft::config_value;
-    using PrecisionType =
-        typename std::conditional_t<precision == oneapi::mkl::dft::precision::SINGLE, float,
-                                    double>;
-    using value = std::variant<config_value, std::int64_t, std::int64_t*, bool, PrecisionType>;
-
-    // this will hold a param to change and the value to change it to
-    using test_params = std::vector<std::pair<config_param, value>>;
-
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-    EXPECT_NO_THROW(commit_descriptor(descriptor, sycl_queue));
-
-    std::array<std::int64_t, 2> strides{ 0, 1 };
-
-    std::vector<test_params> argument_groups{
-        // not changeable
-        // FORWARD_DOMAIN, PRECISION, DIMENSION, COMMIT_STATUS
-        { std::make_pair(config_param::COMPLEX_STORAGE, config_value::COMPLEX_COMPLEX),
-          std::make_pair(config_param::REAL_STORAGE, config_value::REAL_REAL),
-          std::make_pair(config_param::CONJUGATE_EVEN_STORAGE, config_value::COMPLEX_COMPLEX) },
-        { std::make_pair(config_param::PLACEMENT, config_value::NOT_INPLACE),
-          std::make_pair(config_param::NUMBER_OF_TRANSFORMS, std::int64_t{ 5 }),
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-          std::make_pair(config_param::INPUT_STRIDES, strides.data()),
-          std::make_pair(config_param::OUTPUT_STRIDES, strides.data()),
-#pragma clang diagnostic pop
-          std::make_pair(config_param::FWD_DISTANCE, std::int64_t{ 60 }),
-          std::make_pair(config_param::BWD_DISTANCE, std::int64_t{ 70 }) },
-        { std::make_pair(config_param::WORKSPACE, config_value::ALLOW),
-          std::make_pair(config_param::ORDERING, config_value::ORDERED),
-          std::make_pair(config_param::TRANSPOSE, bool{ false }),
-          std::make_pair(config_param::PACKED_FORMAT, config_value::CCE_FORMAT) },
-        { std::make_pair(config_param::LENGTHS, std::int64_t{ 10 }),
-          std::make_pair(config_param::FORWARD_SCALE, PrecisionType(1.2)),
-          std::make_pair(config_param::BACKWARD_SCALE, PrecisionType(3.4)) }
-    };
-
-    for (std::size_t i = 0; i < argument_groups.size(); i += 1) {
-        for (auto argument : argument_groups[i]) {
-            std::visit([&descriptor, p = argument.first](auto&& a) { descriptor.set_value(p, a); },
-                       argument.second);
-        }
-        try {
-            commit_descriptor(descriptor, sycl_queue);
-        }
-        catch (oneapi::mkl::unimplemented e) {
-            std::cout << "unimplemented exception at index " << i << " with error : " << e.what()
-                      << "\ncontinuing...\n";
-        }
-        catch (oneapi::mkl::exception& e) {
-            FAIL() << "exception at index " << i << " with error : " << e.what();
-        }
-    }
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-inline void change_queue_causes_wait(sycl::queue& busy_queue) {
-    // create a queue with work on it, and then show that work is waited on when the descriptor
-    // is committed to a new queue.
-    // its possible to have a false positive result, but a false negative should not be possible.
-    // sleeps have been added to reduce the false positives to show that we are actually waiting for
-    // notification/queue.
-    using namespace std::chrono_literals;
-    std::condition_variable cv;
-    std::mutex cv_m;
-    // signal used to avoid spurious wakeups
-    bool signal = false;
-
-    sycl::queue free_queue(busy_queue.get_device(), exception_handler);
-
-    // commit the descriptor on the "busy" queue
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-    EXPECT_NO_THROW(commit_descriptor(descriptor, busy_queue));
-
-    // add some work to the busy queue
-    auto e = busy_queue.submit([&](sycl::handler& cgh) {
-        cgh.host_task([&] {
-            std::unique_lock<std::mutex> lock(cv_m);
-            ASSERT_TRUE(cv.wait_for(lock, 5s, [&] { return signal; })); // returns false on timeout
-            std::this_thread::sleep_for(100ms);
-        });
-    });
-    std::this_thread::sleep_for(500ms);
-
-    // busy queue is still waiting on that conditional_variable
-    auto before_status = e.template get_info<sycl::info::event::command_execution_status>();
-    ASSERT_NE(before_status, sycl::info::event_command_status::complete);
-
-    // notify the conditional variable
-    {
-        std::lock_guard<std::mutex> lock(cv_m);
-        signal = true;
-    }
-    cv.notify_all();
-
-    // commit the descriptor to the "free" queue
-    EXPECT_NO_THROW(commit_descriptor(descriptor, free_queue));
-
-    // busy queue task has now completed.
-    auto after_status = e.template get_info<sycl::info::event::command_execution_status>();
-    ASSERT_EQ(after_status, sycl::info::event_command_status::complete);
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-inline void swap_out_dead_queue(sycl::queue& sycl_queue) {
-    // test that commit still works when the previously committed queue is no longer in scope
-    // the queue is not actually dead (due to reference counting)
-
-    // commit the descriptor on the "busy" queue
-    oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-    {
-        sycl::queue transient_queue(sycl_queue.get_device(), exception_handler);
-        EXPECT_NO_THROW(commit_descriptor(descriptor, transient_queue));
-    }
-    EXPECT_NO_THROW(commit_descriptor(descriptor, sycl_queue));
-
-    using ftype = typename std::conditional_t<precision == oneapi::mkl::dft::precision::SINGLE,
-                                              float, double>;
-    using forward_type = typename std::conditional_t<domain == oneapi::mkl::dft::domain::REAL,
-                                                     ftype, std::complex<ftype>>;
-
-    // add two so that real-complex transforms have space for all the conjugate even components
-    auto inout = sycl::malloc_device<forward_type>(default_1d_lengths + 2, sycl_queue);
-    sycl_queue.wait();
-
-    auto transform_event = oneapi::mkl::dft::compute_forward<decltype(descriptor), forward_type>(
-        descriptor, inout, std::vector<sycl::event>{});
-    sycl_queue.wait();
-
-    // after waiting on the second queue, the event should be completed
-    auto status = transform_event.template get_info<sycl::info::event::command_execution_status>();
-    ASSERT_EQ(status, sycl::info::event_command_status::complete);
-    sycl::free(inout, sycl_queue);
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static int test_move() {
-    using config_param = oneapi::mkl::dft::config_param;
-    // Use forward distance to test an element copied by value (ie. not on heap)
-    std::int64_t fwdDistanceRef(123);
-    // Use the DFT dimensions to test heap allocated values.
-    {
-        // Move constructor
-        oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-        descriptor.set_value(config_param::FWD_DISTANCE, fwdDistanceRef);
-        oneapi::mkl::dft::descriptor<precision, domain> descMoved{ std::move(descriptor) };
-        std::int64_t fwdDistance(0), dftLength(0);
-        descMoved.get_value(config_param::FWD_DISTANCE, &fwdDistance);
-        EXPECT_EQ(fwdDistance, fwdDistanceRef);
-        descMoved.get_value(config_param::LENGTHS, &dftLength);
-        EXPECT_EQ(default_1d_lengths, dftLength);
-    }
-    {
-        // Move assignment
-        oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-        descriptor.set_value(config_param::FWD_DISTANCE, fwdDistanceRef);
-        oneapi::mkl::dft::descriptor<precision, domain> descMoved{ default_1d_lengths };
-        descMoved = std::move(descriptor);
-        std::int64_t fwdDistance(0), dftLength(0);
-        descMoved.get_value(config_param::FWD_DISTANCE, &fwdDistance);
-        EXPECT_EQ(fwdDistance, fwdDistanceRef);
-        descMoved.get_value(config_param::LENGTHS, &dftLength);
-        EXPECT_EQ(default_1d_lengths, dftLength);
-    }
-
-    return !::testing::Test::HasFailure();
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-static int test_getter_setter() {
-    set_and_get_lengths<precision, domain>();
-    set_and_get_io_strides<precision, domain>();
-    set_and_get_fwd_bwd_strides<precision, domain>();
-    set_and_get_values<precision, domain>();
-    get_readonly_values<precision, domain>();
-    set_readonly_values<precision, domain>();
-
-    return !::testing::Test::HasFailure();
-}
-
-template <oneapi::mkl::dft::precision precision, oneapi::mkl::dft::domain domain>
-int test_commit(sycl::device* dev) {
-    sycl::queue sycl_queue(*dev, exception_handler);
-
-    if constexpr (precision == oneapi::mkl::dft::precision::DOUBLE) {
-        if (!dev->has(sycl::aspect::fp64)) {
-            std::cout << "Device does not support double precision." << std::endl;
-            return test_skipped;
-        }
-    }
-
-    // test that descriptor is supported
-    try {
-        oneapi::mkl::dft::descriptor<precision, domain> descriptor{ default_1d_lengths };
-        commit_descriptor(descriptor, sycl_queue);
-    }
-    catch (oneapi::mkl::unimplemented& e) {
-        std::cout << "Skipping because simple commit not supported. Reason: \"" << e.what()
-                  << "\"\n";
-        return test_skipped;
-    }
-
-    get_commited<precision, domain>(sycl_queue);
-    recommit_values<precision, domain>(sycl_queue);
-    change_queue_causes_wait<precision, domain>(sycl_queue);
-    swap_out_dead_queue<precision, domain>(sycl_queue);
-
-    return !::testing::Test::HasFailure();
-}
-
-TEST(DescriptorTests, DescriptorMoveRealSingle) {
-    EXPECT_TRUE((test_move<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>()));
-}
-
-TEST(DescriptorTests, DescriptorMoveRealDouble) {
-    EXPECT_TRUE((test_move<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL>()));
-}
-
-TEST(DescriptorTests, DescriptorMoveComplexSingle) {
-    EXPECT_TRUE(
-        (test_move<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX>()));
-}
-
-TEST(DescriptorTests, DescriptorMoveComplexDouble) {
-    EXPECT_TRUE(
-        (test_move<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::COMPLEX>()));
-}
-
-TEST(DescriptorTests, DescriptorTestsRealSingle) {
-    EXPECT_TRUE((
-        test_getter_setter<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>()));
-}
-
-TEST(DescriptorTests, DescriptorTestsRealDouble) {
-    EXPECT_TRUE((
-        test_getter_setter<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL>()));
-}
-
-TEST(DescriptorTests, DescriptorTestsComplexSingle) {
-    EXPECT_TRUE((test_getter_setter<oneapi::mkl::dft::precision::SINGLE,
-                                    oneapi::mkl::dft::domain::COMPLEX>()));
-}
-
-TEST(DescriptorTests, DescriptorTestsComplexDouble) {
-    EXPECT_TRUE((test_getter_setter<oneapi::mkl::dft::precision::DOUBLE,
-                                    oneapi::mkl::dft::domain::COMPLEX>()));
-}
-
-class DescriptorCommitTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(DescriptorCommitTests, DescriptorCommitTestsRealSingle) {
-    EXPECT_TRUEORSKIP(
-        (test_commit<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>(
-            GetParam())));
-}
-
-TEST_P(DescriptorCommitTests, DescriptorCommitTestsRealDouble) {
-    EXPECT_TRUEORSKIP(
-        (test_commit<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL>(
-            GetParam())));
-}
-
-TEST_P(DescriptorCommitTests, DescriptorCommitTestsComplexSingle) {
-    EXPECT_TRUEORSKIP(
-        (test_commit<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX>(
-            GetParam())));
-}
-
-TEST_P(DescriptorCommitTests, DescriptorCommitTestsComplexDouble) {
-    EXPECT_TRUEORSKIP(
-        (test_commit<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::COMPLEX>(
-            GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(DescriptorCommitTestSuite, DescriptorCommitTests,
-                         testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/dft/source/workspace_external_tests.cpp b/tests/unit_tests/dft/source/workspace_external_tests.cpp
deleted file mode 100644
index f96544a90..000000000
--- a/tests/unit_tests/dft/source/workspace_external_tests.cpp
+++ /dev/null
@@ -1,403 +0,0 @@
-/***************************************************************************
-*  Copyright (C) Codeplay Software Limited
-*  Licensed under the Apache License, Version 2.0 (the "License");
-*  you may not use this file except in compliance with the License.
-*  You may obtain a copy of the License at
-*
-*      http://www.apache.org/licenses/LICENSE-2.0
-*
-*  For your convenience, a copy of the License has been included in this
-*  repository.
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*
-**************************************************************************/
-
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "test_helper.hpp"
-#include "test_common.hpp"
-#include "parseval_check.hpp"
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-class WorkspaceExternalTests : public ::testing::TestWithParam<sycl::device*> {};
-
-template <oneapi::mkl::dft::precision prec, oneapi::mkl::dft::domain dom>
-int test_workspace_external_usm_impl(std::size_t dft_size, sycl::device* dev) {
-    using namespace oneapi::mkl::dft;
-    using scalar_t = std::conditional_t<prec == precision::DOUBLE, double, float>;
-    using forward_t = std::conditional_t<dom == domain::COMPLEX, std::complex<scalar_t>, scalar_t>;
-    using backward_t = std::complex<scalar_t>;
-
-    sycl::queue sycl_queue(*dev);
-    if (prec == precision::DOUBLE && !sycl_queue.get_device().has(sycl::aspect::fp64)) {
-        std::cout << "Device does not support double precision." << std::endl;
-        return test_skipped;
-    }
-    descriptor<prec, dom> desc(static_cast<std::int64_t>(dft_size));
-
-    desc.set_value(config_param::WORKSPACE_PLACEMENT, config_value::WORKSPACE_EXTERNAL);
-    desc.set_value(config_param::PLACEMENT, config_value::NOT_INPLACE);
-    try {
-        commit_descriptor(desc, sycl_queue);
-    }
-    catch (oneapi::mkl::unimplemented&) {
-        std::cout << "Test configuration not implemented." << std::endl;
-        return test_skipped;
-    }
-    std::int64_t workspace_bytes = -1;
-    desc.get_value(config_param::WORKSPACE_EXTERNAL_BYTES, &workspace_bytes);
-    if (workspace_bytes < 0) {
-        return ::testing::Test::HasFailure();
-    }
-    scalar_t* workspace = sycl::malloc_device<scalar_t>(
-        static_cast<std::size_t>(workspace_bytes) / sizeof(scalar_t), sycl_queue);
-    desc.set_workspace(workspace);
-    // Generate data
-    std::vector<forward_t> host_fwd(static_cast<std::size_t>(dft_size));
-    std::size_t bwd_size = dom == domain::COMPLEX ? dft_size : dft_size / 2 + 1;
-    std::vector<backward_t> host_bwd(bwd_size);
-    rand_vector(host_fwd, dft_size);
-
-    // Allocate enough memory that we don't have to worry about the domain.
-    forward_t* device_fwd = sycl::malloc_device<forward_t>(dft_size, sycl_queue);
-    backward_t* deviceBwd = sycl::malloc_device<backward_t>(bwd_size, sycl_queue);
-    sycl_queue.copy(host_fwd.data(), device_fwd, dft_size);
-    sycl_queue.wait_and_throw();
-
-    compute_forward<decltype(desc), forward_t, backward_t>(desc, device_fwd, deviceBwd);
-    sycl_queue.wait_and_throw();
-
-    sycl_queue.copy(deviceBwd, host_bwd.data(), bwd_size);
-    sycl_queue.wait_and_throw();
-
-    // To see external workspaces, larger sizes of DFT may be needed. Using the reference DFT with larger sizes is slow,
-    // so use Parseval's theorum as a sanity check instead.
-    bool sanityCheckPasses = parseval_check(dft_size, host_fwd.data(), host_bwd.data());
-
-    if (sanityCheckPasses) {
-        sycl_queue.copy(host_fwd.data(), device_fwd, dft_size);
-        sycl_queue.wait_and_throw();
-        compute_backward<decltype(desc), backward_t, forward_t>(desc, deviceBwd, device_fwd);
-        sycl_queue.wait_and_throw();
-        sycl_queue.copy(device_fwd, host_fwd.data(), dft_size);
-        sycl_queue.wait_and_throw();
-        forward_t rescale =
-            static_cast<forward_t>(1) / static_cast<forward_t>(static_cast<scalar_t>(dft_size));
-        sanityCheckPasses = parseval_check(dft_size, host_fwd.data(), host_bwd.data(), rescale);
-    }
-
-    sycl::free(device_fwd, sycl_queue);
-    sycl::free(deviceBwd, sycl_queue);
-    sycl::free(workspace, sycl_queue);
-    return sanityCheckPasses ? !::testing::Test::HasFailure() : ::testing::Test::HasFailure();
-}
-
-template <oneapi::mkl::dft::precision prec, oneapi::mkl::dft::domain dom>
-int test_workspace_external_buffer_impl(std::size_t dft_size, sycl::device* dev) {
-    using namespace oneapi::mkl::dft;
-    using scalar_t = std::conditional_t<prec == precision::DOUBLE, double, float>;
-    using forward_t = std::conditional_t<dom == domain::COMPLEX, std::complex<scalar_t>, scalar_t>;
-    using backward_t = std::complex<scalar_t>;
-
-    sycl::queue sycl_queue(*dev);
-    if (prec == precision::DOUBLE && !sycl_queue.get_device().has(sycl::aspect::fp64)) {
-        std::cout << "Device does not support double precision." << std::endl;
-        return test_skipped;
-    }
-    descriptor<prec, dom> desc(static_cast<std::int64_t>(dft_size));
-
-    desc.set_value(config_param::WORKSPACE_PLACEMENT, config_value::WORKSPACE_EXTERNAL);
-    desc.set_value(config_param::PLACEMENT, config_value::NOT_INPLACE);
-    try {
-        commit_descriptor(desc, sycl_queue);
-    }
-    catch (oneapi::mkl::unimplemented&) {
-        std::cout << "Test configuration not implemented." << std::endl;
-        return test_skipped;
-    }
-    std::int64_t workspace_bytes = -1;
-    desc.get_value(config_param::WORKSPACE_EXTERNAL_BYTES, &workspace_bytes);
-    if (workspace_bytes < 0) {
-        return ::testing::Test::HasFailure();
-    }
-    sycl::buffer<scalar_t> workspace(static_cast<std::size_t>(workspace_bytes) / sizeof(scalar_t));
-    desc.set_workspace(workspace);
-    // Generate data
-    std::vector<forward_t> host_fwd(static_cast<std::size_t>(dft_size));
-    std::size_t bwd_size =
-        dom == domain::COMPLEX ? dft_size : dft_size / 2 + 1; // TODO: Check this!
-    std::vector<backward_t> host_bwd(bwd_size);
-    rand_vector(host_fwd, dft_size);
-    auto host_fwdCpy = host_fwd; // Some backends modify the input data (rocFFT).
-
-    {
-        sycl::buffer<forward_t> buf_fwd(host_fwd);
-        sycl::buffer<backward_t> buf_bwd(host_bwd);
-        compute_forward<decltype(desc), forward_t, backward_t>(desc, buf_fwd, buf_bwd);
-    }
-
-    // To see external workspaces, larger sizes of DFT may be needed. Using the reference DFT with larger sizes is slow,
-    // so use Parseval's theorum as a sanity check instead.
-    bool sanityCheckPasses = parseval_check(dft_size, host_fwdCpy.data(), host_bwd.data());
-
-    if (sanityCheckPasses) {
-        auto host_bwdCpy = host_bwd;
-        {
-            sycl::buffer<forward_t> buf_fwd(host_fwd);
-            sycl::buffer<backward_t> buf_bwd(host_bwd);
-            compute_backward<decltype(desc), backward_t, forward_t>(desc, buf_bwd, buf_fwd);
-            sycl_queue.wait_and_throw();
-        }
-        forward_t rescale =
-            static_cast<forward_t>(1) / static_cast<forward_t>(static_cast<scalar_t>(dft_size));
-        sanityCheckPasses = parseval_check(dft_size, host_fwd.data(), host_bwdCpy.data(), rescale);
-    }
-
-    return sanityCheckPasses ? !::testing::Test::HasFailure() : ::testing::Test::HasFailure();
-}
-
-template <oneapi::mkl::dft::precision prec, oneapi::mkl::dft::domain dom>
-void test_workspace_external_usm(sycl::device* dev) {
-    EXPECT_TRUEORSKIP((test_workspace_external_usm_impl<prec, dom>(2, dev)));
-    EXPECT_TRUEORSKIP((test_workspace_external_usm_impl<prec, dom>(1024 * 3 * 5 * 7 * 16, dev)));
-}
-
-template <oneapi::mkl::dft::precision prec, oneapi::mkl::dft::domain dom>
-void test_workspace_external_buffer(sycl::device* dev) {
-    EXPECT_TRUEORSKIP((test_workspace_external_buffer_impl<prec, dom>(2, dev)));
-    EXPECT_TRUEORSKIP((test_workspace_external_buffer_impl<prec, dom>(1024 * 3 * 5 * 7 * 16, dev)));
-}
-
-TEST_P(WorkspaceExternalTests, TestWorkspaceExternalSingleUsm) {
-    using precision = oneapi::mkl::dft::precision;
-    using domain = oneapi::mkl::dft::domain;
-    test_workspace_external_usm<precision::SINGLE, domain::REAL>(GetParam());
-    test_workspace_external_usm<precision::SINGLE, domain::COMPLEX>(GetParam());
-}
-
-TEST_P(WorkspaceExternalTests, TestWorkspaceExternalDoubleUsm) {
-    using precision = oneapi::mkl::dft::precision;
-    using domain = oneapi::mkl::dft::domain;
-    test_workspace_external_usm<precision::DOUBLE, domain::REAL>(GetParam());
-    test_workspace_external_usm<precision::DOUBLE, domain::COMPLEX>(GetParam());
-}
-
-TEST_P(WorkspaceExternalTests, TestWorkspaceExternalSingleBuffer) {
-    using precision = oneapi::mkl::dft::precision;
-    using domain = oneapi::mkl::dft::domain;
-    test_workspace_external_buffer<precision::SINGLE, domain::REAL>(GetParam());
-    test_workspace_external_buffer<precision::SINGLE, domain::COMPLEX>(GetParam());
-}
-
-TEST_P(WorkspaceExternalTests, TestWorkspaceExternalDoubleBuffer) {
-    using precision = oneapi::mkl::dft::precision;
-    using domain = oneapi::mkl::dft::domain;
-    test_workspace_external_buffer<precision::DOUBLE, domain::REAL>(GetParam());
-    test_workspace_external_buffer<precision::DOUBLE, domain::COMPLEX>(GetParam());
-}
-
-/// A test where set_workspace is called when an external workspace is not set.
-TEST_P(WorkspaceExternalTests, SetWorkspaceOnWorkspaceAutomatic) {
-    using namespace oneapi::mkl::dft;
-    sycl::queue sycl_queue(*GetParam());
-    const int dft_len = 1024 * 3 * 5 * 7 * 16; // A size likely to require an external workspace.
-    float* fft_data_usm = sycl::malloc_device<float>(dft_len * 2, sycl_queue);
-    sycl::buffer<float> fft_data_buf(dft_len * 2);
-    descriptor<precision::SINGLE, domain::COMPLEX> desc_usm(dft_len), desc_buf(dft_len);
-    try {
-        // WORKSPACE_EXTERNAL is NOT set.
-        commit_descriptor(desc_usm, sycl_queue);
-        commit_descriptor(desc_buf, sycl_queue);
-    }
-    catch (oneapi::mkl::unimplemented&) {
-        // The DFT size may not be supported. Use a size that is likely to be supported, even if
-        // that means no external workspace is actually used.
-        descriptor<precision::SINGLE, domain::COMPLEX> desc_usm2(2), desc_buf2(2);
-        desc_usm = std::move(desc_usm2);
-        desc_buf = std::move(desc_buf2);
-        commit_descriptor(desc_usm, sycl_queue);
-        commit_descriptor(desc_buf, sycl_queue);
-    }
-    std::int64_t workspace_bytes = 0;
-    desc_usm.get_value(config_param::WORKSPACE_EXTERNAL_BYTES, &workspace_bytes);
-
-    // No workspace set yet: all of the following should work.
-    compute_forward(desc_usm, fft_data_usm);
-    compute_forward(desc_buf, fft_data_buf);
-    compute_backward(desc_usm, fft_data_usm);
-    compute_backward(desc_buf, fft_data_buf);
-    compute_forward(desc_usm, fft_data_buf);
-    compute_forward(desc_buf, fft_data_usm);
-    compute_backward(desc_usm, fft_data_buf);
-    compute_backward(desc_buf, fft_data_usm);
-    sycl_queue.wait_and_throw();
-
-    // Set workspace
-    float* usm_workspace = sycl::malloc_device<float>(
-        static_cast<std::size_t>(workspace_bytes) / sizeof(float), sycl_queue);
-    sycl::buffer<float> bufferWorkspace(static_cast<std::size_t>(workspace_bytes) / sizeof(float));
-    desc_usm.set_workspace(usm_workspace);
-    desc_buf.set_workspace(bufferWorkspace);
-
-    // Should work:
-    compute_forward(desc_usm, fft_data_usm);
-    sycl_queue.wait_and_throw();
-    compute_forward(desc_buf, fft_data_buf);
-    sycl_queue.wait_and_throw();
-    compute_backward(desc_usm, fft_data_usm);
-    sycl_queue.wait_and_throw();
-    compute_backward(desc_buf, fft_data_buf);
-    sycl_queue.wait_and_throw();
-
-    // Should not work:
-    EXPECT_THROW(compute_forward(desc_usm, fft_data_buf), oneapi::mkl::invalid_argument);
-    EXPECT_THROW(compute_forward(desc_buf, fft_data_usm), oneapi::mkl::invalid_argument);
-    EXPECT_THROW(compute_backward(desc_usm, fft_data_buf), oneapi::mkl::invalid_argument);
-    EXPECT_THROW(compute_backward(desc_buf, fft_data_usm), oneapi::mkl::invalid_argument);
-    sycl_queue.wait_and_throw();
-
-    // Free any allocations:
-    sycl::free(usm_workspace, sycl_queue);
-    sycl::free(fft_data_usm, sycl_queue);
-}
-
-/// Test that the implementation throws as expected.
-TEST_P(WorkspaceExternalTests, ThrowOnBadCalls) {
-    using namespace oneapi::mkl::dft;
-    sycl::queue sycl_queue(*GetParam());
-    const int dft_len = 1024 * 3 * 5 * 7 * 16; // A size likely to require an external workspace.
-    float* fft_data_usm = sycl::malloc_device<float>(dft_len * 2, sycl_queue);
-    sycl::buffer<float> fft_data_buf(dft_len * 2);
-    descriptor<precision::SINGLE, domain::COMPLEX> desc_usm(dft_len), desc_buf(dft_len);
-    desc_usm.set_value(config_param::WORKSPACE_PLACEMENT, config_value::WORKSPACE_EXTERNAL);
-    desc_buf.set_value(config_param::WORKSPACE_PLACEMENT, config_value::WORKSPACE_EXTERNAL);
-    // We expect the following to throw because the decriptor has not been committed.
-    std::int64_t workspace_bytes = -10;
-    float* usm_workspace = nullptr;
-    EXPECT_THROW(desc_usm.get_value(config_param::WORKSPACE_EXTERNAL_BYTES, &workspace_bytes),
-                 oneapi::mkl::invalid_argument);
-    EXPECT_THROW(desc_usm.set_workspace(usm_workspace), oneapi::mkl::uninitialized);
-    try {
-        commit_descriptor(desc_usm, sycl_queue);
-        commit_descriptor(desc_buf, sycl_queue);
-    }
-    catch (oneapi::mkl::unimplemented&) {
-        // DFT size may not be supported. Use a DFT size that probably will be, even if it
-        // won't actually use an external workspace internally.
-        descriptor<precision::SINGLE, domain::COMPLEX> desc_usm2(2), desc_buf2(2);
-        desc_usm = std::move(desc_usm2);
-        desc_buf = std::move(desc_buf2);
-        desc_usm.set_value(config_param::WORKSPACE_PLACEMENT, config_value::WORKSPACE_EXTERNAL);
-        desc_buf.set_value(config_param::WORKSPACE_PLACEMENT, config_value::WORKSPACE_EXTERNAL);
-        commit_descriptor(desc_usm, sycl_queue);
-        commit_descriptor(desc_buf, sycl_queue);
-    }
-
-    desc_usm.get_value(config_param::WORKSPACE_EXTERNAL_BYTES, &workspace_bytes);
-    EXPECT_GE(workspace_bytes, 0);
-
-    // We haven't set a workspace, so the following should fail;
-    EXPECT_THROW(compute_forward(desc_usm, fft_data_usm), oneapi::mkl::invalid_argument);
-    sycl_queue.wait_and_throw();
-    EXPECT_THROW(compute_forward(desc_usm, fft_data_buf), oneapi::mkl::invalid_argument);
-    sycl_queue.wait_and_throw();
-
-    if (workspace_bytes > 0) {
-        EXPECT_THROW(desc_usm.set_workspace(nullptr), oneapi::mkl::invalid_argument);
-        sycl::buffer<float> undersize_workspace(
-            static_cast<std::size_t>(workspace_bytes) / sizeof(float) - 1);
-        EXPECT_THROW(desc_buf.set_workspace(undersize_workspace), oneapi::mkl::invalid_argument);
-    }
-
-    usm_workspace = sycl::malloc_device<float>(
-        static_cast<std::size_t>(workspace_bytes) / sizeof(float), sycl_queue);
-    sycl::buffer<float> bufferWorkspace(static_cast<std::size_t>(workspace_bytes) / sizeof(float));
-
-    desc_usm.set_workspace(usm_workspace);
-    desc_buf.set_workspace(bufferWorkspace);
-
-    // Should work:
-    compute_forward(desc_usm, fft_data_usm);
-    sycl_queue.wait_and_throw();
-    compute_forward(desc_buf, fft_data_buf);
-    sycl_queue.wait_and_throw();
-    compute_backward(desc_usm, fft_data_usm);
-    sycl_queue.wait_and_throw();
-    compute_backward(desc_buf, fft_data_buf);
-    sycl_queue.wait_and_throw();
-
-    // Should not work:
-    EXPECT_THROW(compute_forward(desc_usm, fft_data_buf), oneapi::mkl::invalid_argument);
-    EXPECT_THROW(compute_forward(desc_buf, fft_data_usm), oneapi::mkl::invalid_argument);
-    EXPECT_THROW(compute_backward(desc_usm, fft_data_buf), oneapi::mkl::invalid_argument);
-    EXPECT_THROW(compute_backward(desc_buf, fft_data_usm), oneapi::mkl::invalid_argument);
-    sycl_queue.wait_and_throw();
-
-    // Free any allocations:
-    sycl::free(usm_workspace, sycl_queue);
-    sycl::free(fft_data_usm, sycl_queue);
-}
-
-TEST_P(WorkspaceExternalTests, RecommitBehaviour) {
-    using namespace oneapi::mkl::dft;
-    sycl::queue sycl_queue(*GetParam());
-    const int dft_len = 1024 * 3 * 5 * 7 * 16; // A size likely to require an external workspace.
-    float* fft_data_usm = sycl::malloc_device<float>(dft_len * 2, sycl_queue);
-    descriptor<precision::SINGLE, domain::COMPLEX> desc_usm(dft_len);
-    try {
-        // WORKSPACE_EXTERNAL is NOT set.
-        commit_descriptor(desc_usm, sycl_queue);
-    }
-    catch (oneapi::mkl::unimplemented&) {
-        // DFT size may not be supported. Use a DFT size that probably will be, even if it
-        // won't actually use an external workspace internally.
-        descriptor<precision::SINGLE, domain::COMPLEX> desc_usm2(2);
-        desc_usm = std::move(desc_usm2);
-        commit_descriptor(desc_usm, sycl_queue);
-    }
-    std::int64_t workspace_bytes = 0;
-    desc_usm.get_value(config_param::WORKSPACE_EXTERNAL_BYTES, &workspace_bytes);
-    float* usm_workspace = sycl::malloc_device<float>(
-        static_cast<std::size_t>(workspace_bytes) / sizeof(float), sycl_queue);
-
-    // Should work with workspace automatic
-    compute_forward(desc_usm, fft_data_usm);
-    sycl_queue.wait_and_throw();
-
-    desc_usm.set_value(config_param::WORKSPACE_PLACEMENT, config_value::WORKSPACE_EXTERNAL);
-    commit_descriptor(desc_usm, sycl_queue);
-
-    // No workspace, expect throw
-    EXPECT_THROW(compute_forward(desc_usm, fft_data_usm), oneapi::mkl::invalid_argument);
-
-    desc_usm.set_workspace(usm_workspace);
-
-    compute_forward(desc_usm, fft_data_usm);
-    sycl_queue.wait_and_throw();
-
-    // Recommitting should require workspace to be set again.
-    commit_descriptor(desc_usm, sycl_queue);
-    EXPECT_THROW(compute_forward(desc_usm, fft_data_usm), oneapi::mkl::invalid_argument);
-    sycl_queue.wait_and_throw();
-
-    // Free any allocations:
-    sycl::free(usm_workspace, sycl_queue);
-    sycl::free(fft_data_usm, sycl_queue);
-}
-
-INSTANTIATE_TEST_SUITE_P(WorkspaceExternalTestSuite, WorkspaceExternalTests,
-                         testing::ValuesIn(devices), ::DeviceNamePrint());
diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp
deleted file mode 100644
index 7e0024195..000000000
--- a/tests/unit_tests/include/test_helper.hpp
+++ /dev/null
@@ -1,379 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _TEST_HELPER_HPP_
-#define _TEST_HELPER_HPP_
-
-#include <iostream>
-#include <string>
-#include <tuple>
-#include <gtest/gtest.h>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl/detail/backend_selector.hpp"
-
-#ifdef _WIN64
-#include <malloc.h>
-#else
-#include <stdlib.h>
-#endif
-
-#define test_failed  0
-#define test_passed  1
-#define test_skipped 2
-
-// Note GTEST_SKIP may not print the associated message when using ctest.
-// However, running a test binary with the flag `--terse-output` will print them.
-
-#define EXPECT_TRUEORSKIP(a)             \
-    do {                                 \
-        int res = a;                     \
-        if (res == test_skipped)         \
-            GTEST_SKIP();                \
-        else                             \
-            EXPECT_EQ(res, test_passed); \
-    } while (0);
-
-// GTEST_SKIP stops the execution of the program.
-// This macro lets a test use multiple EXPECT_TRUE_OR_FUTURE_SKIP and mark a test as skipped only once at the end.
-#define EXPECT_TRUE_OR_FUTURE_SKIP(a, num_passed, num_skipped) \
-    do {                                                       \
-        int res = a;                                           \
-        if (res == test_skipped)                               \
-            ++num_skipped;                                     \
-        else {                                                 \
-            ++num_passed;                                      \
-            EXPECT_EQ(res, test_passed);                       \
-        }                                                      \
-    } while (0);
-
-#define CHECK_DOUBLE_ON_DEVICE(d)                                        \
-    if (d->get_info<sycl::info::device::double_fp_config>().size() == 0) \
-    GTEST_SKIP() << "Double precision is not supported on the device"
-
-#if defined(ENABLE_MKLCPU_BACKEND) || defined(ENABLE_NETLIB_BACKEND)
-#ifdef ENABLE_MKLCPU_BACKEND
-#define TEST_RUN_INTELCPU_SELECT_NO_ARGS(q, func) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ q })
-#define TEST_RUN_INTELCPU_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_INTELCPU_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::netlib>{ q }, __VA_ARGS__)
-#endif
-#else
-#define TEST_RUN_INTELCPU_SELECT_NO_ARGS(q, func)
-#define TEST_RUN_INTELCPU_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_MKLGPU_BACKEND
-#define TEST_RUN_INTELGPU_SELECT_NO_ARGS(q, func) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklgpu>{ q })
-#define TEST_RUN_INTELGPU_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklgpu>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_INTELGPU_SELECT_NO_ARGS(q, func)
-#define TEST_RUN_INTELGPU_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_CUBLAS_BACKEND
-#define TEST_RUN_NVIDIAGPU_CUBLAS_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_NVIDIAGPU_CUBLAS_SELECT(q, func, ...)
-#endif
-#ifdef ENABLE_CUSOLVER_BACKEND
-#define TEST_RUN_NVIDIAGPU_CUSOLVER_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::cusolver>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_NVIDIAGPU_CUSOLVER_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_ROCBLAS_BACKEND
-#define TEST_RUN_AMDGPU_ROCBLAS_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::rocblas>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_AMDGPU_ROCBLAS_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_CURAND_BACKEND
-#define TEST_RUN_NVIDIAGPU_CURAND_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::curand>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_NVIDIAGPU_CURAND_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_ROCRAND_BACKEND
-#define TEST_RUN_AMDGPU_ROCRAND_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::rocrand>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_AMDGPU_ROCRAND_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_ROCSOLVER_BACKEND
-#define TEST_RUN_AMDGPU_ROCSOLVER_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::rocsolver>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_AMDGPU_ROCSOLVER_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_PORTBLAS_BACKEND
-#define TEST_RUN_PORTBLAS_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::portblas>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_PORTBLAS_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_CUFFT_BACKEND
-#define TEST_RUN_NVIDIAGPU_CUFFT_SELECT_NO_ARGS(q, func) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::cufft>{ q })
-#define TEST_RUN_NVIDIAGPU_CUFFT_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::cufft>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_NVIDIAGPU_CUFFT_SELECT_NO_ARGS(q, func)
-#define TEST_RUN_NVIDIAGPU_CUFFT_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_ROCFFT_BACKEND
-#define TEST_RUN_AMDGPU_ROCFFT_SELECT_NO_ARGS(q, func) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::rocfft>{ q })
-#define TEST_RUN_AMDGPU_ROCFFT_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::rocfft>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_AMDGPU_ROCFFT_SELECT_NO_ARGS(q, func)
-#define TEST_RUN_AMDGPU_ROCFFT_SELECT(q, func, ...)
-#endif
-
-#ifdef ENABLE_PORTFFT_BACKEND
-#define TEST_RUN_PORTFFT_SELECT_NO_ARGS(q, func) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::portfft>{ q })
-#define TEST_RUN_PORTFFT_SELECT(q, func, ...) \
-    func(oneapi::mkl::backend_selector<oneapi::mkl::backend::portfft>{ q }, __VA_ARGS__)
-#else
-#define TEST_RUN_PORTFFT_SELECT_NO_ARGS(q, func)
-#define TEST_RUN_PORTFFT_SELECT(q, func, ...)
-#endif
-
-#ifndef __HIPSYCL__
-#define CHECK_HOST_OR_CPU(q) q.get_device().is_cpu()
-#else
-#define CHECK_HOST_OR_CPU(q) q.is_host() || q.get_device().is_cpu()
-#endif
-
-#define TEST_RUN_CT_SELECT_NO_ARGS(q, func)                                \
-    do {                                                                   \
-        if (CHECK_HOST_OR_CPU(q)) {                                        \
-            TEST_RUN_INTELCPU_SELECT_NO_ARGS(q, func);                     \
-        }                                                                  \
-        else if (q.get_device().is_gpu()) {                                \
-            unsigned int vendor_id = static_cast<unsigned int>(            \
-                q.get_device().get_info<sycl::info::device::vendor_id>()); \
-            if (vendor_id == INTEL_ID) {                                   \
-                TEST_RUN_INTELGPU_SELECT_NO_ARGS(q, func);                 \
-            }                                                              \
-            else if (vendor_id == NVIDIA_ID) {                             \
-                TEST_RUN_NVIDIAGPU_CUFFT_SELECT_NO_ARGS(q, func);          \
-            }                                                              \
-            else if (vendor_id == AMD_ID) {                                \
-                TEST_RUN_AMDGPU_ROCFFT_SELECT_NO_ARGS(q, func);            \
-            }                                                              \
-        }                                                                  \
-        TEST_RUN_PORTFFT_SELECT_NO_ARGS(q, func);                          \
-    } while (0);
-
-#define TEST_RUN_CT_SELECT(q, func, ...)                                   \
-    do {                                                                   \
-        if (CHECK_HOST_OR_CPU(q))                                          \
-            TEST_RUN_INTELCPU_SELECT(q, func, __VA_ARGS__);                \
-        else if (q.get_device().is_gpu()) {                                \
-            unsigned int vendor_id = static_cast<unsigned int>(            \
-                q.get_device().get_info<sycl::info::device::vendor_id>()); \
-            if (vendor_id == INTEL_ID)                                     \
-                TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__);            \
-            else if (vendor_id == NVIDIA_ID) {                             \
-                TEST_RUN_NVIDIAGPU_CUBLAS_SELECT(q, func, __VA_ARGS__);    \
-                TEST_RUN_NVIDIAGPU_CUSOLVER_SELECT(q, func, __VA_ARGS__);  \
-                TEST_RUN_NVIDIAGPU_CURAND_SELECT(q, func, __VA_ARGS__);    \
-            }                                                              \
-            else if (vendor_id == AMD_ID) {                                \
-                TEST_RUN_AMDGPU_ROCBLAS_SELECT(q, func, __VA_ARGS__);      \
-                TEST_RUN_AMDGPU_ROCRAND_SELECT(q, func, __VA_ARGS__);      \
-                TEST_RUN_AMDGPU_ROCSOLVER_SELECT(q, func, __VA_ARGS__);    \
-                TEST_RUN_AMDGPU_ROCFFT_SELECT(q, func, __VA_ARGS__);       \
-            }                                                              \
-        }                                                                  \
-        TEST_RUN_PORTBLAS_SELECT(q, func, __VA_ARGS__);                    \
-        TEST_RUN_PORTFFT_SELECT(q, func, __VA_ARGS__);                     \
-    } while (0);
-
-#define TEST_RUN_BLAS_CT_SELECT(q, func, ...)                              \
-    do {                                                                   \
-        if (CHECK_HOST_OR_CPU(q))                                          \
-            TEST_RUN_INTELCPU_SELECT(q, func, __VA_ARGS__);                \
-        else if (q.get_device().is_gpu()) {                                \
-            unsigned int vendor_id = static_cast<unsigned int>(            \
-                q.get_device().get_info<sycl::info::device::vendor_id>()); \
-            if (vendor_id == INTEL_ID)                                     \
-                TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__);            \
-            else if (vendor_id == NVIDIA_ID) {                             \
-                TEST_RUN_NVIDIAGPU_CUBLAS_SELECT(q, func, __VA_ARGS__);    \
-            }                                                              \
-            else if (vendor_id == AMD_ID) {                                \
-                TEST_RUN_AMDGPU_ROCBLAS_SELECT(q, func, __VA_ARGS__);      \
-            }                                                              \
-        }                                                                  \
-        TEST_RUN_PORTBLAS_SELECT(q, func, __VA_ARGS__);                    \
-    } while (0);
-
-#define TEST_RUN_RNG_CT_SELECT(q, func, ...)                               \
-    do {                                                                   \
-        if (CHECK_HOST_OR_CPU(q))                                          \
-            TEST_RUN_INTELCPU_SELECT(q, func, __VA_ARGS__);                \
-        else if (q.get_device().is_gpu()) {                                \
-            unsigned int vendor_id = static_cast<unsigned int>(            \
-                q.get_device().get_info<sycl::info::device::vendor_id>()); \
-            if (vendor_id == INTEL_ID)                                     \
-                TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__);            \
-            else if (vendor_id == NVIDIA_ID) {                             \
-                TEST_RUN_NVIDIAGPU_CURAND_SELECT(q, func, __VA_ARGS__);    \
-            }                                                              \
-            else if (vendor_id == AMD_ID) {                                \
-                TEST_RUN_AMDGPU_ROCRAND_SELECT(q, func, __VA_ARGS__);      \
-            }                                                              \
-        }                                                                  \
-    } while (0);
-
-#define TEST_RUN_LAPACK_CT_SELECT(q, func, ...)                            \
-    do {                                                                   \
-        if (CHECK_HOST_OR_CPU(q))                                          \
-            TEST_RUN_INTELCPU_SELECT(q, func, __VA_ARGS__);                \
-        else if (q.get_device().is_gpu()) {                                \
-            unsigned int vendor_id = static_cast<unsigned int>(            \
-                q.get_device().get_info<sycl::info::device::vendor_id>()); \
-            if (vendor_id == INTEL_ID)                                     \
-                TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__);            \
-            else if (vendor_id == NVIDIA_ID) {                             \
-                TEST_RUN_NVIDIAGPU_CUSOLVER_SELECT(q, func, __VA_ARGS__);  \
-            }                                                              \
-            else if (vendor_id == AMD_ID) {                                \
-                TEST_RUN_AMDGPU_ROCSOLVER_SELECT(q, func, __VA_ARGS__);    \
-            }                                                              \
-        }                                                                  \
-    } while (0);
-
-void print_error_code(sycl::exception const &e);
-
-class DeviceNamePrint {
-public:
-    std::string operator()(testing::TestParamInfo<sycl::device *> dev) const {
-        std::string dev_name = dev.param->get_info<sycl::info::device::name>();
-        for (std::string::size_type i = 0; i < dev_name.size(); ++i) {
-            if (!isalnum(dev_name[i]))
-                dev_name[i] = '_';
-        }
-        if (dev_name.size() == 0)
-            dev_name = dev_name.append("_");
-        return dev_name;
-    }
-};
-
-class LayoutDeviceNamePrint {
-public:
-    std::string operator()(
-        testing::TestParamInfo<std::tuple<sycl::device *, oneapi::mkl::layout>> dev) const {
-        std::string layout_name =
-            std::get<1>(dev.param) == oneapi::mkl::layout::col_major ? "Column_Major" : "Row_Major";
-        std::string dev_name = std::get<0>(dev.param)->get_info<sycl::info::device::name>();
-        for (std::string::size_type i = 0; i < dev_name.size(); ++i) {
-            if (!isalnum(dev_name[i]))
-                dev_name[i] = '_';
-        }
-        std::string info_name = (layout_name.append("_")).append(dev_name);
-        return info_name;
-    }
-};
-
-/* to accommodate Windows and Linux differences between alligned_alloc and
-   _aligned_malloc calls use oneapi::mkl::aligned_alloc and oneapi::mkl::aligned_free instead */
-namespace oneapi {
-namespace mkl {
-
-static inline void *aligned_alloc(size_t align, size_t size) {
-#ifdef _WIN64
-    return ::_aligned_malloc(size, align);
-#else
-    return ::aligned_alloc(align, size);
-#endif
-}
-
-static inline void aligned_free(void *p) {
-#ifdef _WIN64
-    ::_aligned_free(p);
-#else
-    ::free(p);
-#endif
-}
-
-/* Support for Unified Shared Memory allocations for different backends */
-static inline void *malloc_shared(size_t align, size_t size, sycl::device dev, sycl::context ctx) {
-    (void)align;
-#ifdef _WIN64
-    return sycl::malloc_shared(size, dev, ctx);
-#else
-#if defined(ENABLE_CUBLAS_BACKEND) || defined(ENABLE_ROCBLAS_BACKEND)
-    return sycl::aligned_alloc_shared(align, size, dev, ctx);
-#endif
-#if !defined(ENABLE_CUBLAS_BACKEND) && !defined(ENABLE_ROCBLAS_BACKEND)
-    return sycl::malloc_shared(size, dev, ctx);
-#endif
-#endif
-}
-
-static inline void *malloc_device(size_t align, size_t size, sycl::device dev, sycl::context ctx) {
-    (void)align;
-#ifdef _WIN64
-    return sycl::malloc_device(size, dev, ctx);
-#else
-#if defined(ENABLE_CUBLAS_BACKEND) || defined(ENABLE_ROCBLAS_BACKEND)
-    return sycl::aligned_alloc_device(align, size, dev, ctx);
-#endif
-#if !defined(ENABLE_CUBLAS_BACKEND) && !defined(ENABLE_ROCBLAS_BACKEND)
-    return sycl::malloc_device(size, dev, ctx);
-#endif
-#endif
-}
-
-static inline void free_shared(void *p, sycl::context ctx) {
-    sycl::free(p, ctx);
-}
-
-static inline void free_usm(void *p, sycl::context ctx) {
-    sycl::free(p, ctx);
-}
-
-} // namespace mkl
-} // namespace oneapi
-
-#endif // _TEST_HELPER_HPP_
diff --git a/tests/unit_tests/lapack/CMakeLists.txt b/tests/unit_tests/lapack/CMakeLists.txt
deleted file mode 100644
index 9f2470044..000000000
--- a/tests/unit_tests/lapack/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(source)
-add_subdirectory(common)
diff --git a/tests/unit_tests/lapack/common/CMakeLists.txt b/tests/unit_tests/lapack/common/CMakeLists.txt
deleted file mode 100644
index 53479702b..000000000
--- a/tests/unit_tests/lapack/common/CMakeLists.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-#===============================================================================
-# Copyright 2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(LAPACK_COMMON_SOURCES "dependency_check.cpp" "test_log.cpp")
-
-if(BUILD_SHARED_LIBS)
-  target_sources(lapack_source_rt PRIVATE ${LAPACK_COMMON_SOURCES})
-endif()
-
-target_sources(lapack_source_ct PRIVATE ${LAPACK_COMMON_SOURCES})
diff --git a/tests/unit_tests/lapack/common/dependency_check.cpp b/tests/unit_tests/lapack/common/dependency_check.cpp
deleted file mode 100644
index 30d2d1d4a..000000000
--- a/tests/unit_tests/lapack/common/dependency_check.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "lapack_common.hpp"
-
-namespace {
-
-std::vector<int64_t> host_data(1024);
-int64_t* device_data = nullptr;
-
-} // namespace
-
-sycl::event create_dependency(sycl::queue queue) {
-    ::device_data = device_alloc<int64_t>(queue, ::host_data.size());
-    return host_to_device_copy(queue, ::host_data.data(), ::device_data, ::host_data.size());
-}
-
-void log_status(const char* name, sycl::info::event_command_status status) {
-    test_log::lout << name << " command execution status: ";
-    if (sycl::info::event_command_status::submitted == status)
-        test_log::lout << "submitted";
-    else if (sycl::info::event_command_status::running == status)
-        test_log::lout << "running";
-    else if (sycl::info::event_command_status::complete == status)
-        test_log::lout << "complete";
-    else
-        test_log::lout << "status unknown";
-    test_log::lout << " (" << static_cast<int64_t>(status) << ")" << std::endl;
-}
-
-bool check_dependency(sycl::queue queue, sycl::event in_event, sycl::event func_event) {
-    sycl::info::event_command_status in_status;
-    sycl::info::event_command_status func_status;
-
-    do {
-        func_status = func_event.get_info<sycl::info::event::command_execution_status>();
-    } while (func_status != sycl::info::event_command_status::running &&
-             func_status != sycl::info::event_command_status::complete);
-    in_status = in_event.get_info<sycl::info::event::command_execution_status>();
-
-    /* Print results */
-    auto result = (in_status == sycl::info::event_command_status::complete);
-    if (!result) {
-        log_status("in_event", in_status);
-        log_status("func_event", func_status);
-    }
-
-    device_free(queue, ::device_data);
-    return result;
-}
diff --git a/tests/unit_tests/lapack/common/test_log.cpp b/tests/unit_tests/lapack/common/test_log.cpp
deleted file mode 100644
index 939d97639..000000000
--- a/tests/unit_tests/lapack/common/test_log.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <array>
-#include <iostream>
-#include <sstream>
-
-namespace test_log {
-
-std::stringstream lout{};
-std::array<char, 1024> buffer{};
-std::string padding{};
-
-void print() {
-    std::cout.clear();
-    if (lout.rdbuf()->in_avail()) { /* check if stream is non-empty */
-        while (lout.good()) {
-            std::string line;
-            std::getline(lout, line);
-            std::cout << padding << "\t" << line << std::endl;
-        }
-    }
-    lout.str("");
-    lout.clear();
-}
-
-} // namespace test_log
diff --git a/tests/unit_tests/lapack/include/lapack_accuracy_checks.hpp b/tests/unit_tests/lapack/include/lapack_accuracy_checks.hpp
deleted file mode 100644
index 757541b11..000000000
--- a/tests/unit_tests/lapack/include/lapack_accuracy_checks.hpp
+++ /dev/null
@@ -1,649 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <cstdio>
-#include <complex>
-#include <vector>
-
-#include "lapack_common.hpp"
-#include "lapack_reference_wrappers.hpp"
-
-/* computes |A - Ref| / (|Ref| min(m,n) eps) < threshold */
-template <typename fp>
-bool rel_mat_err_check(int64_t m, int64_t n, const std::vector<fp>& A, int64_t lda,
-                       const std::vector<fp>& Ref, int64_t ldr, float threshold = 10.0,
-                       char norm_type = '1') {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    std::vector<fp> residual(m * n);
-    for (int64_t col = 0; col < n; col++) {
-        for (int64_t row = 0; row < m; row++) {
-            residual[row + col * m] = A[row + col * lda] - Ref[row + col * ldr];
-        }
-    }
-
-    /* Compute norm of residual and check if it is within tolerance threshold */
-    auto norm_residual = reference::lange(norm_type, m, n, residual.data(), m);
-    auto norm_Ref = reference::lange(norm_type, m, n, Ref.data(), ldr);
-    auto ulp = reference::lamch<fp_real>('P');
-    auto denom = norm_Ref * std::min(m, n) * ulp;
-    denom = denom > 0.0 ? denom : ulp;
-
-    auto rel_err = norm_residual / denom;
-
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "|A - Ref| / (|Ref| min(m,n) eps) = |%e| / (|%e| %d * %e) = %e", norm_residual,
-                 norm_Ref, static_cast<int>(std::min(m, n)), ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-    return result;
-}
-
-/* computes |A - I| / (|I| n eps) < threshold */
-template <typename fp>
-bool rel_id_err_check(int64_t n, const std::vector<fp>& A, int64_t lda, float threshold = 10.0,
-                      char norm_type = '1') {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    std::vector<fp> residual(n * n);
-    reference::lacpy('F', n, n, A.data(), lda, residual.data(), n);
-    for (int64_t diag = 0; diag < n; diag++) {
-        residual[diag + diag * n] -= static_cast<fp_real>(1.0);
-    }
-
-    /* Compute norm of residual and check if it is within tolerance threshold */
-    auto norm_residual = reference::lange(norm_type, n, n, residual.data(), n);
-    auto ulp = reference::lamch<fp_real>('P');
-    auto denom = n * n * ulp;
-    denom = denom > 0.0 ? denom : ulp;
-    auto rel_err = norm_residual / denom;
-
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "|A - I| / (|I| n eps) = |%e| / (|%d| %d * %e) = %e", norm_residual,
-                 static_cast<int>(n), static_cast<int>(n), ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-    return result;
-}
-
-/* computes |V - Ref| / (|Ref| eps) < threshold */
-template <typename fp>
-bool rel_vec_err_check(int64_t n, const std::vector<fp>& A, const std::vector<fp>& Ref,
-                       float threshold = 10.0, char norm_type = '1') {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    std::vector<fp> residual(n);
-    for (int64_t row = 0; row < n; row++) {
-        residual[row] = A[row] - Ref[row];
-    }
-
-    /* Compute norm of residual and check if it is within tolerance threshold */
-    auto norm_residual = reference::lange(norm_type, n, 1, residual.data(), n);
-    auto norm_Ref = reference::lange(norm_type, n, 1, Ref.data(), n);
-    auto ulp = reference::lamch<fp_real>('P');
-    auto denom = norm_Ref * ulp;
-    denom = denom > 0.0 ? denom : ulp;
-    auto rel_err = norm_residual / denom;
-
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "|V - Ref| / (|Ref| eps) = |%e| / (|%e| %e) = %e", norm_residual, norm_Ref, ulp,
-                 rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-    return result;
-}
-
-template <typename fp>
-bool check_geqrf_accuracy(int64_t m, int64_t n, const std::vector<fp>& A, int64_t lda,
-                          const std::vector<fp>& tau, const std::vector<fp>& A_initial) {
-    bool result = true;
-    /* |A - Q R| < |A| O(eps) */
-    std::vector<fp> R(m * n);
-    int64_t ldr = m;
-    reference::laset(oneapi::mkl::uplo::lower, m, n, 0.0, 0.0, R.data(), ldr);
-    reference::lacpy(oneapi::mkl::uplo::upper, m, n, A.data(), lda, R.data(), ldr);
-    auto info = reference::or_un_mqr(oneapi::mkl::side::left, oneapi::mkl::transpose::nontrans, m,
-                                     n, std::min(m, n), A.data(), lda, tau.data(), R.data(), ldr);
-    if (0 != info) {
-        test_log::lout << "reference ormqr/unmqr failed with info = " << info << std::endl;
-        return false;
-    }
-    const auto& QR = R;
-    auto ldqr = ldr;
-    if (!rel_mat_err_check(m, n, QR, ldqr, A_initial, lda)) {
-        test_log::lout << "Factorization check failed" << std::endl;
-        result = false;
-    }
-
-    /* | I - Q Q' | < n O(eps) */
-    std::vector<fp> Q(m * m);
-    int64_t ldq = m;
-    reference::lacpy('L', m - 1, n, A.data() + 1, lda, Q.data() + 1, ldq);
-    info = reference::or_un_gqr(m, m, std::min(m, n), Q.data(), ldq, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference org/ungqr failed with info = " << info << std::endl;
-        return false;
-    }
-    std::vector<fp> QQ(m * m);
-    int64_t ldqq = m;
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, m, m, m,
-                    1.0, Q.data(), ldq, Q.data(), ldq, 0.0, QQ.data(), ldqq);
-    if (!rel_id_err_check(m, QQ, ldqq)) {
-        test_log::lout << "Orthogonality check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-template <typename fp>
-bool check_gerqf_accuracy(const std::vector<fp>& A, const std::vector<fp>& A_initial,
-                          const std::vector<fp>& tau, int64_t m, int64_t n, int64_t lda) {
-    bool result = true;
-
-    /* |A - R Q| < |A| O(eps) */
-    if (m >= n) {
-        std::vector<fp> R(m * n);
-        int64_t ldr = m;
-        reference::lacpy('A', m, n, A.data(), lda, R.data(), ldr);
-        reference::laset(oneapi::mkl::uplo::lower, n - 1, n - 1, 0.0, 0.0,
-                         R.data() + ((m - n + 1) + 0 * ldr), ldr);
-
-        std::vector<fp> Q(lda * n);
-        int64_t ldq = n;
-        reference::lacpy('A', n, n, A.data() + ((m - n) + 0 * lda), lda, Q.data(), ldq);
-
-        auto info =
-            reference::or_un_mrq(oneapi::mkl::side::right, oneapi::mkl::transpose::nontrans, m, n,
-                                 std::min(m, n), Q.data(), ldq, tau.data(), R.data(), ldr);
-        if (0 != info) {
-            test_log::lout << "reference ormqr/unmqr failed with info = " << info << std::endl;
-            return false;
-        }
-        if (!rel_mat_err_check(m, n, R, ldr, A_initial, lda)) {
-            test_log::lout << "Factorization check failed" << std::endl;
-            result = false;
-        }
-    }
-    else {
-        std::vector<fp> R(m * n);
-        int64_t ldr = m;
-        reference::laset(oneapi::mkl::uplo::lower, m, m, 0.0, 0.0, R.data(), ldr);
-        reference::lacpy(oneapi::mkl::uplo::upper, m, m, A.data() + (0 + (n - m) * lda), lda,
-                         R.data() + (0 + (n - m) * ldr), ldr);
-
-        std::vector<fp> Q(n * n);
-        int64_t ldq = n;
-        reference::lacpy('A', m, n, A.data(), lda, Q.data() + (n - m + 0 * ldq), ldq);
-
-        std::vector<fp> tau2(n);
-        for (int64_t i = 0; i < std::min(m, n); i++)
-            tau2[n - m + i] = tau[i];
-        auto info = reference::or_un_mrq(oneapi::mkl::side::right, oneapi::mkl::transpose::nontrans,
-                                         m, n, n, Q.data(), ldq, tau2.data(), R.data(), ldr);
-        if (0 != info) {
-            test_log::lout << "reference ormqr/unmqr failed with info = " << info << std::endl;
-            return false;
-        }
-        if (!rel_mat_err_check(m, n, R, ldr, A_initial, lda)) {
-            test_log::lout << "Factorization check failed" << std::endl;
-            result = false;
-        }
-    }
-
-    /* | I - Q Q' | < n O(eps) */
-    std::vector<fp> Q(std::min(m, n) * n);
-    int64_t ldq = std::min(m, n);
-    if (m <= n)
-        reference::lacpy('A', m, n, A.data(), lda, Q.data(), ldq);
-    else
-        reference::lacpy('A', n, n, A.data() + ((m - n) + 0 * lda), lda, Q.data(), ldq);
-    auto info = reference::or_un_grq(std::min(m, n), n, std::min(m, n), Q.data(), ldq, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference orgqr/ungqr failed with info = " << info << std::endl;
-        return false;
-    }
-
-    std::vector<fp> QQ(std::min(m, n) * std::min(m, n));
-    int64_t ldqq = std::min(m, n);
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans,
-                    std::min(m, n), std::min(m, n), n, 1.0, Q.data(), ldq, Q.data(), ldq, 0.0,
-                    QQ.data(), ldqq);
-
-    if (!rel_id_err_check(std::min(m, n), QQ, ldqq)) {
-        test_log::lout << "Orthogonality check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-template <typename fp>
-bool check_getrf_accuracy(int64_t m, int64_t n, const std::vector<fp>& A, int64_t lda,
-                          const std::vector<int64_t>& ipiv, const std::vector<fp>& A_initial) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    std::vector<fp> residual(m * n);
-
-    /* Compute P L U */
-    reference::laset('A', m, n, 0.0, 0.0, residual.data(), m);
-    if (m < n) {
-        reference::lacpy(oneapi::mkl::uplo::upper, m, n, A.data(), lda, residual.data(), m);
-        reference::trmm(oneapi::mkl::side::left, oneapi::mkl::uplo::lower,
-                        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit, m, n, 1.0,
-                        A.data(), lda, residual.data(), m);
-    }
-    else {
-        reference::lacpy(oneapi::mkl::uplo::lower, m, n, A.data(), lda, residual.data(), m);
-        for (int64_t diag = 0; diag < n; diag++)
-            residual[diag + diag * m] = 1.0;
-        reference::trmm(oneapi::mkl::side::right, oneapi::mkl::uplo::upper,
-                        oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, m, n, 1.0,
-                        A.data(), lda, residual.data(), m);
-    }
-    reference::laswp(n, residual.data(), m, 1, std::min(m, n), ipiv.data(), -1);
-
-    /* Compute | L U - A | / ( |A| min(m,n) ulp ) */
-    for (int64_t col = 0; col < n; col++) {
-        for (int64_t row = 0; row < m; row++) {
-            residual[row + col * m] -= A_initial[row + col * lda];
-        }
-    }
-    auto norm_residual = reference::lange('1', m, n, residual.data(), m);
-    auto norm_A = reference::lange('1', m, n, A_initial.data(), lda);
-    auto ulp = reference::lamch<fp_real>('P');
-    auto denom = norm_A * std::min(m, n) * ulp;
-    denom = denom > 0.0 ? denom : ulp;
-    auto rel_err = norm_residual / denom;
-
-    fp_real threshold = 30.0;
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "| L * U - A | / ( |A| * min(m,n) * ulp ) = |%e| / (|%e| %d * %e) = %e",
-                 norm_residual, norm_A, static_cast<int>(std::min(m, n)), ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-
-    return result;
-}
-
-template <typename fp>
-bool check_getri_accuracy(int64_t n, std::vector<fp> A, int64_t lda, std::vector<int64_t>& ipiv,
-                          const std::vector<fp>& A_initial) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    // Norms of original matrix A matrix and inv(A) for error analysis
-    fp_real norm_A = reference::lange('1', n, n, A_initial.data(), lda);
-    fp_real norm_invA = reference::lange('1', n, n, A.data(), lda);
-    fp_real ulp = reference::lamch<fp_real>('P');
-    std::vector<fp> residual(n * n + n);
-    fp_real threshold = 30.0;
-
-    /* denom = ( |A| * |inv(A)| * n * ulp )  */
-    fp_real denom = n * ulp * norm_A * norm_invA;
-    denom = denom > 0.0 ? denom : ulp;
-
-    /* Compute | I - inv(A)*A |. Store in residual array */
-    reference::laset('A', n, n, 0.0, 1.0, residual.data(), n);
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n, n, n,
-                    -1.0, A.data(), lda, A_initial.data(), lda, 1.0, residual.data(), n);
-
-    /* | I - inv(A)*A | / ( |A| * |inv(A)| * n * ulp ) */
-    fp_real norm_residual = reference::lange('1', n, n, residual.data(), n);
-    auto rel_err = norm_residual / denom;
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "| I - inv(A) A | / ( |A| |inv(A)| n ulp ) = |%e| / ( |%e| |%e| %d * %e ) = %e",
-                 norm_residual, norm_A, norm_invA, static_cast<int>(n), ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-
-    /* Compute | I - A*inv(A) |. Store in residual */
-    reference::laset('A', n, n, 0.0, 1.0, residual.data(), n);
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n, n, n,
-                    -1.0, A_initial.data(), lda, A.data(), lda, 1.0, residual.data(), n);
-
-    /* | I - A*inv(A) | / ( |A| * |inv(A)| * n * ulp ) */
-    norm_residual = reference::lange('1', n, n, residual.data(), n);
-    rel_err = norm_residual / denom;
-    result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "| I - inv(A) A | / ( |A| |inv(A)| n ulp ) = |%e | / ( |%e| |%e| %d * %e) = %e",
-                 norm_residual, norm_A, norm_invA, static_cast<int>(n), ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-
-    return result;
-}
-
-template <typename fp>
-bool check_getrs_accuracy(oneapi::mkl::transpose transa, int64_t n, int64_t nrhs,
-                          const std::vector<fp>& B, int64_t ldb, const std::vector<fp>& A_initial,
-                          int64_t lda, std::vector<fp> B_initial) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    // Compute A*X - B. Store result in B_initial
-    reference::gemm(transa, oneapi::mkl::transpose::nontrans, n, nrhs, n, -1.0, A_initial.data(),
-                    lda, B.data(), ldb, 1.0, B_initial.data(), ldb);
-
-    // Compute norm residual |A*X - B|
-    fp_real norm_residual = reference::lange('1', n, nrhs, B_initial.data(), ldb);
-
-    // Norms of original matrix A matrix and solution matrix B for error analysis
-    fp_real norm_A = reference::lange('1', n, n, A_initial.data(), lda);
-    fp_real norm_B = reference::lange('1', n, nrhs, B.data(), ldb);
-    fp_real ulp = reference::lamch<fp_real>('P');
-    fp_real denom = n * ulp * norm_A * norm_B;
-    denom = denom > 0.0 ? denom : ulp;
-    auto rel_err = norm_residual / denom;
-
-    fp_real threshold = 30.0;
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "| AX - B | / ( |A| |X| n ulp ) = |%e| / ( |%e| |%e| %d * %e ) = %e",
-                 norm_residual, norm_A, norm_B, static_cast<int>(n), ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-
-    return result;
-}
-
-template <typename fp>
-bool check_or_un_gbr_accuracy(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k,
-                              const std::vector<fp>& Q, int64_t ldq) {
-    bool result = true;
-
-    if (vect == oneapi::mkl::generate::Q) {
-        int64_t rows_Q = m;
-        int64_t cols_Q = (m >= k) ? n : m;
-
-        /* | I - Q'Q | < m O(eps) */
-        std::vector<fp> QQ(cols_Q * cols_Q);
-        int64_t ldqq = cols_Q;
-        reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, cols_Q,
-                        cols_Q, rows_Q, 1.0, Q.data(), ldq, Q.data(), ldq, 0.0, QQ.data(), ldqq);
-        if (!rel_id_err_check(cols_Q, QQ, ldqq)) {
-            test_log::lout << "Q Orthogonality check failed" << std::endl;
-            result = false;
-        }
-    }
-    else { /* vect == oneapi::mkl::generate::P */
-        auto& P = Q;
-        auto& ldp = ldq;
-        int64_t rows_P = (k < n) ? m : n;
-        int64_t cols_P = n;
-
-        /* | I - (P')(P')' | < m O(eps) */
-        std::vector<fp> PP(rows_P * rows_P);
-        int64_t ldpp = rows_P;
-        reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, rows_P,
-                        rows_P, cols_P, 1.0, P.data(), ldp, P.data(), ldp, 0.0, PP.data(), ldpp);
-        if (!rel_id_err_check(rows_P, PP, ldpp)) {
-            test_log::lout << "P^t Orthogonality check failed" << std::endl;
-            result = false;
-        }
-    }
-    return result;
-}
-
-template <typename fp>
-bool check_or_un_gqr_accuracy(int64_t m, int64_t n, const std::vector<fp>& Q, int64_t ldq) {
-    bool result = true;
-
-    /* | I - Q'Q | < m O(eps) */
-    std::vector<fp> QQ(n * n);
-    int64_t ldqq = n;
-    reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n, n, m,
-                    1.0, Q.data(), ldq, Q.data(), ldq, 0.0, QQ.data(), ldqq);
-    if (!rel_id_err_check(n, QQ, n)) {
-        test_log::lout << "Orthogonality check failed" << std::endl;
-        result = false;
-    }
-    return result;
-}
-
-template <typename fp>
-bool check_or_un_gtr_accuracy(int64_t n, const std::vector<fp>& Q, int64_t ldq) {
-    bool result = true;
-
-    /* | I - Q'Q | < m O(eps) */
-    std::vector<fp> QQ(n * n);
-    int64_t ldqq = n;
-    reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n, n, n,
-                    1.0, Q.data(), ldq, Q.data(), ldq, 0.0, QQ.data(), ldqq);
-    if (!rel_id_err_check(n, QQ, n)) {
-        test_log::lout << "Orthogonality check failed" << std::endl;
-        result = false;
-    }
-    return result;
-}
-
-template <typename fp>
-bool check_potrf_accuracy(const std::vector<fp>& init, const std::vector<fp>& sol,
-                          oneapi::mkl::uplo uplo, int64_t n, int64_t lda) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    std::vector<fp> ref(init);
-    reference::potrf(uplo, n, ref.data(), lda);
-
-    fp_real eps = reference::lamch<fp_real>('e');
-    fp_real error, max_error = 0;
-    bool lower =
-        (uplo == oneapi::mkl::uplo::
-                     upper); // lower for row-major (which is this source) is upper for column major
-    bool result = true;
-    // Check solution values are inside allowed error bounds derived in:
-    //   J. W. Demmel, On floating point errors in Cholesky, LAPACK Working Note 14 CS-89–87,
-    //   Department of Computer Science, University of Tennessee, Knoxville, TN, USA, 1989.
-    for (int64_t i = 0; i < n; i++) {
-        for (int64_t j = 0; j < i + 1; j++) {
-            fp exact = lower ? ref[i * lda + j] : ref[j * lda + i];
-            fp solve = lower ? sol[i * lda + j] : sol[j * lda + i];
-            error = std::abs(solve - exact);
-            if (error > ((n + 1) * eps / (1 - (n + 1) * eps)) *
-                            std::sqrt(std::abs(init[i * lda + i] * init[j * lda + j]))) {
-                result = false;
-            }
-            if (error > max_error)
-                max_error = error;
-        }
-    }
-    if (!result)
-        test_log::lout << "Tolerance exceded, max_error = " << max_error << std::endl;
-
-    return result;
-}
-
-template <typename fp>
-bool check_potrs_accuracy(oneapi::mkl::uplo uplo, int64_t n, int64_t nrhs, const std::vector<fp>& B,
-                          int64_t ldb, std::vector<fp> A_initial, int64_t lda,
-                          std::vector<fp> B_initial) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    hermitian_to_full(uplo, n, A_initial, lda);
-    // Compute A*X - B. Store result in B_initial
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n, nrhs, n,
-                    -1.0, A_initial.data(), lda, B.data(), ldb, 1.0, B_initial.data(), ldb);
-
-    // Compute norm residual |A*X - B|
-    fp_real norm_residual = reference::lange('1', n, nrhs, B_initial.data(), ldb);
-
-    // Norms of original matrix A matrix and solution matrix B for error analysis
-    fp_real norm_A = reference::lange('1', n, n, A_initial.data(), lda);
-    fp_real norm_B = reference::lange('1', n, nrhs, B.data(), ldb);
-    fp_real ulp = reference::lamch<fp_real>('P');
-    fp_real denom = n * ulp * norm_A * norm_B;
-    denom = denom > 0.0 ? denom : ulp;
-    auto rel_err = norm_residual / denom;
-
-    fp_real threshold = 30.0;
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "| AX - B | / ( |A| |X| n ulp ) = |%e| / ( |%e| |%e| %d * %e ) = %e",
-                 norm_residual, norm_A, norm_B, static_cast<int>(n), ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-
-    return result;
-}
-
-template <typename fp>
-bool check_sy_he_evd_accuracy(oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, int64_t n,
-                              const std::vector<fp>& A, int64_t lda,
-                              const std::vector<typename complex_info<fp>::real_type>& w,
-                              std::vector<fp> A_initial) {
-    using fp_real = typename complex_info<fp>::real_type;
-    const auto& Z = A;
-    auto ldz = lda;
-    const auto& D = w;
-    hermitian_to_full(uplo, n, A_initial, lda);
-    bool result = true;
-
-    /* |D_ref - D| < |D_ref| O(eps) */
-    std::vector<fp_real> D_ref(n);
-
-    if constexpr (complex_info<fp>::is_complex)
-        reference::heevd(oneapi::mkl::job::novec, uplo, n, std::vector<fp>(A_initial).data(), lda,
-                         D_ref.data());
-    else
-        reference::syevd(oneapi::mkl::job::novec, uplo, n, std::vector<fp>(A_initial).data(), lda,
-                         D_ref.data());
-
-    if (!rel_vec_err_check(n, D_ref, D, 10.0)) {
-        test_log::lout << "Eigenvalue check failed" << std::endl;
-        result = false;
-    }
-
-    if (oneapi::mkl::job::vec == jobz) {
-        /* |A - Z D Z'| < |A| O(eps) */
-        std::vector<fp> ZD(n * n);
-        int64_t ldzd = n;
-        std::vector<fp> ZDZ(n * n);
-        int64_t ldzdz = n;
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = 0; row < n; row++)
-                ZD[row + col * ldzd] = Z[row + col * ldz] * D[col];
-        reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, n, n,
-                        n, 1.0, ZD.data(), ldzd, Z.data(), ldz, 0.0, ZDZ.data(), ldzdz);
-
-        if (!rel_mat_err_check(n, n, A_initial, lda, ZDZ, ldzdz)) {
-            test_log::lout << "Factorization check failed" << std::endl;
-            result = false;
-        }
-
-        /* |I - Z Z'| < n O(eps) */
-        std::vector<fp> ZZ(n * n);
-        int64_t ldzz = n;
-        reference::sy_he_rk(oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, n, n, 1.0,
-                            Z.data(), ldz, 0.0, ZZ.data(), ldzz);
-        hermitian_to_full(oneapi::mkl::uplo::upper, n, ZZ, ldzz);
-        if (!rel_id_err_check(n, ZZ, ldzz)) {
-            test_log::lout << "Orthogonality check failed" << std::endl;
-            result = false;
-        }
-    }
-    return result;
-}
-
-template <typename fp>
-bool check_trtrs_accuracy(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                          oneapi::mkl::diag diag, int64_t n, int64_t nrhs, std::vector<fp> A,
-                          int64_t lda, const std::vector<fp>& B, int64_t ldb,
-                          const std::vector<fp>& B_initial) {
-    using fp_real = typename complex_info<fp>::real_type;
-    fp_real threshold = 10.0;
-
-    /* |A x - b| = |A (x-x_0)| < |A| |x-x0| < |A| |x| cond(A) O(eps) */
-    if (diag == oneapi::mkl::diag::unit)
-        for (int64_t d = 0; d < n; d++)
-            A[d + d * lda] = 1.0;
-    if (uplo == oneapi::mkl::uplo::upper)
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = col + 1; row < n; row++)
-                A[row + col * lda] = 0.0;
-    else
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = 0; row < col; row++)
-                A[row + col * lda] = 0.0;
-
-    auto norm_A = reference::lange('I', n, n, A.data(), lda);
-    auto norm_x = reference::lange('I', n, nrhs, B.data(), ldb);
-
-    fp_real cond_A;
-    if (diag == oneapi::mkl::diag::unit)
-        cond_A = 1.0;
-    else {
-        fp_real min = std::abs(A[0]);
-        fp_real max = std::abs(A[0]);
-        for (int64_t d = 0; d < n; d++) {
-            auto val = std::abs(A[d + d * lda]);
-            min = (val < min) ? val : min;
-            max = (val > max) ? val : max;
-        }
-        cond_A = max / min;
-    }
-
-    auto ulp = reference::lamch<fp_real>('P');
-    auto denom = norm_A * norm_x * cond_A * ulp;
-    denom = denom > 0.0 ? denom : ulp;
-
-    std::vector<fp> residual(n * nrhs);
-    int64_t ldr = n;
-    reference::gemm(trans, oneapi::mkl::transpose::nontrans, n, nrhs, n, 1.0, A.data(), lda,
-                    B.data(), ldb, 0.0, residual.data(), ldr);
-    for (int64_t col = 0; col < nrhs; col++)
-        for (int64_t row = 0; row < n; row++)
-            residual[row + col * ldr] -= B_initial[row + col * ldb];
-
-    auto norm_residual = reference::lange('I', n, nrhs, residual.data(), ldr);
-    auto rel_err = norm_residual / denom;
-
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "|Ax - b| / (|A| |x| cond(A) eps) = |%e| / (|%e| |%e| %e * %e) = %e",
-                 norm_residual, norm_A, norm_x, cond_A, ulp, rel_err);
-        test_log::lout << test_log::buffer.data() << std::endl;
-        test_log::lout << "threshold = " << threshold << std::endl;
-        test_log::lout << "Solve check failed" << std::endl;
-    }
-
-    return result;
-}
diff --git a/tests/unit_tests/lapack/include/lapack_common.hpp b/tests/unit_tests/lapack/include/lapack_common.hpp
deleted file mode 100644
index 1cebb7553..000000000
--- a/tests/unit_tests/lapack/include/lapack_common.hpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <random>
-#include <sstream>
-#include <stdexcept>
-#include <type_traits>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/types.hpp"
-
-namespace test_log {
-
-extern std::stringstream lout;
-extern std::array<char, 1024> buffer;
-extern std::string padding;
-void print();
-
-} // namespace test_log
-
-inline void print_device_info(const sycl::device& device) {
-    sycl::platform platform = device.get_platform();
-    std::cout << test_log::padding << std::endl;
-    std::cout << test_log::padding << "Device Info" << std::endl;
-    std::cout << test_log::padding << "name : " << device.get_info<sycl::info::device::name>()
-              << std::endl;
-    std::cout << test_log::padding
-              << "driver version : " << device.get_info<sycl::info::device::driver_version>()
-              << std::endl;
-    std::cout << test_log::padding
-              << "platform : " << platform.get_info<sycl::info::platform::name>() << std::endl;
-    std::cout << test_log::padding
-              << "platform version : " << platform.get_info<sycl::info::platform::version>()
-              << std::endl;
-    std::cout << test_log::padding
-              << "vendor : " << platform.get_info<sycl::info::platform::vendor>() << std::endl;
-    std::cout << test_log::padding << std::endl;
-}
-
-inline void async_error_handler(sycl::exception_list exceptions) {
-    if (exceptions.size()) {
-        for (auto const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (std::exception const& e) {
-                test_log::lout << e.what() << std::endl;
-            }
-        }
-        std::string message{ std::to_string(exceptions.size()) +
-                             " exception(s) caught during asynchronous operation" };
-        throw std::runtime_error(message);
-    }
-}
-
-template <typename T>
-struct complex_info {
-    using real_type = T;
-    static const bool is_complex = false;
-};
-
-template <typename T>
-struct complex_info<std::complex<T>> {
-    using real_type = T;
-    static const bool is_complex = true;
-};
-
-template <typename fp>
-fp get_real(fp val) {
-    return val;
-}
-template <typename fp>
-std::complex<fp> get_real(std::complex<fp> val) {
-    return val.real();
-}
-
-template <typename fp>
-fp get_conj(fp val) {
-    return val;
-}
-template <typename fp>
-std::complex<fp> get_conj(std::complex<fp> val) {
-    return std::conj(val);
-}
-
-template <typename fp>
-fp rand_scalar(uint64_t& seed) {
-    std::minstd_rand rng(seed);
-    seed = rng();
-    return 2 * (static_cast<fp>(seed) / static_cast<fp>(rng.max())) - 0.0;
-}
-template <>
-inline std::complex<float> rand_scalar(uint64_t& seed) {
-    return std::complex<float>(rand_scalar<float>(seed), rand_scalar<float>(seed));
-}
-template <>
-inline std::complex<double> rand_scalar(uint64_t& seed) {
-    return std::complex<double>(rand_scalar<double>(seed), rand_scalar<double>(seed));
-}
-
-template <typename fp>
-void rand_matrix(uint64_t& seed, oneapi::mkl::transpose trans, int64_t m, int64_t n,
-                 std::vector<fp>& M, int64_t ld, int64_t offset = 0) {
-    if (trans == oneapi::mkl::transpose::nontrans)
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = 0; row < m; row++)
-                M[offset + row + col * ld] = rand_scalar<fp>(seed);
-    else
-        for (int64_t row = 0; row < m; row++)
-            for (int64_t col = 0; col < n; col++)
-                M[offset + col + row * ld] = rand_scalar<fp>(seed);
-}
-
-template <typename fp>
-void rand_matrix_diag_dom(uint64_t& seed, oneapi::mkl::transpose trans, int64_t m, int64_t n,
-                          std::vector<fp>& M, int64_t ld, int64_t offset = 0) {
-    using fp_real = typename complex_info<fp>::real_type;
-    int64_t minsh;
-    minsh = std::min(m, n);
-    if (trans == oneapi::mkl::transpose::nontrans)
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = 0; row < m; row++) {
-                M[offset + row + col * ld] = rand_scalar<fp>(seed);
-                if (row == col)
-                    M[offset + row + col * ld] += static_cast<fp_real>(minsh);
-            }
-    else
-        for (int64_t row = 0; row < m; row++)
-            for (int64_t col = 0; col < n; col++) {
-                M[offset + col + row * ld] = rand_scalar<fp>(seed);
-                if (row == col)
-                    M[offset + col + row * ld] += static_cast<fp_real>(minsh);
-            }
-}
-
-template <typename fp>
-void rand_symmetric_matrix(uint64_t& seed, oneapi::mkl::uplo uplo, int64_t n, std::vector<fp>& M,
-                           int64_t ld, int64_t offset = 0) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    if (uplo == oneapi::mkl::uplo::upper)
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = 0; row <= col; row++)
-                M[offset + row + col * ld] = rand_scalar<fp>(seed);
-    else
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = col; row < n; row++)
-                M[offset + row + col * ld] = rand_scalar<fp>(seed);
-}
-
-template <typename fp>
-void rand_hermitian_matrix(uint64_t& seed, oneapi::mkl::uplo uplo, int64_t n, std::vector<fp>& M,
-                           int64_t ld, int64_t offset = 0) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    rand_symmetric_matrix(seed, uplo, n, M, ld, offset);
-    for (int64_t diag = 0; diag < n; diag++)
-        M[offset + diag + diag * ld] = rand_scalar<fp_real>(seed);
-}
-
-template <typename fp>
-void rand_pos_def_matrix(uint64_t& seed, oneapi::mkl::uplo uplo, int64_t n, std::vector<fp>& M,
-                         int64_t ld, int64_t offset = 0) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    rand_hermitian_matrix(seed, uplo, n, M, ld, offset);
-    for (int64_t diag = 0; diag < n; diag++)
-        M[offset + diag + diag * ld] += static_cast<fp_real>(n);
-    return;
-}
-
-template <typename fp>
-void symmetric_to_full(oneapi::mkl::uplo uplo, int64_t n, std::vector<fp>& A, int64_t lda) {
-    if (oneapi::mkl::uplo::upper == uplo)
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = col + 1; row < n; row++)
-                A[row + col * lda] = A[col + row * lda];
-    else
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = 0; row < col; row++)
-                A[row + col * lda] = A[col + row * lda];
-    return;
-}
-
-template <typename fp>
-void hermitian_to_full(oneapi::mkl::uplo uplo, int64_t n, std::vector<fp>& A, int64_t lda) {
-    for (int64_t diag = 0; diag < n; diag++)
-        A[diag + diag * lda] = get_real(A[diag + diag * lda]);
-    if (oneapi::mkl::uplo::upper == uplo)
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = col + 1; row < n; row++)
-                A[row + col * lda] = get_conj(A[col + row * lda]);
-    else
-        for (int64_t col = 0; col < n; col++)
-            for (int64_t row = 0; row < col; row++)
-                A[row + col * lda] = get_conj(A[col + row * lda]);
-    return;
-}
-
-template <typename T>
-std::vector<T> copy_vector(const std::vector<T>& vec, int64_t count, int64_t offset) {
-    return std::vector<T>(vec.begin() + offset, vec.begin() + offset + count);
-}
-
-template <typename buffer_T>
-struct is_buf {
-    static constexpr bool value{ false };
-};
-template <typename T, int dimensions, typename AllocatorT>
-struct is_buf<sycl::buffer<T, dimensions, AllocatorT, void>> {
-    static constexpr bool value{ true };
-};
-
-template <typename data_T>
-using is_buffer_type = typename std::enable_if<is_buf<data_T>::value>::type*;
-template <typename data_T>
-using is_not_buffer_type = typename std::enable_if<!is_buf<data_T>::value>::type*;
-
-template <typename data_T, typename = void*>
-struct data_T_info {};
-template <typename data_T>
-struct data_T_info<data_T, is_buffer_type<data_T>> {
-    using value_type = typename data_T::value_type;
-};
-template <typename data_T>
-struct data_T_info<data_T, is_not_buffer_type<data_T>> {
-    using value_type = data_T;
-};
-
-template <typename data_T, typename T = typename data_T::value_type,
-          is_buffer_type<data_T> = nullptr>
-sycl::buffer<T, 1> device_alloc(sycl::queue queue, size_t count, size_t alignment = 4096) {
-    sycl::buffer<T, 1> buf{ sycl::range<1>(count) };
-    return buf;
-}
-template <typename data_T, typename T = data_T, is_not_buffer_type<data_T> = nullptr>
-T* device_alloc(sycl::queue queue, size_t count, size_t alignment = 4096) {
-    T* dev_ptr = (T*)sycl::malloc_device(count * sizeof(T), queue);
-    return dev_ptr;
-}
-
-template <typename data_T, is_buffer_type<data_T> = nullptr>
-void device_free(sycl::queue queue, data_T buf) {}
-template <typename data_T, is_not_buffer_type<data_T> = nullptr>
-void device_free(sycl::queue queue, data_T* dev_ptr) {
-    const sycl::context ctx = queue.get_context();
-    sycl::free(dev_ptr, ctx);
-}
-
-template <typename data_T, is_buffer_type<data_T> = nullptr>
-void host_to_device_copy(sycl::queue queue, typename data_T::value_type* source, data_T dest,
-                         size_t count) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto dest_accessor =
-            dest.template get_access<sycl::access::mode::discard_write>(cgh, sycl::range<1>(count));
-        cgh.copy(source, dest_accessor);
-    });
-}
-template <typename data_T, is_not_buffer_type<data_T> = nullptr>
-sycl::event host_to_device_copy(sycl::queue queue, data_T* source, data_T* dest, size_t count) {
-    return queue.memcpy(dest, source, count * sizeof(data_T));
-}
-
-template <typename data_T, is_buffer_type<data_T> = nullptr>
-void device_to_host_copy(sycl::queue queue, data_T source, typename data_T::value_type* dest,
-                         size_t count) {
-    queue.submit([&](sycl::handler& cgh) {
-        auto source_accessor =
-            source.template get_access<sycl::access::mode::read>(cgh, sycl::range<1>(count));
-        cgh.copy(source_accessor, dest);
-    });
-}
-template <typename data_T, is_not_buffer_type<data_T> = nullptr>
-sycl::event device_to_host_copy(sycl::queue queue, data_T* source, data_T* dest, size_t count) {
-    return queue.memcpy(dest, source, count * sizeof(data_T));
-}
-
-sycl::event create_dependency(sycl::queue queue);
-bool check_dependency(sycl::queue, sycl::event in_event, sycl::event func_event);
diff --git a/tests/unit_tests/lapack/include/lapack_gtest_suite.hpp b/tests/unit_tests/lapack/include/lapack_gtest_suite.hpp
deleted file mode 100644
index 41e349d7c..000000000
--- a/tests/unit_tests/lapack/include/lapack_gtest_suite.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-using RealSinglePrecisionBuffer = sycl::buffer<float, 1>;
-using RealDoublePrecisionBuffer = sycl::buffer<double, 1>;
-using ComplexSinglePrecisionBuffer = sycl::buffer<std::complex<float>, 1>;
-using ComplexDoublePrecisionBuffer = sycl::buffer<std::complex<double>, 1>;
-using RealSinglePrecisionUsm = float;
-using RealDoublePrecisionUsm = double;
-using ComplexSinglePrecisionUsm = std::complex<float>;
-using ComplexDoublePrecisionUsm = std::complex<double>;
-
-#define CREATE_TEST_CLASS(SUITE, TEST) \
-    class SUITE##TEST : public ::testing::TestWithParam<sycl::device*> {}
-
-#define INSTANTIATE_TEST_CLASS(SUITE, TEST) \
-    INSTANTIATE_TEST_SUITE_P(SUITE, SUITE##TEST, ::testing::ValuesIn(devices), DeviceNamePrint());
-
-#define INSTANTIATE_GTEST_SUITE_ACCURACY(SUITE) \
-    CREATE_TEST_CLASS(SUITE, AccuracyUsm);      \
-    DEFINE_TEST_ACCURACY_USM_REAL(SUITE);       \
-    DEFINE_TEST_ACCURACY_USM_COMPLEX(SUITE);    \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyUsm); \
-    CREATE_TEST_CLASS(SUITE, AccuracyBuffer);   \
-    DEFINE_TEST_ACCURACY_BUFFER_REAL(SUITE);    \
-    DEFINE_TEST_ACCURACY_BUFFER_COMPLEX(SUITE); \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyBuffer)
-
-#define INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(SUITE) \
-    CREATE_TEST_CLASS(SUITE, AccuracyUsm);           \
-    DEFINE_TEST_ACCURACY_USM_REAL(SUITE);            \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyUsm);      \
-    CREATE_TEST_CLASS(SUITE, AccuracyBuffer);        \
-    DEFINE_TEST_ACCURACY_BUFFER_REAL(SUITE);         \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyBuffer)
-
-#define INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(SUITE) \
-    CREATE_TEST_CLASS(SUITE, AccuracyUsm);              \
-    DEFINE_TEST_ACCURACY_USM_COMPLEX(SUITE);            \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyUsm);         \
-    CREATE_TEST_CLASS(SUITE, AccuracyBuffer);           \
-    DEFINE_TEST_ACCURACY_BUFFER_COMPLEX(SUITE);         \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyBuffer)
-
-#define INSTANTIATE_GTEST_SUITE_ACCURACY_USM(SUITE) \
-    CREATE_TEST_CLASS(SUITE, AccuracyUsm);          \
-    DEFINE_TEST_ACCURACY_USM_REAL(SUITE);           \
-    DEFINE_TEST_ACCURACY_USM_COMPLEX(SUITE);        \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyUsm)
-
-#define INSTANTIATE_GTEST_SUITE_ACCURACY_USM_REAL(SUITE) \
-    CREATE_TEST_CLASS(SUITE, AccuracyUsm);               \
-    DEFINE_TEST_ACCURACY_USM_REAL(SUITE);                \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyUsm)
-
-#define INSTANTIATE_GTEST_SUITE_ACCURACY_USM_COMPLEX(SUITE) \
-    CREATE_TEST_CLASS(SUITE, AccuracyUsm);                  \
-    DEFINE_TEST_ACCURACY_USM_COMPLEX(SUITE);                \
-    INSTANTIATE_TEST_CLASS(SUITE, AccuracyUsm)
-
-#define DEFINE_TEST_ACCURACY_USM_REAL(SUITE)                                                   \
-    TEST_P(SUITE##AccuracyUsm, RealSinglePrecision) {                                          \
-        test_log::padding = "[          ] ";                                                   \
-        EXPECT_TRUE(accuracy_controller.run(::accuracy<RealSinglePrecisionUsm>, *GetParam())); \
-    }                                                                                          \
-    TEST_P(SUITE##AccuracyUsm, RealDoublePrecision) {                                          \
-        CHECK_DOUBLE_ON_DEVICE(GetParam());                                                    \
-        test_log::padding = "[          ] ";                                                   \
-        EXPECT_TRUE(accuracy_controller.run(::accuracy<RealDoublePrecisionUsm>, *GetParam())); \
-    }
-
-#define DEFINE_TEST_ACCURACY_USM_COMPLEX(SUITE)                                                   \
-    TEST_P(SUITE##AccuracyUsm, ComplexSinglePrecision) {                                          \
-        test_log::padding = "[          ] ";                                                      \
-        EXPECT_TRUE(accuracy_controller.run(::accuracy<ComplexSinglePrecisionUsm>, *GetParam())); \
-    }                                                                                             \
-    TEST_P(SUITE##AccuracyUsm, ComplexDoublePrecision) {                                          \
-        CHECK_DOUBLE_ON_DEVICE(GetParam());                                                       \
-        test_log::padding = "[          ] ";                                                      \
-        EXPECT_TRUE(accuracy_controller.run(::accuracy<ComplexDoublePrecisionUsm>, *GetParam())); \
-    }
-
-#define DEFINE_TEST_ACCURACY_BUFFER_REAL(SUITE)                                                   \
-    TEST_P(SUITE##AccuracyBuffer, RealSinglePrecision) {                                          \
-        test_log::padding = "[          ] ";                                                      \
-        EXPECT_TRUE(accuracy_controller.run(::accuracy<RealSinglePrecisionBuffer>, *GetParam())); \
-    }                                                                                             \
-    TEST_P(SUITE##AccuracyBuffer, RealDoublePrecision) {                                          \
-        CHECK_DOUBLE_ON_DEVICE(GetParam());                                                       \
-        test_log::padding = "[          ] ";                                                      \
-        EXPECT_TRUE(accuracy_controller.run(::accuracy<RealDoublePrecisionBuffer>, *GetParam())); \
-    }
-
-#define DEFINE_TEST_ACCURACY_BUFFER_COMPLEX(SUITE)                                           \
-    TEST_P(SUITE##AccuracyBuffer, ComplexSinglePrecision) {                                  \
-        test_log::padding = "[          ] ";                                                 \
-        EXPECT_TRUE(                                                                         \
-            accuracy_controller.run(::accuracy<ComplexSinglePrecisionBuffer>, *GetParam())); \
-    }                                                                                        \
-    TEST_P(SUITE##AccuracyBuffer, ComplexDoublePrecision) {                                  \
-        CHECK_DOUBLE_ON_DEVICE(GetParam());                                                  \
-        test_log::padding = "[          ] ";                                                 \
-        EXPECT_TRUE(                                                                         \
-            accuracy_controller.run(::accuracy<ComplexDoublePrecisionBuffer>, *GetParam())); \
-    }
-
-#define INSTANTIATE_GTEST_SUITE_DEPENDENCY(SUITE) \
-    CREATE_TEST_CLASS(SUITE, DependencyUsm);      \
-    DEFINE_TEST_DEPENDENCY_REAL(SUITE);           \
-    DEFINE_TEST_DEPENDENCY_COMPLEX(SUITE);        \
-    INSTANTIATE_TEST_CLASS(SUITE, DependencyUsm)
-
-#define INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(SUITE) \
-    CREATE_TEST_CLASS(SUITE, DependencyUsm);           \
-    DEFINE_TEST_DEPENDENCY_REAL(SUITE);                \
-    INSTANTIATE_TEST_CLASS(SUITE, DependencyUsm);
-
-#define INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(SUITE) \
-    CREATE_TEST_CLASS(SUITE, DependencyUsm);              \
-    DEFINE_TEST_DEPENDENCY_COMPLEX(SUITE);                \
-    INSTANTIATE_TEST_CLASS(SUITE, DependencyUsm);
-
-#define DEFINE_TEST_DEPENDENCY_REAL(SUITE)                                                     \
-    TEST_P(SUITE##DependencyUsm, RealSinglePrecision) {                                        \
-        test_log::padding = "[          ] ";                                                   \
-        EXPECT_TRUE(                                                                           \
-            dependency_controller.run(::usm_dependency<RealSinglePrecisionUsm>, *GetParam())); \
-    }                                                                                          \
-    TEST_P(SUITE##DependencyUsm, RealDoublePrecision) {                                        \
-        CHECK_DOUBLE_ON_DEVICE(GetParam());                                                    \
-        test_log::padding = "[          ] ";                                                   \
-        EXPECT_TRUE(                                                                           \
-            dependency_controller.run(::usm_dependency<RealDoublePrecisionUsm>, *GetParam())); \
-    }
-
-#define DEFINE_TEST_DEPENDENCY_COMPLEX(SUITE)                                                     \
-    TEST_P(SUITE##DependencyUsm, ComplexSinglePrecision) {                                        \
-        test_log::padding = "[          ] ";                                                      \
-        EXPECT_TRUE(                                                                              \
-            dependency_controller.run(::usm_dependency<ComplexSinglePrecisionUsm>, *GetParam())); \
-    }                                                                                             \
-    TEST_P(SUITE##DependencyUsm, ComplexDoublePrecision) {                                        \
-        CHECK_DOUBLE_ON_DEVICE(GetParam());                                                       \
-        test_log::padding = "[          ] ";                                                      \
-        EXPECT_TRUE(                                                                              \
-            dependency_controller.run(::usm_dependency<ComplexDoublePrecisionUsm>, *GetParam())); \
-    }\
diff --git a/tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp b/tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp
deleted file mode 100644
index cb09ec16a..000000000
--- a/tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp
+++ /dev/null
@@ -1,906 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#ifndef lapack_int
-#define lapack_int int64_t
-#endif
-#ifndef CBLAS_INT
-#define CBLAS_INT int64_t
-#endif
-#define WeirdNEC
-extern "C" {
-#ifdef USE_MKLREF
-#include "mkl_cblas.h"
-#include "mkl_lapacke.h"
-#else
-#include "cblas.h"
-#include "lapacke.h"
-#endif
-}
-static_assert(sizeof(lapack_int) == 8);
-static_assert(sizeof(CBLAS_INT) == 8);
-
-namespace reference {
-inline CBLAS_TRANSPOSE cblas_trans(oneapi::mkl::transpose t) {
-    if (t == oneapi::mkl::transpose::nontrans)
-        return CblasNoTrans;
-    if (t == oneapi::mkl::transpose::trans)
-        return CblasTrans;
-    if (t == oneapi::mkl::transpose::conjtrans)
-        return CblasConjTrans;
-    return CblasNoTrans;
-}
-inline CBLAS_UPLO cblas_uplo(oneapi::mkl::uplo u) {
-    if (u == oneapi::mkl::uplo::upper)
-        return CblasUpper;
-    if (u == oneapi::mkl::uplo::lower)
-        return CblasLower;
-    return CblasUpper;
-}
-inline CBLAS_DIAG cblas_diag(oneapi::mkl::diag d) {
-    if (d == oneapi::mkl::diag::nonunit)
-        return CblasNonUnit;
-    if (d == oneapi::mkl::diag::unit)
-        return CblasUnit;
-    return CblasNonUnit;
-}
-inline CBLAS_SIDE cblas_side(const char *c) {
-    return *c == 'R' || *c == 'r' ? CblasRight : CblasLeft;
-}
-inline CBLAS_SIDE cblas_side(oneapi::mkl::side s) {
-    if (s == oneapi::mkl::side::left)
-        return CblasLeft;
-    if (s == oneapi::mkl::side::right)
-        return CblasRight;
-    return CblasLeft;
-}
-inline char to_char(oneapi::mkl::transpose t) {
-    if (t == oneapi::mkl::transpose::nontrans)
-        return 'N';
-    if (t == oneapi::mkl::transpose::trans)
-        return 'T';
-    if (t == oneapi::mkl::transpose::conjtrans)
-        return 'C';
-    return 'N';
-}
-inline char to_char(oneapi::mkl::offset t) {
-    if (t == oneapi::mkl::offset::fix)
-        return 'F';
-    if (t == oneapi::mkl::offset::row)
-        return 'R';
-    if (t == oneapi::mkl::offset::column)
-        return 'C';
-    return 'N';
-}
-
-inline char to_char(oneapi::mkl::uplo u) {
-    if (u == oneapi::mkl::uplo::upper)
-        return 'U';
-    if (u == oneapi::mkl::uplo::lower)
-        return 'L';
-    return 'U';
-}
-
-inline char to_char(oneapi::mkl::diag d) {
-    if (d == oneapi::mkl::diag::nonunit)
-        return 'N';
-    if (d == oneapi::mkl::diag::unit)
-        return 'U';
-    return 'N';
-}
-
-inline char to_char(oneapi::mkl::side s) {
-    if (s == oneapi::mkl::side::left)
-        return 'L';
-    if (s == oneapi::mkl::side::right)
-        return 'R';
-    return 'L';
-}
-
-inline char to_char(oneapi::mkl::job j) {
-    if (j == oneapi::mkl::job::novec)
-        return 'N';
-    if (j == oneapi::mkl::job::vec)
-        return 'V';
-    if (j == oneapi::mkl::job::updatevec)
-        return 'U';
-    if (j == oneapi::mkl::job::allvec)
-        return 'A';
-    if (j == oneapi::mkl::job::somevec)
-        return 'S';
-    if (j == oneapi::mkl::job::overwritevec)
-        return 'O';
-    return 'N';
-}
-inline char to_char(oneapi::mkl::jobsvd j) {
-    if (j == oneapi::mkl::jobsvd::novec)
-        return 'N';
-    if (j == oneapi::mkl::jobsvd::vectors)
-        return 'A';
-    if (j == oneapi::mkl::jobsvd::vectorsina)
-        return 'O';
-    if (j == oneapi::mkl::jobsvd::somevec)
-        return 'S';
-    return 'N';
-}
-inline char to_char(oneapi::mkl::generate v) {
-    if (v == oneapi::mkl::generate::Q)
-        return 'Q';
-    if (v == oneapi::mkl::generate::P)
-        return 'P';
-    return 'Q';
-}
-
-inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n,
-                 int64_t k, float alpha, const float *a, int64_t lda, const float *b, int64_t ldb,
-                 float beta, float *c, int64_t ldc) {
-    cblas_sgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, alpha, a, lda, b,
-                ldb, beta, c, ldc);
-}
-inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n,
-                 int64_t k, double alpha, const double *a, int64_t lda, const double *b,
-                 int64_t ldb, double beta, double *c, int64_t ldc) {
-    cblas_dgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, alpha, a, lda, b,
-                ldb, beta, c, ldc);
-}
-inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n,
-                 int64_t k, std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 const std::complex<float> *b, int64_t ldb, std::complex<float> beta,
-                 std::complex<float> *c, int64_t ldc) {
-    cblas_cgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, (void *)&alpha,
-                (void *)a, lda, (void *)(b), ldb, (void *)&beta, (void *)c, ldc);
-}
-inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n,
-                 int64_t k, std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 const std::complex<double> *b, int64_t ldb, std::complex<double> beta,
-                 std::complex<double> *c, int64_t ldc) {
-    cblas_zgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, (void *)&alpha,
-                (void *)a, lda, (void *)(b), ldb, (void *)&beta, (void *)c, ldc);
-}
-
-inline int64_t syevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda,
-                     float *w) {
-    return LAPACKE_ssyevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n, a, lda, w);
-}
-inline int64_t syevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda,
-                     double *w) {
-    return LAPACKE_dsyevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n, a, lda, w);
-}
-
-inline int64_t sygvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, float *a,
-                     int64_t lda, float *b, int64_t ldb, float *w) {
-    return LAPACKE_ssygvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n, a, lda, b, ldb, w);
-}
-inline int64_t sygvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, double *a,
-                     int64_t lda, double *b, int64_t ldb, double *w) {
-    return LAPACKE_dsygvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n, a, lda, b, ldb, w);
-}
-
-inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, float alpha,
-                 const float *a, int64_t lda, float beta, float *c, int64_t ldc) {
-    cblas_ssyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc);
-}
-inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, double alpha,
-                 const double *a, int64_t lda, double beta, double *c, int64_t ldc) {
-    cblas_dsyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc);
-}
-inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k,
-                 std::complex<float> alpha, const std::complex<float> *a, int64_t lda,
-                 std::complex<float> beta, std::complex<float> *c, int64_t ldc) {
-    cblas_csyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, (void *)&alpha, a, lda,
-                (void *)&beta, (void *)c, ldc);
-}
-inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k,
-                 std::complex<double> alpha, const std::complex<double> *a, int64_t lda,
-                 std::complex<double> beta, std::complex<double> *c, int64_t ldc) {
-    cblas_zsyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, (void *)&alpha, a, lda,
-                (void *)&beta, (void *)c, ldc);
-}
-inline void herk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, float alpha,
-                 const std::complex<float> *a, int64_t lda, float beta, std::complex<float> *c,
-                 int64_t ldc) {
-    cblas_cherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void *)c,
-                ldc);
-}
-inline void herk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, double alpha,
-                 const std::complex<double> *a, int64_t lda, double beta, std::complex<double> *c,
-                 int64_t ldc) {
-    cblas_zherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void *)c,
-                ldc);
-}
-inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k,
-                     float alpha, const float *a, int64_t lda, float beta, float *c, int64_t ldc) {
-    cblas_ssyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc);
-}
-inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k,
-                     double alpha, const double *a, int64_t lda, double beta, double *c,
-                     int64_t ldc) {
-    cblas_dsyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc);
-}
-inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k,
-                     float alpha, const std::complex<float> *a, int64_t lda, float beta,
-                     std::complex<float> *c, int64_t ldc) {
-    cblas_cherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void *)c,
-                ldc);
-}
-inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k,
-                     double alpha, const std::complex<double> *a, int64_t lda, double beta,
-                     std::complex<double> *c, int64_t ldc) {
-    cblas_zherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void *)c,
-                ldc);
-}
-
-inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa,
-                 oneapi::mkl::diag diag, int64_t m, int64_t n, float alpha, const float *a,
-                 int64_t lda, float *b, int64_t ldb) {
-    cblas_strmm(CblasColMajor, cblas_side(side), cblas_uplo(uplo), cblas_trans(transa),
-                cblas_diag(diag), m, n, alpha, a, lda, b, ldb);
-}
-inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa,
-                 oneapi::mkl::diag diag, int64_t m, int64_t n, double alpha, const double *a,
-                 int64_t lda, double *b, int64_t ldb) {
-    cblas_dtrmm(CblasColMajor, cblas_side(side), cblas_uplo(uplo), cblas_trans(transa),
-                cblas_diag(diag), m, n, alpha, a, lda, b, ldb);
-}
-inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa,
-                 oneapi::mkl::diag diag, int64_t m, int64_t n, std::complex<float> alpha,
-                 const std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb) {
-    cblas_ctrmm(CblasColMajor, cblas_side(side), cblas_uplo(uplo), cblas_trans(transa),
-                cblas_diag(diag), m, n, (void *)&alpha, (void *)(a), lda, (void *)(b), ldb);
-}
-inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa,
-                 oneapi::mkl::diag diag, int64_t m, int64_t n, std::complex<double> alpha,
-                 const std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb) {
-    cblas_ztrmm(CblasColMajor, cblas_side(side), cblas_uplo(uplo), cblas_trans(transa),
-                cblas_diag(diag), m, n, (void *)&alpha, (void *)(a), lda, (void *)(b), ldb);
-}
-
-inline void swap(int64_t n, float *X, int64_t incX, float *Y, int64_t incY) {
-    cblas_sswap(n, X, incX, Y, incY);
-}
-inline void swap(int64_t n, double *X, int64_t incX, double *Y, int64_t incY) {
-    cblas_dswap(n, X, incX, Y, incY);
-}
-inline void swap(int64_t n, std::complex<float> *X, int64_t incX, std::complex<float> *Y,
-                 int64_t incY) {
-    cblas_cswap(n, (void *)X, incX, (void *)Y, incY);
-}
-inline void swap(int64_t n, std::complex<double> *X, int64_t incX, std::complex<double> *Y,
-                 int64_t incY) {
-    cblas_zswap(n, (void *)X, incX, (void *)Y, incY);
-}
-
-template <typename fp_real>
-fp_real lamch(char cmach);
-template <>
-inline float lamch(char cmach) {
-    return LAPACKE_slamch(cmach);
-}
-template <>
-inline double lamch(char cmach) {
-    return LAPACKE_dlamch(cmach);
-}
-
-inline float lange(char norm, int64_t m, int64_t n, const std::complex<float> *a, int64_t lda) {
-    return LAPACKE_clange(LAPACK_COL_MAJOR, norm, m, n,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda);
-}
-inline double lange(char norm, int64_t m, int64_t n, const double *a, int64_t lda) {
-    return LAPACKE_dlange(LAPACK_COL_MAJOR, norm, m, n, a, lda);
-}
-inline float lange(char norm, int64_t m, int64_t n, const float *a, int64_t lda) {
-    return LAPACKE_slange(LAPACK_COL_MAJOR, norm, m, n, a, lda);
-}
-inline double lange(char norm, int64_t m, int64_t n, const std::complex<double> *a, int64_t lda) {
-    return LAPACKE_zlange(LAPACK_COL_MAJOR, norm, m, n,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda);
-}
-
-inline float lanhe(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex<float> *a,
-                   int64_t lda) {
-    return LAPACKE_clanhe(LAPACK_COL_MAJOR, norm, to_char(u), n,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda);
-}
-inline double lanhe(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex<double> *a,
-                    int64_t lda) {
-    return LAPACKE_zlanhe(LAPACK_COL_MAJOR, norm, to_char(u), n,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda);
-}
-
-inline float lansy(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex<float> *a,
-                   int64_t lda) {
-    return LAPACKE_clansy(LAPACK_COL_MAJOR, norm, to_char(u), n,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda);
-}
-inline double lansy(char norm, oneapi::mkl::uplo u, int64_t n, const double *a, int64_t lda) {
-    return LAPACKE_dlansy(LAPACK_COL_MAJOR, norm, to_char(u), n, a, lda);
-}
-inline float lansy(char norm, oneapi::mkl::uplo u, int64_t n, const float *a, int64_t lda) {
-    return LAPACKE_slansy(LAPACK_COL_MAJOR, norm, to_char(u), n, a, lda);
-}
-inline double lansy(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex<double> *a,
-                    int64_t lda) {
-    return LAPACKE_zlansy(LAPACK_COL_MAJOR, norm, to_char(u), n,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda);
-}
-
-inline int64_t lacpy(char u, int64_t m, int64_t n, const std::complex<float> *a, int64_t lda,
-                     std::complex<float> *b, int64_t ldb) {
-    return LAPACKE_clacpy(LAPACK_COL_MAJOR, u, m, n,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(b), ldb);
-}
-inline int64_t lacpy(char u, int64_t m, int64_t n, const double *a, int64_t lda, double *b,
-                     int64_t ldb) {
-    return LAPACKE_dlacpy(LAPACK_COL_MAJOR, u, m, n, a, lda, b, ldb);
-}
-inline int64_t lacpy(char u, int64_t m, int64_t n, const float *a, int64_t lda, float *b,
-                     int64_t ldb) {
-    return LAPACKE_slacpy(LAPACK_COL_MAJOR, u, m, n, a, lda, b, ldb);
-}
-inline int64_t lacpy(char u, int64_t m, int64_t n, const std::complex<double> *a, int64_t lda,
-                     std::complex<double> *b, int64_t ldb) {
-    return LAPACKE_zlacpy(LAPACK_COL_MAJOR, u, m, n,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(b), ldb);
-}
-inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const std::complex<float> *a,
-                     int64_t lda, std::complex<float> *b, int64_t ldb) {
-    return LAPACKE_clacpy(LAPACK_COL_MAJOR, to_char(u), m, n,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(b), ldb);
-}
-inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const double *a, int64_t lda,
-                     double *b, int64_t ldb) {
-    return LAPACKE_dlacpy(LAPACK_COL_MAJOR, to_char(u), m, n, a, lda, b, ldb);
-}
-inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const float *a, int64_t lda,
-                     float *b, int64_t ldb) {
-    return LAPACKE_slacpy(LAPACK_COL_MAJOR, to_char(u), m, n, a, lda, b, ldb);
-}
-inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const std::complex<double> *a,
-                     int64_t lda, std::complex<double> *b, int64_t ldb) {
-    return LAPACKE_zlacpy(LAPACK_COL_MAJOR, to_char(u), m, n,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(b), ldb);
-}
-
-inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, std::complex<float> alpha,
-                     std::complex<float> beta, std::complex<float> *a, int64_t lda) {
-    return LAPACKE_claset(LAPACK_COL_MAJOR, to_char(u), m, n,
-                          reinterpret_cast<lapack_complex_float &>(alpha),
-                          reinterpret_cast<lapack_complex_float &>(beta),
-                          reinterpret_cast<lapack_complex_float *>(a), lda);
-}
-inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, double alpha, double beta,
-                     double *a, int64_t lda) {
-    return LAPACKE_dlaset(LAPACK_COL_MAJOR, to_char(u), m, n, alpha, beta, a, lda);
-}
-inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, float alpha, float beta, float *a,
-                     int64_t lda) {
-    return LAPACKE_slaset(LAPACK_COL_MAJOR, to_char(u), m, n, alpha, beta, a, lda);
-}
-inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, std::complex<double> alpha,
-                     std::complex<double> beta, std::complex<double> *a, int64_t lda) {
-    return LAPACKE_zlaset(LAPACK_COL_MAJOR, to_char(u), m, n,
-                          reinterpret_cast<lapack_complex_double &>(alpha),
-                          reinterpret_cast<lapack_complex_double &>(beta),
-                          reinterpret_cast<lapack_complex_double *>(a), lda);
-}
-inline int64_t laset(char u, int64_t m, int64_t n, std::complex<float> alpha,
-                     std::complex<float> beta, std::complex<float> *a, int64_t lda) {
-    return LAPACKE_claset(LAPACK_COL_MAJOR, u, m, n,
-                          reinterpret_cast<lapack_complex_float &>(alpha),
-                          reinterpret_cast<lapack_complex_float &>(beta),
-                          reinterpret_cast<lapack_complex_float *>(a), lda);
-}
-inline int64_t laset(char u, int64_t m, int64_t n, double alpha, double beta, double *a,
-                     int64_t lda) {
-    return LAPACKE_dlaset(LAPACK_COL_MAJOR, u, m, n, alpha, beta, a, lda);
-}
-inline int64_t laset(char u, int64_t m, int64_t n, float alpha, float beta, float *a, int64_t lda) {
-    return LAPACKE_slaset(LAPACK_COL_MAJOR, u, m, n, alpha, beta, a, lda);
-}
-inline int64_t laset(char u, int64_t m, int64_t n, std::complex<double> alpha,
-                     std::complex<double> beta, std::complex<double> *a, int64_t lda) {
-    return LAPACKE_zlaset(LAPACK_COL_MAJOR, u, m, n,
-                          reinterpret_cast<lapack_complex_double &>(alpha),
-                          reinterpret_cast<lapack_complex_double &>(beta),
-                          reinterpret_cast<lapack_complex_double *>(a), lda);
-}
-
-inline int64_t gebrd(int64_t m, int64_t n, std::complex<float> *a, int64_t lda, float *d, float *e,
-                     std::complex<float> *tauq, std::complex<float> *taup) {
-    return LAPACKE_cgebrd(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_float *>(a), lda,
-                          d, e, reinterpret_cast<lapack_complex_float *>(tauq),
-                          reinterpret_cast<lapack_complex_float *>(taup));
-}
-inline int64_t gebrd(int64_t m, int64_t n, double *a, int64_t lda, double *d, double *e,
-                     double *tauq, double *taup) {
-    return LAPACKE_dgebrd(LAPACK_COL_MAJOR, m, n, a, lda, d, e, tauq, taup);
-}
-inline int64_t gebrd(int64_t m, int64_t n, float *a, int64_t lda, float *d, float *e, float *tauq,
-                     float *taup) {
-    return LAPACKE_sgebrd(LAPACK_COL_MAJOR, m, n, a, lda, d, e, tauq, taup);
-}
-inline int64_t gebrd(int64_t m, int64_t n, std::complex<double> *a, int64_t lda, double *d,
-                     double *e, std::complex<double> *tauq, std::complex<double> *taup) {
-    return LAPACKE_zgebrd(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_double *>(a), lda,
-                          d, e, reinterpret_cast<lapack_complex_double *>(tauq),
-                          reinterpret_cast<lapack_complex_double *>(taup));
-}
-
-inline int64_t geqrf(int64_t m, int64_t n, std::complex<float> *a, int64_t lda,
-                     std::complex<float> *tau) {
-    return LAPACKE_cgeqrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(tau));
-}
-inline int64_t geqrf(int64_t m, int64_t n, double *a, int64_t lda, double *tau) {
-    return LAPACKE_dgeqrf(LAPACK_COL_MAJOR, m, n, a, lda, tau);
-}
-inline int64_t geqrf(int64_t m, int64_t n, float *a, int64_t lda, float *tau) {
-    return LAPACKE_sgeqrf(LAPACK_COL_MAJOR, m, n, a, lda, tau);
-}
-inline int64_t geqrf(int64_t m, int64_t n, std::complex<double> *a, int64_t lda,
-                     std::complex<double> *tau) {
-    return LAPACKE_zgeqrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(tau));
-}
-
-inline int64_t gerqf(int64_t m, int64_t n, std::complex<float> *a, int64_t lda,
-                     std::complex<float> *tau) {
-    return LAPACKE_cgerqf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(tau));
-}
-inline int64_t gerqf(int64_t m, int64_t n, double *a, int64_t lda, double *tau) {
-    return LAPACKE_dgerqf(LAPACK_COL_MAJOR, m, n, a, lda, tau);
-}
-inline int64_t gerqf(int64_t m, int64_t n, float *a, int64_t lda, float *tau) {
-    return LAPACKE_sgerqf(LAPACK_COL_MAJOR, m, n, a, lda, tau);
-}
-inline int64_t gerqf(int64_t m, int64_t n, std::complex<double> *a, int64_t lda,
-                     std::complex<double> *tau) {
-    return LAPACKE_zgerqf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(tau));
-}
-
-inline int64_t gesvd(oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n,
-                     std::complex<float> *a, int64_t lda, float *s, std::complex<float> *u,
-                     int64_t ldu, std::complex<float> *vt, int64_t ldvt, float *superb) {
-    return LAPACKE_cgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda, s,
-                          reinterpret_cast<lapack_complex_float *>(u), ldu,
-                          reinterpret_cast<lapack_complex_float *>(vt), ldvt, superb);
-}
-inline int64_t gesvd(oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n,
-                     double *a, int64_t lda, double *s, double *u, int64_t ldu, double *vt,
-                     int64_t ldvt, double *superb) {
-    return LAPACKE_dgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n, a, lda, s, u, ldu,
-                          vt, ldvt, superb);
-}
-inline int64_t gesvd(oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n,
-                     float *a, int64_t lda, float *s, float *u, int64_t ldu, float *vt,
-                     int64_t ldvt, float *superb) {
-    return LAPACKE_sgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n, a, lda, s, u, ldu,
-                          vt, ldvt, superb);
-}
-inline int64_t gesvd(oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n,
-                     std::complex<double> *a, int64_t lda, double *s, std::complex<double> *u,
-                     int64_t ldu, std::complex<double> *vt, int64_t ldvt, double *superb) {
-    return LAPACKE_zgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda, s,
-                          reinterpret_cast<lapack_complex_double *>(u), ldu,
-                          reinterpret_cast<lapack_complex_double *>(vt), ldvt, superb);
-}
-
-inline int64_t getrf(int64_t m, int64_t n, std::complex<float> *a, int64_t lda, int64_t *ipiv) {
-    return LAPACKE_cgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-inline int64_t getrf(int64_t m, int64_t n, double *a, int64_t lda, int64_t *ipiv) {
-    return LAPACKE_dgetrf(LAPACK_COL_MAJOR, m, n, a, lda, reinterpret_cast<lapack_int *>(ipiv));
-}
-inline int64_t getrf(int64_t m, int64_t n, float *a, int64_t lda, int64_t *ipiv) {
-    return LAPACKE_sgetrf(LAPACK_COL_MAJOR, m, n, a, lda, reinterpret_cast<lapack_int *>(ipiv));
-}
-inline int64_t getrf(int64_t m, int64_t n, std::complex<double> *a, int64_t lda, int64_t *ipiv) {
-    return LAPACKE_zgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-
-inline int64_t heevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, std::complex<float> *a,
-                     int64_t lda, float *w) {
-    return LAPACKE_cheevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda, w);
-}
-inline int64_t heevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, std::complex<double> *a,
-                     int64_t lda, double *w) {
-    return LAPACKE_zheevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda, w);
-}
-
-inline int64_t hegvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n,
-                     std::complex<float> *a, int64_t lda, std::complex<float> *b, int64_t ldb,
-                     float *w) {
-    return LAPACKE_chegvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(b), ldb, w);
-}
-inline int64_t hegvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n,
-                     std::complex<double> *a, int64_t lda, std::complex<double> *b, int64_t ldb,
-                     double *w) {
-    return LAPACKE_zhegvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(b), ldb, w);
-}
-
-inline int64_t hetrd(oneapi::mkl::uplo u, int64_t n, std::complex<float> *a, int64_t lda, float *d,
-                     float *e, std::complex<float> *tau) {
-    return LAPACKE_chetrd(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda, d, e,
-                          reinterpret_cast<lapack_complex_float *>(tau));
-}
-inline int64_t hetrd(oneapi::mkl::uplo u, int64_t n, std::complex<double> *a, int64_t lda,
-                     double *d, double *e, std::complex<double> *tau) {
-    return LAPACKE_zhetrd(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda, d, e,
-                          reinterpret_cast<lapack_complex_double *>(tau));
-}
-
-inline int64_t hetrf(oneapi::mkl::uplo u, int64_t n, std::complex<float> *a, int64_t lda,
-                     int64_t *ipiv) {
-    return LAPACKE_chetrf(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-inline int64_t hetrf(oneapi::mkl::uplo u, int64_t n, std::complex<double> *a, int64_t lda,
-                     int64_t *ipiv) {
-    return LAPACKE_zhetrf(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-
-inline int64_t ungtr(oneapi::mkl::uplo u, int64_t n, std::complex<float> *a, int64_t lda,
-                     const std::complex<float> *tau) {
-    return LAPACKE_cungtr(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<const lapack_complex_float *>(tau));
-}
-inline int64_t ungtr(oneapi::mkl::uplo u, int64_t n, std::complex<double> *a, int64_t lda,
-                     const std::complex<double> *tau) {
-    return LAPACKE_zungtr(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<const lapack_complex_double *>(tau));
-}
-
-inline int64_t unmtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                     int64_t m, int64_t n, const std::complex<float> *a, int64_t lda,
-                     const std::complex<float> *tau, std::complex<float> *c, int64_t ldc) {
-    return LAPACKE_cunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda,
-                          reinterpret_cast<const lapack_complex_float *>(tau),
-                          reinterpret_cast<lapack_complex_float *>(c), ldc);
-}
-inline int64_t unmtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                     int64_t m, int64_t n, const std::complex<double> *a, int64_t lda,
-                     const std::complex<double> *tau, std::complex<double> *c, int64_t ldc) {
-    return LAPACKE_zunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda,
-                          reinterpret_cast<const lapack_complex_double *>(tau),
-                          reinterpret_cast<lapack_complex_double *>(c), ldc);
-}
-
-inline int64_t orgtr(oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda, const double *tau) {
-    return LAPACKE_dorgtr(LAPACK_COL_MAJOR, to_char(u), n, a, lda, tau);
-}
-inline int64_t orgtr(oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda, const float *tau) {
-    return LAPACKE_sorgtr(LAPACK_COL_MAJOR, to_char(u), n, a, lda, tau);
-}
-
-inline int64_t ormtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                     int64_t m, int64_t n, float *a, int64_t lda, const float *tau, float *c,
-                     int64_t ldc) {
-    return LAPACKE_sormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda,
-                          tau, c, ldc);
-}
-inline int64_t ormtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                     int64_t m, int64_t n, double *a, int64_t lda, const double *tau, double *c,
-                     int64_t ldc) {
-    return LAPACKE_dormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda,
-                          tau, c, ldc);
-}
-
-inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                         int64_t m, int64_t n, float *a, int64_t lda, const float *tau, float *c,
-                         int64_t ldc) {
-    return LAPACKE_sormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda,
-                          tau, c, ldc);
-}
-inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                         int64_t m, int64_t n, double *a, int64_t lda, const double *tau, double *c,
-                         int64_t ldc) {
-    return LAPACKE_dormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda,
-                          tau, c, ldc);
-}
-inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                         int64_t m, int64_t n, std::complex<float> *a, int64_t lda,
-                         std::complex<float> *tau, std::complex<float> *c, int64_t ldc) {
-    return LAPACKE_cunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(tau),
-                          reinterpret_cast<lapack_complex_float *>(c), ldc);
-}
-inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans,
-                         int64_t m, int64_t n, std::complex<double> *a, int64_t lda,
-                         std::complex<double> *tau, std::complex<double> *c, int64_t ldc) {
-    return LAPACKE_zunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(tau),
-                          reinterpret_cast<lapack_complex_double *>(c), ldc);
-}
-
-inline int64_t sytrd(oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda, float *d, float *e,
-                     float *tau) {
-    return LAPACKE_ssytrd(LAPACK_COL_MAJOR, to_char(u), n, a, lda, d, e, tau);
-}
-inline int64_t sytrd(oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda, double *d, double *e,
-                     double *tau) {
-    return LAPACKE_dsytrd(LAPACK_COL_MAJOR, to_char(u), n, a, lda, d, e, tau);
-}
-
-inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda, int64_t *ipiv) {
-    return LAPACKE_ssytrf(LAPACK_COL_MAJOR, to_char(u), n, a, lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda, int64_t *ipiv) {
-    return LAPACKE_dsytrf(LAPACK_COL_MAJOR, to_char(u), n, a, lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, std::complex<float> *a, int64_t lda,
-                     int64_t *ipiv) {
-    return LAPACKE_csytrf(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, std::complex<double> *a, int64_t lda,
-                     int64_t *ipiv) {
-    return LAPACKE_zsytrf(LAPACK_COL_MAJOR, to_char(u), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_int *>(ipiv));
-}
-
-inline void orgbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, double *a,
-                  int64_t lda, const double *tau) {
-    LAPACKE_dorgbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k, a, lda, tau);
-}
-inline void orgbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, float *a,
-                  int64_t lda, const float *tau) {
-    LAPACKE_sorgbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k, a, lda, tau);
-}
-
-inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, float *a, int64_t lda, const float *tau) {
-    return LAPACKE_sorgqr(LAPACK_COL_MAJOR, m, n, k, a, lda, tau);
-}
-inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, double *a, int64_t lda,
-                         const double *tau) {
-    return LAPACKE_dorgqr(LAPACK_COL_MAJOR, m, n, k, a, lda, tau);
-}
-inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, std::complex<float> *a, int64_t lda,
-                         const std::complex<float> *tau) {
-    return LAPACKE_cungqr(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast<lapack_complex_float *>(a),
-                          lda, reinterpret_cast<const lapack_complex_float *>(tau));
-}
-inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, std::complex<double> *a, int64_t lda,
-                         const std::complex<double> *tau) {
-    return LAPACKE_zungqr(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast<lapack_complex_double *>(a),
-                          lda, reinterpret_cast<const lapack_complex_double *>(tau));
-}
-
-inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const float *a, int64_t lda, const float *tau, float *c,
-                         int64_t ldc) {
-    return LAPACKE_sormqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc);
-}
-inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const double *a, int64_t lda, const double *tau, double *c,
-                         int64_t ldc) {
-    return LAPACKE_dormqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc);
-}
-inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const std::complex<float> *a, int64_t lda,
-                         const std::complex<float> *tau, std::complex<float> *c, int64_t ldc) {
-    return LAPACKE_cunmqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda,
-                          reinterpret_cast<const lapack_complex_float *>(tau),
-                          reinterpret_cast<lapack_complex_float *>(c), ldc);
-}
-inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const std::complex<double> *a, int64_t lda,
-                         const std::complex<double> *tau, std::complex<double> *c, int64_t ldc) {
-    return LAPACKE_zunmqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda,
-                          reinterpret_cast<const lapack_complex_double *>(tau),
-                          reinterpret_cast<lapack_complex_double *>(c), ldc);
-}
-
-inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, float *a, int64_t lda, const float *tau) {
-    return LAPACKE_sorgrq(LAPACK_COL_MAJOR, m, n, k, a, lda, tau);
-}
-inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, double *a, int64_t lda,
-                         const double *tau) {
-    return LAPACKE_dorgrq(LAPACK_COL_MAJOR, m, n, k, a, lda, tau);
-}
-inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, std::complex<float> *a, int64_t lda,
-                         const std::complex<float> *tau) {
-    return LAPACKE_cungrq(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast<lapack_complex_float *>(a),
-                          lda, reinterpret_cast<const lapack_complex_float *>(tau));
-}
-inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, std::complex<double> *a, int64_t lda,
-                         const std::complex<double> *tau) {
-    return LAPACKE_zungrq(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast<lapack_complex_double *>(a),
-                          lda, reinterpret_cast<const lapack_complex_double *>(tau));
-}
-
-inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const float *a, int64_t lda, const float *tau, float *c,
-                         int64_t ldc) {
-    return LAPACKE_sormrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc);
-}
-inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const double *a, int64_t lda, const double *tau, double *c,
-                         int64_t ldc) {
-    return LAPACKE_dormrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc);
-}
-inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const std::complex<float> *a, int64_t lda,
-                         const std::complex<float> *tau, std::complex<float> *c, int64_t ldc) {
-    return LAPACKE_cunmrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda,
-                          reinterpret_cast<const lapack_complex_float *>(tau),
-                          reinterpret_cast<lapack_complex_float *>(c), ldc);
-}
-inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n,
-                         int64_t k, const std::complex<double> *a, int64_t lda,
-                         const std::complex<double> *tau, std::complex<double> *c, int64_t ldc) {
-    return LAPACKE_zunmrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda,
-                          reinterpret_cast<const lapack_complex_double *>(tau),
-                          reinterpret_cast<lapack_complex_double *>(c), ldc);
-}
-
-inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, std::complex<float> *a,
-                     int64_t lda) {
-    return LAPACKE_cpotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda);
-}
-inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, double *a, int64_t lda) {
-    return LAPACKE_dpotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda);
-}
-inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, float *a, int64_t lda) {
-    return LAPACKE_spotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda);
-}
-inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, std::complex<double> *a,
-                     int64_t lda) {
-    return LAPACKE_zpotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda);
-}
-
-inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs,
-                     const std::complex<float> *a, int64_t lda, std::complex<float> *b,
-                     int64_t ldb) {
-    return LAPACKE_cpotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(b), ldb);
-}
-inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, const double *a,
-                     int64_t lda, double *b, int64_t ldb) {
-    return LAPACKE_dpotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs, a, lda, b, ldb);
-}
-inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, const float *a,
-                     int64_t lda, float *b, int64_t ldb) {
-    return LAPACKE_spotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs, a, lda, b, ldb);
-}
-inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs,
-                     const std::complex<double> *a, int64_t lda, std::complex<double> *b,
-                     int64_t ldb) {
-    return LAPACKE_zpotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(b), ldb);
-}
-
-inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, std::complex<float> *a,
-                     int64_t lda) {
-    return LAPACKE_cpotri(LAPACK_COL_MAJOR, to_char(upper_lower), n,
-                          reinterpret_cast<lapack_complex_float *>(a), lda);
-}
-inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, double *a, int64_t lda) {
-    return LAPACKE_dpotri(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda);
-}
-inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, float *a, int64_t lda) {
-    return LAPACKE_spotri(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda);
-}
-inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, std::complex<double> *a,
-                     int64_t lda) {
-    return LAPACKE_zpotri(LAPACK_COL_MAJOR, to_char(upper_lower), n,
-                          reinterpret_cast<lapack_complex_double *>(a), lda);
-}
-
-inline int64_t laswp(int64_t n, std::complex<float> *a, int64_t lda, int64_t k1, int64_t k2,
-                     const int64_t *ipiv, int64_t incx) {
-    return LAPACKE_claswp(LAPACK_COL_MAJOR, n, reinterpret_cast<lapack_complex_float *>(a), lda, k1,
-                          k2, reinterpret_cast<const lapack_int *>(ipiv), incx);
-}
-inline int64_t laswp(int64_t n, double *a, int64_t lda, int64_t k1, int64_t k2, const int64_t *ipiv,
-                     int64_t incx) {
-    return LAPACKE_dlaswp(LAPACK_COL_MAJOR, n, a, lda, k1, k2,
-                          reinterpret_cast<const lapack_int *>(ipiv), incx);
-}
-inline int64_t laswp(int64_t n, float *a, int64_t lda, int64_t k1, int64_t k2, const int64_t *ipiv,
-                     int64_t incx) {
-    return LAPACKE_slaswp(LAPACK_COL_MAJOR, n, a, lda, k1, k2,
-                          reinterpret_cast<const lapack_int *>(ipiv), incx);
-}
-inline int64_t laswp(int64_t n, std::complex<double> *a, int64_t lda, int64_t k1, int64_t k2,
-                     const int64_t *ipiv, int64_t incx) {
-    return LAPACKE_zlaswp(LAPACK_COL_MAJOR, n, reinterpret_cast<lapack_complex_double *>(a), lda,
-                          k1, k2, reinterpret_cast<const lapack_int *>(ipiv), incx);
-}
-
-inline void ungbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k,
-                  std::complex<float> *a, int64_t lda, const std::complex<float> *tau) {
-    LAPACKE_cungbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k,
-                   reinterpret_cast<lapack_complex_float *>(a), lda,
-                   reinterpret_cast<const lapack_complex_float *>(tau));
-}
-inline void ungbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k,
-                  std::complex<double> *a, int64_t lda, const std::complex<double> *tau) {
-    LAPACKE_zungbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k,
-                   reinterpret_cast<lapack_complex_double *>(a), lda,
-                   reinterpret_cast<const lapack_complex_double *>(tau));
-}
-
-inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                     int64_t n, int64_t nrhs, const float *a, int64_t lda, float *b, int64_t ldb) {
-    return LAPACKE_strtrs(LAPACK_COL_MAJOR, to_char(uplo), to_char(trans), to_char(diag), n, nrhs,
-                          a, lda, b, ldb);
-}
-inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                     int64_t n, int64_t nrhs, const double *a, int64_t lda, double *b,
-                     int64_t ldb) {
-    return LAPACKE_dtrtrs(LAPACK_COL_MAJOR, to_char(uplo), to_char(trans), to_char(diag), n, nrhs,
-                          a, lda, b, ldb);
-}
-inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                     int64_t n, int64_t nrhs, const std::complex<float> *a, int64_t lda,
-                     std::complex<float> *b, int64_t ldb) {
-    return LAPACKE_ctrtrs(LAPACK_COL_MAJOR, to_char(uplo), to_char(trans), to_char(diag), n, nrhs,
-                          reinterpret_cast<const lapack_complex_float *>(a), lda,
-                          reinterpret_cast<lapack_complex_float *>(b), ldb);
-}
-inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag,
-                     int64_t n, int64_t nrhs, const std::complex<double> *a, int64_t lda,
-                     std::complex<double> *b, int64_t ldb) {
-    return LAPACKE_ztrtrs(LAPACK_COL_MAJOR, to_char(uplo), to_char(trans), to_char(diag), n, nrhs,
-                          reinterpret_cast<const lapack_complex_double *>(a), lda,
-                          reinterpret_cast<lapack_complex_double *>(b), ldb);
-}
-
-} //namespace reference
diff --git a/tests/unit_tests/lapack/include/lapack_test_controller.hpp b/tests/unit_tests/lapack/include/lapack_test_controller.hpp
deleted file mode 100644
index 918060959..000000000
--- a/tests/unit_tests/lapack/include/lapack_test_controller.hpp
+++ /dev/null
@@ -1,251 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#pragma once
-
-#include <complex>
-#include <sstream>
-#include <string>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "lapack_common.hpp"
-#include "oneapi/mkl/exceptions.hpp"
-
-template <class T>
-std::istream& operator>>(std::istream& is, T& t) {
-    int64_t i;
-    is >> i;
-    t = static_cast<T>(i);
-    return is;
-}
-inline std::ostream& operator<<(std::ostream& os, const oneapi::mkl::job& t) {
-    os << static_cast<int64_t>(t);
-    return os;
-}
-inline std::ostream& operator<<(std::ostream& os, const oneapi::mkl::jobsvd& t) {
-    os << static_cast<int64_t>(t);
-    return os;
-}
-inline std::ostream& operator<<(std::ostream& os, const oneapi::mkl::transpose& t) {
-    os << static_cast<int64_t>(t);
-    return os;
-}
-inline std::ostream& operator<<(std::ostream& os, const oneapi::mkl::uplo& t) {
-    os << static_cast<int64_t>(t);
-    return os;
-}
-inline std::ostream& operator<<(std::ostream& os, const oneapi::mkl::side& t) {
-    os << static_cast<int64_t>(t);
-    return os;
-}
-inline std::ostream& operator<<(std::ostream& os, const oneapi::mkl::diag& t) {
-    os << static_cast<int64_t>(t);
-    return os;
-}
-inline std::ostream& operator<<(std::ostream& os, const oneapi::mkl::generate& t) {
-    os << static_cast<int64_t>(t);
-    return os;
-}
-
-class result_T {
-public:
-    enum class result { fail, pass, exception };
-
-    result_T() : result_{ result::pass } {}
-    result_T(bool b) : result_{ b ? result::pass : result::fail } {}
-    result_T(const std::exception& e, result t = result::exception)
-            : result_{ t },
-              what_{ e.what() } {}
-
-    operator bool() const& {
-        return result_ == result::pass;
-    }
-
-    friend bool operator==(const result_T& lhs, const result_T& rhs);
-    friend std::ostream& operator<<(std::ostream& os, result_T result);
-
-private:
-    result result_;
-    std::string what_;
-};
-
-inline bool operator==(const result_T& lhs, const result_T& rhs) {
-    return (lhs.result_ == rhs.result_ && lhs.what_ == rhs.what_);
-}
-inline bool operator!=(const result_T& lhs, const result_T& rhs) {
-    return !(lhs == rhs);
-}
-
-inline std::ostream& operator<<(std::ostream& os, result_T result) {
-    switch (result.result_) {
-        case result_T::result::pass: os << "PASS"; break;
-        case result_T::result::fail: os << "FAIL"; break;
-        case result_T::result::exception: os << "EXCEPTION " << result.what_; break;
-    }
-    return os;
-}
-
-template <typename T>
-struct function_info;
-
-template <typename... Args>
-struct function_info<bool(const sycl::device&, Args...)> {
-    using arg_type = std::tuple<Args...>;
-    static constexpr size_t arg_count = sizeof...(Args);
-
-    template <size_t n>
-    struct arg {
-        using type = typename std::tuple_element<n, std::tuple<Args...>>::type;
-    };
-};
-
-template <typename T>
-struct InputTestController {
-    using TestPointer = T;
-    using ArgTuple_T = typename function_info<T>::arg_type;
-    static constexpr size_t arg_count = function_info<T>::arg_count;
-    std::vector<ArgTuple_T> vargs;
-
-    InputTestController(const char* input) {
-        if constexpr (arg_count == 0) /* test does not take input */
-            return;
-
-        if (input) {
-            std::stringstream input_stream(input);
-            if (input_stream.fail())
-                std::cout << "Failed to process input: \'" << input << "\'" << std::endl;
-            else
-                store_input(input_stream, std::make_index_sequence<arg_count>());
-        }
-        else { /* search for input file */
-            std::cout << "Test parameters not found" << std::endl;
-        }
-    }
-
-    template <size_t... I>
-    void store_input(std::istream& input_stream, std::index_sequence<I...>) {
-        if constexpr (arg_count == 0) /* test does not take input */
-            return;
-        else {
-            ArgTuple_T args;
-            while ((..., (input_stream >> std::get<I>(args)))) {
-                vargs.push_back(args);
-                input_stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
-            }
-        }
-    }
-
-    template <size_t... I>
-    void print_result(size_t input_file_line, result_T result, const ArgTuple_T& args = {},
-                      std::index_sequence<I...> = std::make_index_sequence<0>{}) {
-        std::cout.clear();
-        std::cout << test_log::padding << "[" << input_file_line << "]: ";
-        (..., (std::cout << std::get<I>(args) << " "));
-        std::cout << "# " << result << std::endl;
-        test_log::print();
-    }
-
-    result_T call_test(TestPointer tp, const sycl::device& dev, ArgTuple_T args) {
-        auto tp_args = tuple_cat(std::make_tuple(dev), args);
-        result_T result;
-        try {
-            result = std::apply(tp, tp_args);
-        }
-        catch (const oneapi::mkl::unsupported_device& e) {
-            result = result_T{ e, result_T::result::pass };
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            result = result_T{ e, result_T::result::pass };
-        }
-        catch (const std::exception& e) {
-            result = result_T{ e };
-        }
-        return result;
-    }
-
-    result_T run(TestPointer tp, const sycl::device& dev) {
-        print_device_info(dev);
-        if constexpr (arg_count == 0) { /* test does not take input */
-            result_T result = call_test(tp, dev, {});
-            print_result(0, result);
-            return result;
-        }
-        else {
-            if (!vargs.size()) {
-                test_log::lout << arg_count << " inputs expected, found none" << std::endl;
-                print_result(1, false);
-            }
-            result_T aggregate_result;
-            size_t input_file_line = 1;
-            for (auto& args : vargs) {
-                result_T result = call_test(tp, dev, args);
-                if (!result) {
-                    aggregate_result = result;
-                }
-                print_result(input_file_line++, result, args,
-                             std::make_index_sequence<arg_count>());
-            }
-            return aggregate_result;
-        }
-    }
-
-    result_T run_print_on_fail(TestPointer tp, const sycl::device& dev) {
-        print_device_info(dev);
-        if constexpr (arg_count == 0) { /* test does not take input */
-            result_T result = call_test(tp, dev, {});
-            if (!result) {
-                print_result(0, result);
-            }
-            else {
-                test_log::lout.str("");
-                test_log::lout.clear();
-            }
-            return result;
-        }
-        else {
-            if (!vargs.size()) {
-                test_log::lout << arg_count << " inputs expected, found none" << std::endl;
-                print_result(1, false);
-            }
-            result_T aggregate_result;
-            size_t input_file_line = 0;
-            for (auto& args : vargs) {
-                input_file_line++;
-                result_T result = call_test(tp, dev, args);
-                if (!result) {
-                    print_result(input_file_line, result, args,
-                                 std::make_index_sequence<arg_count>());
-                }
-                else {
-                    test_log::lout.str("");
-                    test_log::lout.clear();
-                }
-                if (!result)
-                    aggregate_result = result;
-            }
-            return aggregate_result;
-        }
-    }
-};
diff --git a/tests/unit_tests/lapack/source/CMakeLists.txt b/tests/unit_tests/lapack/source/CMakeLists.txt
deleted file mode 100644
index c61403c40..000000000
--- a/tests/unit_tests/lapack/source/CMakeLists.txt
+++ /dev/null
@@ -1,95 +0,0 @@
-#===============================================================================
-# Copyright 2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-#Build object from all test sources
-set(LAPACK_SOURCES
-    "gebrd.cpp"
-    "geqrf.cpp"
-    "geqrf_batch_group.cpp"
-    "geqrf_batch_stride.cpp"
-    "gerqf.cpp"
-    "gesvd.cpp"
-    "getrf.cpp"
-    "getrf_batch_group.cpp"
-    "getrf_batch_stride.cpp"
-    "getri.cpp"
-    "getri_batch_group.cpp"
-    "getri_batch_stride.cpp"
-    "getrs.cpp"
-    "getrs_batch_group.cpp"
-    "getrs_batch_stride.cpp"
-    "heevd.cpp"
-    "hegvd.cpp"
-    "hetrd.cpp"
-    "hetrf.cpp"
-    "orgbr.cpp"
-    "orgqr.cpp"
-    "orgqr_batch_group.cpp"
-    "orgqr_batch_stride.cpp"
-    "orgtr.cpp"
-    "ormqr.cpp"
-    "ormrq.cpp"
-    "ormtr.cpp"
-    "potrf.cpp"
-    "potrf_batch_group.cpp"
-    "potrf_batch_stride.cpp"
-    "potri.cpp"
-    "potrs.cpp"
-    "potrs_batch_group.cpp"
-    "potrs_batch_stride.cpp"
-    "syevd.cpp"
-    "sygvd.cpp"
-    "sytrd.cpp"
-    "sytrf.cpp"
-    "trtrs.cpp"
-    "ungbr.cpp"
-    "ungqr.cpp"
-    "ungqr_batch_group.cpp"
-    "ungqr_batch_stride.cpp"
-    "ungtr.cpp"
-    "unmqr.cpp"
-    "unmrq.cpp"
-    "unmtr.cpp"
-)
-
-if(BUILD_SHARED_LIBS)
-  add_library(lapack_source_rt OBJECT ${LAPACK_SOURCES})
-  target_compile_options(lapack_source_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(lapack_source_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-      PUBLIC ${LAPACKE_INCLUDE}
-  )
-  target_link_libraries(lapack_source_rt PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
-add_library(lapack_source_ct OBJECT ${LAPACK_SOURCES})
-target_compile_options(lapack_source_ct PRIVATE -DNOMINMAX)
-target_include_directories(lapack_source_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-    PUBLIC ${LAPACKE_INCLUDE}
-)
-target_link_libraries(lapack_source_ct PUBLIC ONEMKL::SYCL::SYCL)
diff --git a/tests/unit_tests/lapack/source/gebrd.cpp b/tests/unit_tests/lapack/source/gebrd.cpp
deleted file mode 100644
index 66eb0b231..000000000
--- a/tests/unit_tests/lapack/source/gebrd.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-29 37 34 27182
-27 25 33 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t min_mn = std::min<int64_t>(m, n);
-
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp_real> d(min_mn);
-    std::vector<fp_real> e(std::max<int64_t>(min_mn - 1, 1));
-
-    std::vector<fp> tauq(min_mn);
-    std::vector<fp> taup(min_mn);
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto d_dev = device_alloc<data_T, fp_real>(queue, d.size());
-        auto e_dev = device_alloc<data_T, fp_real>(queue, e.size());
-        auto tauq_dev = device_alloc<data_T>(queue, tauq.size());
-        auto taup_dev = device_alloc<data_T>(queue, taup.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::gebrd_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::gebrd_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::gebrd(queue, m, n, A_dev, lda, d_dev, e_dev, tauq_dev, taup_dev,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::gebrd, m, n, A_dev, lda, d_dev, e_dev,
-                                  tauq_dev, taup_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, d_dev, d.data(), d.size());
-        device_to_host_copy(queue, e_dev, e.data(), e.size());
-        device_to_host_copy(queue, tauq_dev, tauq.data(), tauq.size());
-        device_to_host_copy(queue, taup_dev, taup.data(), taup.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, d_dev);
-        device_free(queue, e_dev);
-        device_free(queue, tauq_dev);
-        device_free(queue, taup_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    reference::gebrd(m, n, A_initial.data(), lda, d.data(), e.data(), tauq.data(), taup.data());
-    return rel_mat_err_check<fp>(m, n, A, lda, A_initial, lda, 30.0);
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t min_mn = std::min<int64_t>(m, n);
-
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    auto A = A_initial;
-    std::vector<fp_real> d(min_mn);
-    std::vector<fp_real> e(std::max<int64_t>(min_mn - 1, 1));
-    std::vector<fp> tauq(min_mn);
-    std::vector<fp> taup(min_mn);
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto d_dev = device_alloc<data_T, fp_real>(queue, d.size());
-        auto e_dev = device_alloc<data_T, fp_real>(queue, e.size());
-        auto tauq_dev = device_alloc<data_T>(queue, tauq.size());
-        auto taup_dev = device_alloc<data_T>(queue, taup.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::gebrd_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::gebrd_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::gebrd(
-            queue, m, n, A_dev, lda, d_dev, e_dev, tauq_dev, taup_dev, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::gebrd, m, n, A_dev, lda,
-                                  d_dev, e_dev, tauq_dev, taup_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, d_dev);
-        device_free(queue, e_dev);
-        device_free(queue, tauq_dev);
-        device_free(queue, taup_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Gebrd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Gebrd);
diff --git a/tests/unit_tests/lapack/source/geqrf.cpp b/tests/unit_tests/lapack/source/geqrf.cpp
deleted file mode 100644
index 27577e972..000000000
--- a/tests/unit_tests/lapack/source/geqrf.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-31 50 37 27182
-30 30 34 27182
-50 37 54 27182
-31 22 37 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> tau(std::min(m, n));
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::geqrf_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::geqrf_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::geqrf(queue, m, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::geqrf, m, n, A_dev, lda, tau_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_geqrf_accuracy(m, n, A, lda, tau, A_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> tau(std::min(m, n));
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::geqrf_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::geqrf_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::geqrf(queue, m, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::geqrf, m, n, A_dev, lda,
-                                  tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Geqrf);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Geqrf);
diff --git a/tests/unit_tests/lapack/source/geqrf_batch_group.cpp b/tests/unit_tests/lapack/source/geqrf_batch_group.cpp
deleted file mode 100644
index 416466028..000000000
--- a/tests/unit_tests/lapack/source/geqrf_batch_group.cpp
+++ /dev/null
@@ -1,298 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 2, 4 };
-    std::vector<int64_t> n_vec = { 4, 4 };
-    std::vector<int64_t> lda_vec = { 5, 5 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> tau_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            tau_list.emplace_back(std::min(m, n));
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> tau_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** tau_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            tau_dev_list.emplace_back(tau_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto tau_dev_iter = tau_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, tau_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            tau_dev_ptrs[global_id] = tau_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::geqrf_batch(queue, m_vec.data(), n_vec.data(), A_dev_ptrs,
-                                         lda_vec.data(), tau_dev_ptrs, group_count,
-                                         group_sizes_vec.data(), scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::geqrf_batch, m_vec.data(),
-                                  n_vec.data(), A_dev_ptrs, lda_vec.data(), tau_dev_ptrs,
-                                  group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        A_iter = A_list.begin();
-        tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            device_to_host_copy(queue, A_dev_ptrs[global_id], A_iter->data(), A_iter->size());
-            device_to_host_copy(queue, tau_dev_ptrs[global_id], tau_iter->data(), tau_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (tau_dev_ptrs) {
-            sycl::free(tau_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto tau_iter = tau_list.begin();
-    auto A_initial_iter = A_initial_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size;
-             local_id++, global_id++, A_iter++, tau_iter++, A_initial_iter++) {
-            if (!check_geqrf_accuracy(m, n, *A_iter, lda, *tau_iter, *A_initial_iter)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 1 };
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> tau_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            tau_list.emplace_back(std::min(m, n));
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> tau_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** tau_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            tau_dev_list.emplace_back(tau_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto tau_dev_iter = tau_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, tau_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            tau_dev_ptrs[global_id] = tau_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::geqrf_batch(
-            queue, m_vec.data(), n_vec.data(), A_dev_ptrs, lda_vec.data(), tau_dev_ptrs,
-            group_count, group_sizes_vec.data(), scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::geqrf_batch,
-                                  m_vec.data(), n_vec.data(), A_dev_ptrs, lda_vec.data(),
-                                  tau_dev_ptrs, group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (tau_dev_ptrs) {
-            sycl::free(tau_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM(GeqrfBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GeqrfBatchGroup);
diff --git a/tests/unit_tests/lapack/source/geqrf_batch_stride.cpp b/tests/unit_tests/lapack/source/geqrf_batch_stride.cpp
deleted file mode 100644
index 16ceef63a..000000000
--- a/tests/unit_tests/lapack/source/geqrf_batch_stride.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-31 27 33 1024 40 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, int64_t stride_a,
-              int64_t stride_tau, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<fp> tau(stride_tau * batch_size);
-
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>(
-            queue, m, n, lda, stride_a, stride_tau, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>, m, n,
-            lda, stride_a, stride_tau, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::geqrf_batch(queue, m, n, A_dev, lda, stride_a, tau_dev, stride_tau,
-                                         batch_size, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::geqrf_batch, m, n, A_dev, lda,
-                                  stride_a, tau_dev, stride_tau, batch_size, scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto A_ = copy_vector(A, lda * n, i * stride_a);
-        auto tau_ = copy_vector(tau, std::min(m, n), i * stride_tau);
-        auto A_initial_ = copy_vector(A_initial, lda * n, i * stride_a);
-        if (!check_geqrf_accuracy(m, n, A_, lda, tau_, A_initial_)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, int64_t stride_a,
-                    int64_t stride_tau, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<fp> tau(stride_tau * batch_size);
-
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>(
-            queue, m, n, lda, stride_a, stride_tau, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<fp>, m, n,
-            lda, stride_a, stride_tau, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::geqrf_batch(
-            queue, m, n, A_dev, lda, stride_a, tau_dev, stride_tau, batch_size, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::geqrf_batch, m, n, A_dev,
-                                  lda, stride_a, tau_dev, stride_tau, batch_size, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(GeqrfBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GeqrfBatchStride);
diff --git a/tests/unit_tests/lapack/source/gerqf.cpp b/tests/unit_tests/lapack/source/gerqf.cpp
deleted file mode 100644
index dac6d79aa..000000000
--- a/tests/unit_tests/lapack/source/gerqf.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-31 50 37 27182
-30 30 34 27182
-50 37 54 27182
-31 22 37 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> tau(std::min(m, n));
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::gerqf_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::gerqf_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::gerqf(queue, m, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::gerqf, m, n, A_dev, lda, tau_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_gerqf_accuracy(A, A_initial, tau, m, n, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = data_T;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> tau(std::min(m, n));
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::gerqf_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::gerqf_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::gerqf(queue, m, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::gerqf, m, n, A_dev, lda,
-                                  tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Gerqf);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Gerqf);
diff --git a/tests/unit_tests/lapack/source/gesvd.cpp b/tests/unit_tests/lapack/source/gesvd.cpp
deleted file mode 100644
index 1e143315b..000000000
--- a/tests/unit_tests/lapack/source/gesvd.cpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 1 8 8 10 10 10 27182
-1 1 30 24 42 33 33 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-              int64_t m, int64_t n, int64_t lda, int64_t ldu, int64_t ldvt, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t min_mn = std::min(m, n);
-    int64_t ucols = min_mn;
-    if (jobu == oneapi::mkl::jobsvd::vectors)
-        ucols = m;
-    int64_t vtrows = min_mn;
-    if (jobvt == oneapi::mkl::jobsvd::vectors)
-        vtrows = n;
-
-    std::vector<fp> A(lda * n);
-    std::vector<fp> U(ldu * ucols);
-    std::vector<fp> Vt(ldvt * n);
-    std::vector<fp_real> s(min_mn);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto U_dev = device_alloc<data_T>(queue, U.size());
-        auto Vt_dev = device_alloc<data_T>(queue, Vt.size());
-        auto s_dev = device_alloc<data_T, fp_real>(queue, s.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::gesvd_scratchpad_size<fp>(
-            queue, jobu, jobvt, m, n, lda, ldu, ldvt);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::gesvd_scratchpad_size<fp>,
-                                  jobu, jobvt, m, n, lda, ldu, ldvt);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, A_dev, lda, s_dev, U_dev, ldu, Vt_dev,
-                                   ldvt, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::gesvd, jobu, jobvt, m, n, A_dev, lda,
-                                  s_dev, U_dev, ldu, Vt_dev, ldvt, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, U_dev, U.data(), U.size());
-        device_to_host_copy(queue, Vt_dev, Vt.data(), Vt.size());
-        device_to_host_copy(queue, s_dev, s.data(), s.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, U_dev);
-        device_free(queue, Vt_dev);
-        device_free(queue, s_dev);
-        device_free(queue, scratchpad_dev);
-    }
-    bool result = true;
-
-    if (jobu == oneapi::mkl::jobsvd::vectors && jobvt == oneapi::mkl::jobsvd::vectors) {
-        /* |A - U S V'| < |A| O(eps) */
-        std::vector<fp> US(m * n);
-        int64_t ldus = m;
-        for (int64_t col = 0; col < min_mn; col++)
-            for (int64_t row = 0; row < m; row++)
-                US[row + col * ldus] = U[row + col * ldu] * s[col];
-        std::vector<fp> USV(m * n);
-        int64_t ldusv = m;
-        reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, m, n, n,
-                        1.0, US.data(), ldus, Vt.data(), ldvt, 0.0, USV.data(), ldusv);
-        if (!rel_mat_err_check(m, n, A_initial, lda, USV, ldusv)) {
-            test_log::lout << "Factorization check failed" << std::endl;
-            result = false;
-        }
-    }
-
-    if (jobu == oneapi::mkl::jobsvd::vectorsina)
-        reference::lacpy('A', m, ucols, A.data(), lda, U.data(), ldu);
-    if (jobvt == oneapi::mkl::jobsvd::vectorsina)
-        reference::lacpy('A', vtrows, n, A.data(), lda, Vt.data(), ldvt);
-
-    if (jobu == oneapi::mkl::jobsvd::vectors || jobu == oneapi::mkl::jobsvd::somevec ||
-        jobu == oneapi::mkl::jobsvd::vectorsina) {
-        /* |I - U' U| < n O(eps) */
-        std::vector<fp> UU(ucols * ucols);
-        int64_t lduu = ucols;
-        reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, ucols,
-                        ucols, m, 1.0, U.data(), ldu, U.data(), ldu, 0.0, UU.data(), lduu);
-        if (!rel_id_err_check(ucols, UU, lduu)) {
-            test_log::lout << "U Orthogonality check failed" << std::endl;
-            result = false;
-        }
-    }
-
-    if (jobvt == oneapi::mkl::jobsvd::vectors || jobvt == oneapi::mkl::jobsvd::somevec ||
-        jobvt == oneapi::mkl::jobsvd::vectorsina) {
-        /* |I - V' V| < n O(eps) */
-        std::vector<fp> VV(vtrows * vtrows);
-        int64_t ldvv = vtrows;
-        reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, vtrows,
-                        vtrows, n, 1.0, Vt.data(), ldvt, Vt.data(), ldvt, 0.0, VV.data(), ldvv);
-        if (!rel_id_err_check(vtrows, VV, ldvv)) {
-            test_log::lout << "V Orthogonality check failed" << std::endl;
-            result = false;
-        }
-    }
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt,
-                    int64_t m, int64_t n, int64_t lda, int64_t ldu, int64_t ldvt, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t min_mn = std::min(m, n);
-    int64_t ucols = min_mn;
-    if (jobu == oneapi::mkl::jobsvd::vectors)
-        ucols = m;
-    int64_t vtrows = min_mn;
-    if (jobvt == oneapi::mkl::jobsvd::vectors)
-        vtrows = n;
-
-    std::vector<fp> A(lda * n);
-    std::vector<fp> U(ldu * ucols);
-    std::vector<fp> Vt(ldvt * n);
-    std::vector<fp_real> s(min_mn);
-
-    rand_matrix_diag_dom(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto U_dev = device_alloc<data_T>(queue, U.size());
-        auto Vt_dev = device_alloc<data_T>(queue, Vt.size());
-        auto s_dev = device_alloc<data_T, fp_real>(queue, s.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::gesvd_scratchpad_size<fp>(
-            queue, jobu, jobvt, m, n, lda, ldu, ldvt);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::gesvd_scratchpad_size<fp>,
-                                  jobu, jobvt, m, n, lda, ldu, ldvt);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::gesvd(
-            queue, jobu, jobvt, m, n, A_dev, lda, s_dev, U_dev, ldu, Vt_dev, ldvt, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::gesvd, jobu, jobvt, m, n,
-                                  A_dev, lda, s_dev, U_dev, ldu, Vt_dev, ldvt, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, U_dev);
-        device_free(queue, Vt_dev);
-        device_free(queue, s_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Gesvd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Gesvd);
diff --git a/tests/unit_tests/lapack/source/getrf.cpp b/tests/unit_tests/lapack/source/getrf.cpp
deleted file mode 100644
index 4537ef665..000000000
--- a/tests/unit_tests/lapack/source/getrf.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-13 28 13 27182
-24 15 29 27182
-67 74 74 27182
-82 39 85 27182
-98 98 98 27182
-329 329 329 27182
-428 428 428 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<int64_t> ipiv(std::min(m, n));
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::getrf_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrf_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getrf(queue, m, n, A_dev, lda, ipiv_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getrf, m, n, A_dev, lda, ipiv_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, ipiv_dev, ipiv.data(), ipiv.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_getrf_accuracy(m, n, A, lda, ipiv, A_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<int64_t> ipiv(std::min(m, n));
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::getrf_scratchpad_size<fp>(queue, m, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrf_scratchpad_size<fp>, m, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::getrf(queue, m, n, A_dev, lda, ipiv_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getrf, m, n, A_dev, lda,
-                                  ipiv_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Getrf);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Getrf);
diff --git a/tests/unit_tests/lapack/source/getrf_batch_group.cpp b/tests/unit_tests/lapack/source/getrf_batch_group.cpp
deleted file mode 100644
index 12e651746..000000000
--- a/tests/unit_tests/lapack/source/getrf_batch_group.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 2, 4 };
-    std::vector<int64_t> n_vec = { 4, 4 };
-    std::vector<int64_t> lda_vec = { 5, 5 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<int64_t>> ipiv_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            ipiv_list.emplace_back(std::min(m, n));
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<int64_t, sycl::usm_allocator<int64_t, sycl::usm::alloc::shared>>>
-            ipiv_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        int64_t** ipiv_dev_ptrs = sycl::malloc_shared<int64_t*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        sycl::usm_allocator<int64_t, sycl::usm::alloc::shared> usm_int_allocator{
-            queue.get_context(), dev
-        };
-        auto A_iter = A_list.begin();
-        auto ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, ipiv_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            ipiv_dev_list.emplace_back(ipiv_iter->size(), usm_int_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto ipiv_dev_iter = ipiv_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, ipiv_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            ipiv_dev_ptrs[global_id] = ipiv_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getrf_batch(queue, m_vec.data(), n_vec.data(), A_dev_ptrs,
-                                         lda_vec.data(), ipiv_dev_ptrs, group_count,
-                                         group_sizes_vec.data(), scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getrf_batch, m_vec.data(),
-                                  n_vec.data(), A_dev_ptrs, lda_vec.data(), ipiv_dev_ptrs,
-                                  group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        A_iter = A_list.begin();
-        ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, ipiv_iter++) {
-            device_to_host_copy(queue, A_dev_ptrs[global_id], A_iter->data(), A_iter->size());
-            device_to_host_copy(queue, ipiv_dev_ptrs[global_id], ipiv_iter->data(),
-                                ipiv_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (ipiv_dev_ptrs) {
-            sycl::free(ipiv_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto ipiv_iter = ipiv_list.begin();
-    auto A_initial_iter = A_initial_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size;
-             local_id++, global_id++, A_iter++, ipiv_iter++, A_initial_iter++) {
-            if (!check_getrf_accuracy(m, n, *A_iter, lda, *ipiv_iter, *A_initial_iter)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 1 };
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<int64_t>> ipiv_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            ipiv_list.emplace_back(std::min(m, n));
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<int64_t, sycl::usm_allocator<int64_t, sycl::usm::alloc::shared>>>
-            ipiv_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        int64_t** ipiv_dev_ptrs = sycl::malloc_shared<int64_t*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        sycl::usm_allocator<int64_t, sycl::usm::alloc::shared> usm_int_allocator{
-            queue.get_context(), dev
-        };
-        auto A_iter = A_list.begin();
-        auto ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, ipiv_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            ipiv_dev_list.emplace_back(ipiv_iter->size(), usm_int_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto ipiv_dev_iter = ipiv_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, ipiv_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            ipiv_dev_ptrs[global_id] = ipiv_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::getrf_batch(
-            queue, m_vec.data(), n_vec.data(), A_dev_ptrs, lda_vec.data(), ipiv_dev_ptrs,
-            group_count, group_sizes_vec.data(), scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, func_event = oneapi::mkl::lapack::getrf_batch, m_vec.data(), n_vec.data(),
-            A_dev_ptrs, lda_vec.data(), ipiv_dev_ptrs, group_count, group_sizes_vec.data(),
-            scratchpad_dev, scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (ipiv_dev_ptrs) {
-            sycl::free(ipiv_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM(GetrfBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GetrfBatchGroup);
diff --git a/tests/unit_tests/lapack/source/getrf_batch_stride.cpp b/tests/unit_tests/lapack/source/getrf_batch_stride.cpp
deleted file mode 100644
index 3e4ef6589..000000000
--- a/tests/unit_tests/lapack/source/getrf_batch_stride.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-31 27 33 1024 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, int64_t stride_a,
-              int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-    int64_t stride_ipiv = std::min(m, n);
-    std::vector<int64_t> ipiv(stride_ipiv * batch_size);
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>(
-            queue, m, n, lda, stride_a, stride_ipiv, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>, m, n,
-            lda, stride_a, stride_ipiv, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getrf_batch(queue, m, n, A_dev, lda, stride_a, ipiv_dev, stride_ipiv,
-                                         batch_size, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getrf_batch, m, n, A_dev, lda,
-                                  stride_a, ipiv_dev, stride_ipiv, batch_size, scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, ipiv_dev, ipiv.data(), ipiv.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto A_ = copy_vector(A, lda * n, i * stride_a);
-        auto ipiv_ = copy_vector(ipiv, std::min(m, n), i * stride_ipiv);
-        auto A_initial_ = copy_vector(A_initial, lda * n, i * stride_a);
-        if (!check_getrf_accuracy(m, n, A_, lda, ipiv_, A_initial_)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t lda, int64_t stride_a,
-                    int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-    int64_t stride_ipiv = std::min(m, n);
-    std::vector<int64_t> ipiv(stride_ipiv * batch_size);
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>(
-            queue, m, n, lda, stride_a, stride_ipiv, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<fp>, m, n,
-            lda, stride_a, stride_ipiv, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::getrf_batch(
-            queue, m, n, A_dev, lda, stride_a, ipiv_dev, stride_ipiv, batch_size, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getrf_batch, m, n, A_dev,
-                                  lda, stride_a, ipiv_dev, stride_ipiv, batch_size, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(GetrfBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GetrfBatchStride);
diff --git a/tests/unit_tests/lapack/source/getri.cpp b/tests/unit_tests/lapack/source/getri.cpp
deleted file mode 100644
index a1aa2deda..000000000
--- a/tests/unit_tests/lapack/source/getri.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-25 66 27182
-32 92 27182
-76 87 27182
-89 89 27182
-25 66 27182
-32 92 27182
-89 89 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<int64_t> ipiv(n);
-
-    auto info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-    if (info != 0) {
-        test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getri_scratchpad_size<fp>(queue, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getri_scratchpad_size<fp>, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getri(queue, n, A_dev, lda, ipiv_dev, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getri, n, A_dev, lda, ipiv_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_getri_accuracy(n, A, lda, ipiv, A_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-    std::vector<int64_t> ipiv(n);
-
-    int64_t info = 0;
-    info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-    if (info != 0) {
-        test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getri_scratchpad_size<fp>(queue, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getri_scratchpad_size<fp>, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::getri(queue, n, A_dev, lda, ipiv_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getri, n, A_dev, lda,
-                                  ipiv_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Getri);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Getri);
diff --git a/tests/unit_tests/lapack/source/getri_batch_group.cpp b/tests/unit_tests/lapack/source/getri_batch_group.cpp
deleted file mode 100644
index 244acfcc8..000000000
--- a/tests/unit_tests/lapack/source/getri_batch_group.cpp
+++ /dev/null
@@ -1,322 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> n_vec = { 6, 4 };
-    std::vector<int64_t> lda_vec = { 7, 6 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<int64_t>> ipiv_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            auto& A = A_list.back();
-
-            ipiv_list.emplace_back(n);
-            auto& ipiv = ipiv_list.back();
-
-            auto info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-            if (info != 0) {
-                test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<int64_t, sycl::usm_allocator<int64_t, sycl::usm::alloc::shared>>>
-            ipiv_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        int64_t** ipiv_dev_ptrs = sycl::malloc_shared<int64_t*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        sycl::usm_allocator<int64_t, sycl::usm::alloc::shared> usm_int_allocator{
-            queue.get_context(), dev
-        };
-        auto A_iter = A_list.begin();
-        auto ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, ipiv_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            ipiv_dev_list.emplace_back(ipiv_iter->size(), usm_int_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>(
-            queue, n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>,
-            n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto ipiv_dev_iter = ipiv_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, ipiv_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            ipiv_dev_ptrs[global_id] = ipiv_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, ipiv_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, ipiv_iter->data(), ipiv_dev_ptrs[global_id],
-                                ipiv_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getri_batch(queue, n_vec.data(), A_dev_ptrs, lda_vec.data(),
-                                         ipiv_dev_ptrs, group_count, group_sizes_vec.data(),
-                                         scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getri_batch, n_vec.data(), A_dev_ptrs,
-                                  lda_vec.data(), ipiv_dev_ptrs, group_count,
-                                  group_sizes_vec.data(), scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            device_to_host_copy(queue, A_dev_ptrs[global_id], A_iter->data(), A_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (ipiv_dev_ptrs) {
-            sycl::free(ipiv_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto ipiv_iter = ipiv_list.begin();
-    auto A_initial_iter = A_initial_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size;
-             local_id++, global_id++, A_iter++, ipiv_iter++, A_initial_iter++) {
-            if (!check_getri_accuracy(n, *A_iter, lda, *ipiv_iter, *A_initial_iter)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<int64_t>> ipiv_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            auto& A = A_list.back();
-
-            ipiv_list.emplace_back(n);
-            auto& ipiv = ipiv_list.back();
-
-            auto info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-            if (info != 0) {
-                test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<int64_t, sycl::usm_allocator<int64_t, sycl::usm::alloc::shared>>>
-            ipiv_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        int64_t** ipiv_dev_ptrs = sycl::malloc_shared<int64_t*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        sycl::usm_allocator<int64_t, sycl::usm::alloc::shared> usm_int_allocator{
-            queue.get_context(), dev
-        };
-        auto A_iter = A_list.begin();
-        auto ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, ipiv_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            ipiv_dev_list.emplace_back(ipiv_iter->size(), usm_int_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>(
-            queue, n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>,
-            n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto ipiv_dev_iter = ipiv_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, ipiv_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            ipiv_dev_ptrs[global_id] = ipiv_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, ipiv_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, ipiv_iter->data(), ipiv_dev_ptrs[global_id],
-                                ipiv_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::getri_batch(
-            queue, n_vec.data(), A_dev_ptrs, lda_vec.data(), ipiv_dev_ptrs, group_count,
-            group_sizes_vec.data(), scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getri_batch,
-                                  n_vec.data(), A_dev_ptrs, lda_vec.data(), ipiv_dev_ptrs,
-                                  group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (ipiv_dev_ptrs) {
-            sycl::free(ipiv_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM(GetriBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GetriBatchGroup);
diff --git a/tests/unit_tests/lapack/source/getri_batch_stride.cpp b/tests/unit_tests/lapack/source/getri_batch_stride.cpp
deleted file mode 100644
index 5a71d2d7e..000000000
--- a/tests/unit_tests/lapack/source/getri_batch_stride.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-31 33 1200 300 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t n, int64_t lda, int64_t stride_a,
-              int64_t stride_ipiv, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<int64_t> ipiv(stride_ipiv * batch_size);
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto info =
-            reference::getrf(n, n, A.data() + i * stride_a, lda, ipiv.data() + i * stride_ipiv);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference getrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>(
-            queue, n, lda, stride_a, stride_ipiv, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>, n, lda,
-            stride_a, stride_ipiv, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getri_batch(queue, n, A_dev, lda, stride_a, ipiv_dev, stride_ipiv,
-                                         batch_size, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getri_batch, n, A_dev, lda, stride_a,
-                                  ipiv_dev, stride_ipiv, batch_size, scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto A_ = copy_vector(A, lda * n, i * stride_a);
-        auto ipiv_ = copy_vector(ipiv, n, i * stride_ipiv);
-        auto A_initial_ = copy_vector(A_initial, lda * n, i * stride_a);
-        if (!check_getri_accuracy(n, A_, lda, ipiv_, A_initial_)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t n, int64_t lda, int64_t stride_a,
-                    int64_t stride_ipiv, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<int64_t> ipiv(stride_ipiv * batch_size);
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto info =
-            reference::getrf(n, n, A.data() + i * stride_a, lda, ipiv.data() + i * stride_ipiv);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference getrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>(
-            queue, n, lda, stride_a, stride_ipiv, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<fp>, n, lda,
-            stride_a, stride_ipiv, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::getri_batch(
-            queue, n, A_dev, lda, stride_a, ipiv_dev, stride_ipiv, batch_size, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getri_batch, n, A_dev,
-                                  lda, stride_a, ipiv_dev, stride_ipiv, batch_size, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(GetriBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GetriBatchStride);
diff --git a/tests/unit_tests/lapack/source/getrs.cpp b/tests/unit_tests/lapack/source/getrs.cpp
deleted file mode 100644
index bfc271758..000000000
--- a/tests/unit_tests/lapack/source/getrs.cpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 27 13 29 31 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::transpose trans, int64_t n, int64_t nrhs,
-              int64_t lda, int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    std::vector<fp> B_initial(ldb * nrhs);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, ldb);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-    std::vector<int64_t> ipiv(n);
-
-    auto info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-    if (info != 0) {
-        test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::getrs_scratchpad_size<fp>(queue, trans, n, nrhs, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::getrs_scratchpad_size<fp>,
-                                  trans, n, nrhs, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, A_dev, lda, ipiv_dev, B_dev, ldb,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getrs, trans, n, nrhs, A_dev, lda,
-                                  ipiv_dev, B_dev, ldb, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, B_dev, B.data(), B.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_getrs_accuracy(trans, n, nrhs, B, ldb, A_initial, lda, B_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::transpose trans, int64_t n, int64_t nrhs,
-                    int64_t lda, int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    std::vector<fp> B_initial(ldb * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, ldb);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-    std::vector<int64_t> ipiv(n);
-
-    int64_t info = 0;
-    info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-    if (info != 0) {
-        test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::getrs_scratchpad_size<fp>(queue, trans, n, nrhs, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::getrs_scratchpad_size<fp>,
-                                  trans, n, nrhs, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::getrs(
-            queue, trans, n, nrhs, A_dev, lda, ipiv_dev, B_dev, ldb, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getrs, trans, n, nrhs,
-                                  A_dev, lda, ipiv_dev, B_dev, ldb, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Getrs);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Getrs);
diff --git a/tests/unit_tests/lapack/source/getrs_batch_group.cpp b/tests/unit_tests/lapack/source/getrs_batch_group.cpp
deleted file mode 100644
index 2027663e4..000000000
--- a/tests/unit_tests/lapack/source/getrs_batch_group.cpp
+++ /dev/null
@@ -1,392 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<oneapi::mkl::transpose> trans_vec = { oneapi::mkl::transpose::nontrans,
-                                                      oneapi::mkl::transpose::trans };
-    std::vector<int64_t> n_vec = { 4, 5 };
-    std::vector<int64_t> nrhs_vec = { 9, 6 };
-    std::vector<int64_t> lda_vec = { 6, 6 };
-    std::vector<int64_t> ldb_vec = { 9, 9 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> B_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> B_list;
-    std::list<std::vector<int64_t>> ipiv_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto trans = trans_vec[group_id];
-        auto n = n_vec[group_id];
-        auto nrhs = nrhs_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto ldb = ldb_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            auto& A = A_list.back();
-
-            B_initial_list.emplace_back(ldb * nrhs);
-            auto& B_initial = B_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, lda);
-
-            B_list.emplace_back(B_initial);
-            auto& B = B_list.back();
-
-            ipiv_list.emplace_back(n);
-            auto& ipiv = ipiv_list.back();
-
-            auto info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-            if (info != 0) {
-                test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> B_dev_list;
-        std::list<std::vector<int64_t, sycl::usm_allocator<int64_t, sycl::usm::alloc::shared>>>
-            ipiv_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** B_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        int64_t** ipiv_dev_ptrs = sycl::malloc_shared<int64_t*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        sycl::usm_allocator<int64_t, sycl::usm::alloc::shared> usm_int_allocator{
-            queue.get_context(), dev
-        };
-        auto A_iter = A_list.begin();
-        auto B_iter = B_list.begin();
-        auto ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_iter++, B_iter++, ipiv_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            B_dev_list.emplace_back(B_iter->size(), usm_fp_allocator);
-            ipiv_dev_list.emplace_back(ipiv_iter->size(), usm_int_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>(
-            queue, trans_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>,
-            trans_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto B_dev_iter = B_dev_list.begin();
-        auto ipiv_dev_iter = ipiv_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, B_dev_iter++, ipiv_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            B_dev_ptrs[global_id] = B_dev_iter->data();
-            ipiv_dev_ptrs[global_id] = ipiv_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        B_iter = B_list.begin();
-        ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_iter++, B_iter++, ipiv_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, B_iter->data(), B_dev_ptrs[global_id], B_iter->size());
-            host_to_device_copy(queue, ipiv_iter->data(), ipiv_dev_ptrs[global_id],
-                                ipiv_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getrs_batch(queue, trans_vec.data(), n_vec.data(), nrhs_vec.data(),
-                                         A_dev_ptrs, lda_vec.data(), ipiv_dev_ptrs, B_dev_ptrs,
-                                         ldb_vec.data(), group_count, group_sizes_vec.data(),
-                                         scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getrs_batch, trans_vec.data(),
-                                  n_vec.data(), nrhs_vec.data(), A_dev_ptrs, lda_vec.data(),
-                                  ipiv_dev_ptrs, B_dev_ptrs, ldb_vec.data(), group_count,
-                                  group_sizes_vec.data(), scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        B_iter = B_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, B_iter++) {
-            device_to_host_copy(queue, B_dev_ptrs[global_id], B_iter->data(), B_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (B_dev_ptrs) {
-            sycl::free(B_dev_ptrs, queue);
-        }
-        if (ipiv_dev_ptrs) {
-            sycl::free(ipiv_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto B_iter = B_list.begin();
-    auto ipiv_iter = ipiv_list.begin();
-    auto A_initial_iter = A_initial_list.begin();
-    auto B_initial_iter = B_initial_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto trans = trans_vec[group_id];
-        auto n = n_vec[group_id];
-        auto nrhs = nrhs_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto ldb = ldb_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size; local_id++, global_id++, A_iter++,
-                     B_iter++, ipiv_iter++, A_initial_iter++, B_initial_iter++) {
-            if (!check_getrs_accuracy(trans, n, nrhs, *B_iter, ldb, *A_initial_iter, lda,
-                                      *B_initial_iter)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<oneapi::mkl::transpose> trans_vec = { oneapi::mkl::transpose::nontrans };
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> nrhs_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> ldb_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> B_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> B_list;
-    std::list<std::vector<int64_t>> ipiv_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto trans = trans_vec[group_id];
-        auto n = n_vec[group_id];
-        auto nrhs = nrhs_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto ldb = ldb_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            auto& A = A_list.back();
-
-            B_initial_list.emplace_back(ldb * nrhs);
-            auto& B_initial = B_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, lda);
-
-            B_list.emplace_back(B_initial);
-            auto& B = B_list.back();
-
-            ipiv_list.emplace_back(n);
-            auto& ipiv = ipiv_list.back();
-
-            auto info = reference::getrf(n, n, A.data(), lda, ipiv.data());
-            if (info != 0) {
-                test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> B_dev_list;
-        std::list<std::vector<int64_t, sycl::usm_allocator<int64_t, sycl::usm::alloc::shared>>>
-            ipiv_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** B_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        int64_t** ipiv_dev_ptrs = sycl::malloc_shared<int64_t*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        sycl::usm_allocator<int64_t, sycl::usm::alloc::shared> usm_int_allocator{
-            queue.get_context(), dev
-        };
-        auto A_iter = A_list.begin();
-        auto B_iter = B_list.begin();
-        auto ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_iter++, B_iter++, ipiv_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            B_dev_list.emplace_back(B_iter->size(), usm_fp_allocator);
-            ipiv_dev_list.emplace_back(ipiv_iter->size(), usm_int_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>(
-            queue, trans_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>,
-            trans_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto B_dev_iter = B_dev_list.begin();
-        auto ipiv_dev_iter = ipiv_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, B_dev_iter++, ipiv_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            B_dev_ptrs[global_id] = B_dev_iter->data();
-            ipiv_dev_ptrs[global_id] = ipiv_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        B_iter = B_list.begin();
-        ipiv_iter = ipiv_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_iter++, B_iter++, ipiv_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, B_iter->data(), B_dev_ptrs[global_id], B_iter->size());
-            host_to_device_copy(queue, ipiv_iter->data(), ipiv_dev_ptrs[global_id],
-                                ipiv_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::getrs_batch(
-            queue, trans_vec.data(), n_vec.data(), nrhs_vec.data(), A_dev_ptrs, lda_vec.data(),
-            ipiv_dev_ptrs, B_dev_ptrs, ldb_vec.data(), group_count, group_sizes_vec.data(),
-            scratchpad_dev, scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getrs_batch,
-                                  trans_vec.data(), n_vec.data(), nrhs_vec.data(), A_dev_ptrs,
-                                  lda_vec.data(), ipiv_dev_ptrs, B_dev_ptrs, ldb_vec.data(),
-                                  group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (B_dev_ptrs) {
-            sycl::free(B_dev_ptrs, queue);
-        }
-        if (ipiv_dev_ptrs) {
-            sycl::free(ipiv_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM(GetrsBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GetrsBatchGroup);
diff --git a/tests/unit_tests/lapack/source/getrs_batch_stride.cpp b/tests/unit_tests/lapack/source/getrs_batch_stride.cpp
deleted file mode 100644
index 1faf3d3e6..000000000
--- a/tests/unit_tests/lapack/source/getrs_batch_stride.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 6 10 7 70 10 12 120 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::transpose trans, int64_t n, int64_t nrhs,
-              int64_t lda, int64_t stride_a, int64_t stride_ipiv, int64_t ldb, int64_t stride_b,
-              int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<fp> B_initial(stride_b * batch_size);
-    std::vector<int64_t> ipiv(stride_ipiv * batch_size);
-    for (int64_t i = 0; i < batch_size; i++) {
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda, i * stride_a);
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, ldb, i * stride_b);
-    }
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto info =
-            reference::getrf(n, n, A.data() + i * stride_a, lda, ipiv.data() + i * stride_ipiv);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference getrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>(
-            queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>, trans, n,
-            nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, A_dev, lda, stride_a, ipiv_dev,
-                                         stride_ipiv, B_dev, ldb, stride_b, batch_size,
-                                         scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::getrs_batch, trans, n, nrhs, A_dev,
-                                  lda, stride_a, ipiv_dev, stride_ipiv, B_dev, ldb, stride_b,
-                                  batch_size, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, B_dev, B.data(), B.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto B_ = copy_vector(B, ldb * nrhs, i * stride_b);
-        auto A_initial_ = copy_vector(A_initial, lda * n, i * stride_a);
-        auto B_initial_ = copy_vector(B_initial, ldb * nrhs, i * stride_b);
-        if (!check_getrs_accuracy(trans, n, nrhs, B_, ldb, A_initial_, lda, B_initial_)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::transpose trans, int64_t n, int64_t nrhs,
-                    int64_t lda, int64_t stride_a, int64_t stride_ipiv, int64_t ldb,
-                    int64_t stride_b, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<fp> B_initial(stride_b * batch_size);
-    std::vector<int64_t> ipiv(stride_ipiv * batch_size);
-    for (auto i = 0; i < batch_size; ++i) {
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A_initial, lda, i * stride_a);
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, nrhs, n, B_initial, ldb, i * stride_b);
-    }
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto info =
-            reference::getrf(n, n, A.data() + i * stride_a, lda, ipiv.data() + i * stride_ipiv);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference getrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>(
-            queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<fp>, trans, n,
-            nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::getrs_batch(
-            queue, trans, n, nrhs, A_dev, lda, stride_a, ipiv_dev, stride_ipiv, B_dev, ldb,
-            stride_b, batch_size, scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::getrs_batch, trans, n,
-                                  nrhs, A_dev, lda, stride_a, ipiv_dev, stride_ipiv, B_dev, ldb,
-                                  stride_b, batch_size, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(GetrsBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(GetrsBatchStride);
diff --git a/tests/unit_tests/lapack/source/heevd.cpp b/tests/unit_tests/lapack/source/heevd.cpp
deleted file mode 100644
index 62c23c3ad..000000000
--- a/tests/unit_tests/lapack/source/heevd.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 0 27 33 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, int64_t n,
-              int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::heevd_scratchpad_size<fp>(queue, jobz, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::heevd_scratchpad_size<fp>,
-                                  jobz, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, A_dev, lda, w_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::heevd, jobz, uplo, n, A_dev, lda,
-                                  w_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, w_dev, w.data(), w.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_sy_he_evd_accuracy(jobz, uplo, n, A, lda, w, A_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                    int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::heevd_scratchpad_size<fp>(queue, jobz, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::heevd_scratchpad_size<fp>,
-                                  jobz, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, A_dev, lda, w_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::heevd, jobz, uplo, n,
-                                  A_dev, lda, w_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Heevd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Heevd);
diff --git a/tests/unit_tests/lapack/source/hegvd.cpp b/tests/unit_tests/lapack/source/hegvd.cpp
deleted file mode 100644
index 9a109e6b8..000000000
--- a/tests/unit_tests/lapack/source/hegvd.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 1 0 27 33 31 27182
-2 1 0 27 33 31 27182
-3 1 0 27 33 31 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-              int64_t n, int64_t lda, int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-    std::vector<fp> B(ldb * n);
-    rand_pos_def_matrix(seed, uplo, n, B, ldb);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp> B_initial = B;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::hegvd_scratchpad_size<fp>(queue, itype, jobz, uplo, n, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::hegvd_scratchpad_size<fp>,
-                                  itype, jobz, uplo, n, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, A_dev, lda, B_dev, ldb, w_dev,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::hegvd, itype, jobz, uplo, n, A_dev,
-                                  lda, B_dev, ldb, w_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, B_dev, B.data(), B.size());
-        device_to_host_copy(queue, w_dev, w.data(), w.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    const auto& Z = A;
-    auto ldz = lda;
-    const auto& D = w;
-    hermitian_to_full(uplo, n, A_initial, lda);
-    hermitian_to_full(uplo, n, B_initial, ldb);
-    bool result = true;
-
-    /* |D_ref - D| < |D_ref| O(eps) */
-    std::vector<fp_real> D_ref(n);
-    reference::hegvd(itype, oneapi::mkl::job::novec, uplo, n, std::vector<fp>(A_initial).data(),
-                     lda, std::vector<fp>(B_initial).data(), ldb, D_ref.data());
-    if (!rel_vec_err_check(n, D_ref, D, 10.0)) {
-        test_log::lout << "Eigenvalue check failed" << std::endl;
-        result = false;
-    }
-
-    if (oneapi::mkl::job::vec == jobz) {
-        if (itype == 1) {
-            /* |A Z - B Z D| < |A Z| O(eps) */
-            std::vector<fp> AZ(n * n);
-            int64_t ldaz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, A_initial.data(), lda, Z.data(), ldz, 0.0, AZ.data(), ldaz);
-
-            std::vector<fp> BZ(n * n);
-            int64_t ldbz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, B_initial.data(), ldb, Z.data(), ldz, 0.0, BZ.data(), ldbz);
-
-            std::vector<fp> BZD(n * n);
-            int64_t ldbzd = n;
-            for (int64_t col = 0; col < n; col++)
-                for (int64_t row = 0; row < n; row++)
-                    BZD[row + col * ldbzd] = BZ[row + col * ldbz] * D[col];
-
-            if (!rel_mat_err_check(n, n, AZ, ldaz, BZD, ldbzd)) {
-                test_log::lout << "Factorization check failed" << std::endl;
-                result = false;
-            }
-
-            /* |I - Z' B Z| < n O(eps) */
-            std::vector<fp> ZBZ(n * n);
-            int64_t ldzbz = n;
-            reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, Z.data(), ldz, BZ.data(), ldbz, 0.0, ZBZ.data(), ldzbz);
-            if (!rel_id_err_check(n, ZBZ, ldzbz)) {
-                test_log::lout << "Orthogonality check failed" << std::endl;
-                result = false;
-            }
-        }
-        else if (itype == 2) {
-            /* |A B Z - Z D| < |A B Z| O(eps) */
-            std::vector<fp> BZ(n * n);
-            int64_t ldbz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, B_initial.data(), ldb, Z.data(), ldz, 0.0, BZ.data(), ldbz);
-
-            std::vector<fp> ABZ(n * n);
-            int64_t ldabz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, A_initial.data(), lda, BZ.data(), ldbz, 0.0, ABZ.data(),
-                            ldabz);
-
-            std::vector<fp> ZD(n * n);
-            int64_t ldzd = n;
-            for (int64_t col = 0; col < n; col++)
-                for (int64_t row = 0; row < n; row++)
-                    ZD[row + col * ldzd] = Z[row + col * ldz] * D[col];
-
-            if (!rel_mat_err_check(n, n, ABZ, ldabz, ZD, ldbz)) {
-                test_log::lout << "Factorization check failed" << std::endl;
-                result = false;
-            }
-
-            /* |I - Z' B Z| < n O(eps) */
-            std::vector<fp> ZBZ(n * n);
-            int64_t ldzbz = n;
-            reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, Z.data(), ldz, BZ.data(), ldbz, 0.0, ZBZ.data(), ldzbz);
-            if (!rel_id_err_check(n, ZBZ, ldzbz)) {
-                test_log::lout << "Orthogonality check failed" << std::endl;
-                result = false;
-            }
-        }
-        else {
-            /* |A Z - B^-1 Z D| < |A Z| O(eps) */
-            /* C = B^-1 Z */
-            std::vector<fp> AZ(n * n);
-            int64_t ldaz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, A_initial.data(), lda, Z.data(), ldz, 0.0, AZ.data(), ldaz);
-
-            std::vector<fp> C(n * n);
-            int64_t ldc = n;
-            reference::lacpy('A', n, n, Z.data(), ldz, C.data(), ldc);
-            auto info = reference::potrs(uplo, n, n, B.data(), ldb, C.data(), ldc);
-            if (0 != info) {
-                test_log::lout << "reference potrs failed with info = " << info << std::endl;
-                return false;
-            }
-
-            std::vector<fp> CD(n * n);
-            int64_t ldcd = n;
-            for (int64_t col = 0; col < n; col++)
-                for (int64_t row = 0; row < n; row++)
-                    CD[row + col * ldcd] = C[row + col * ldc] * D[col];
-
-            if (!rel_mat_err_check(n, n, AZ, ldaz, CD, ldcd)) {
-                test_log::lout << "Factorization check failed" << std::endl;
-                result = false;
-            }
-
-            /* |I - Z' B^-1 Z| = |I - Z' C| < n O(eps) */
-            std::vector<fp> ZhC(n * n);
-            int64_t ldzhc = n;
-            reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, Z.data(), ldz, C.data(), ldc, 0.0, ZhC.data(), ldzhc);
-            if (!rel_id_err_check(n, ZhC, ldzhc)) {
-                test_log::lout << "Orthogonality check failed" << std::endl;
-                result = false;
-            }
-        }
-    }
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t itype, oneapi::mkl::job jobz,
-                    oneapi::mkl::uplo uplo, int64_t n, int64_t lda, int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-    std::vector<fp> B(ldb * n);
-    rand_pos_def_matrix(seed, uplo, n, B, ldb);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp> B_initial = B;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::hegvd_scratchpad_size<fp>(queue, itype, jobz, uplo, n, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::hegvd_scratchpad_size<fp>,
-                                  itype, jobz, uplo, n, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::hegvd(
-            queue, itype, jobz, uplo, n, A_dev, lda, B_dev, ldb, w_dev, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::hegvd, itype, jobz, uplo,
-                                  n, A_dev, lda, B_dev, ldb, w_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Hegvd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Hegvd);
diff --git a/tests/unit_tests/lapack/source/hetrd.cpp b/tests/unit_tests/lapack/source/hetrd.cpp
deleted file mode 100644
index 13172d64f..000000000
--- a/tests/unit_tests/lapack/source/hetrd.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 33 35 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto d_dev = device_alloc<data_T, fp_real>(queue, d.size());
-        auto e_dev = device_alloc<data_T, fp_real>(queue, e.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::hetrd_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::hetrd_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, d.data(), d_dev, d.size());
-        host_to_device_copy(queue, e.data(), e_dev, e.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::hetrd(queue, uplo, n, A_dev, lda, d_dev, e_dev, tau_dev,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::hetrd, uplo, n, A_dev, lda, d_dev,
-                                  e_dev, tau_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, d_dev, d.data(), d.size());
-        device_to_host_copy(queue, e_dev, e.data(), e.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, d_dev);
-        device_free(queue, e_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    hermitian_to_full(uplo, n, A_initial, lda);
-    bool result = true;
-
-    /* |A - Q T Q'| < |A| O(eps) */
-    std::vector<fp> T(n * n);
-    int64_t ldt = n;
-    for (int64_t diag = 0; diag < n; diag++)
-        T[diag + diag * ldt] = d[diag];
-    for (int64_t diag = 0; diag < n - 1; diag++) {
-        T[diag + (diag + 1) * ldt] = e[diag];
-        T[diag + 1 + (diag)*ldt] = e[diag];
-    }
-
-    std::vector<fp> QTQ{ T };
-    int64_t ldqtq = n;
-    reference::or_un_mtr(oneapi::mkl::side::left, uplo, oneapi::mkl::transpose::nontrans, n, n,
-                         A.data(), lda, tau.data(), QTQ.data(), ldqtq);
-    reference::or_un_mtr(oneapi::mkl::side::right, uplo, oneapi::mkl::transpose::conjtrans, n, n,
-                         A.data(), lda, tau.data(), QTQ.data(), ldqtq);
-
-    if (!rel_mat_err_check(n, n, QTQ, ldqtq, A_initial, lda)) {
-        test_log::lout << "Factorization check failed" << std::endl;
-        result = false;
-    }
-
-    /* A[i, i] = d[i] */
-    for (int64_t diag = 0; diag < n; diag++)
-        d[diag] -= A[diag + diag * lda].real();
-    if (uplo == oneapi::mkl::uplo::upper)
-        for (int64_t diag = 0; diag < n - 1; diag++)
-            e[diag] -= A[diag + (diag + 1) * lda].real();
-    else
-        for (int64_t diag = 0; diag < n - 1; diag++)
-            e[diag] -= A[diag + 1 + (diag)*lda].real();
-
-    auto ulp = reference::lamch<fp_real>('P');
-    if (reference::lange('I', n, 1, d.data(), n) > 10.0 * ulp) {
-        test_log::lout << "Diagonal check failed" << std::endl;
-        result = false;
-    }
-    if (reference::lange('I', n - 1, 1, e.data(), n - 1) > 10.0 * ulp) {
-        test_log::lout << "Off-diagonal check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto d_dev = device_alloc<data_T, fp_real>(queue, d.size());
-        auto e_dev = device_alloc<data_T, fp_real>(queue, e.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::hetrd_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::hetrd_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, d.data(), d_dev, d.size());
-        host_to_device_copy(queue, e.data(), e_dev, e.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::hetrd(
-            queue, uplo, n, A_dev, lda, d_dev, e_dev, tau_dev, scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::hetrd, uplo, n, A_dev,
-                                  lda, d_dev, e_dev, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, d_dev);
-        device_free(queue, e_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Hetrd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Hetrd);
diff --git a/tests/unit_tests/lapack/source/hetrf.cpp b/tests/unit_tests/lapack/source/hetrf.cpp
deleted file mode 100644
index 73535a77f..000000000
--- a/tests/unit_tests/lapack/source/hetrf.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 27 33 27182
-0 42 45 27182
-1 27 33 27182
-1 42 45 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<int64_t> ipiv(n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::hetrf_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::hetrf_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::hetrf(queue, uplo, n, A_dev, lda, ipiv_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::hetrf, uplo, n, A_dev, lda, ipiv_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, ipiv_dev, ipiv.data(), ipiv.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    std::vector<fp> U(n * n);
-    std::vector<fp> Uk(n * n);
-    int64_t ldu = n;
-    std::vector<fp> D(n * n);
-    int64_t ldd = n;
-    hermitian_to_full(uplo, n, A_initial, lda);
-    bool result = true;
-
-    for (int64_t d = 0; d < n; d++)
-        U[d + d * ldu] = 1.0;
-
-    if (uplo == oneapi::mkl::uplo::upper) {
-        int64_t k = n - 1;
-        while (k >= 0) {
-            reference::laset('A', n, n, 0.0, 1.0, Uk.data(), ldu);
-            if (ipiv[k] > 0) { /* 1x1 block case */
-
-                auto piv = ipiv[k] - 1;
-                for (int64_t i = 0; i < k; i++)
-                    Uk[i + k * ldu] = A[i + k * lda];
-                if (piv != k)
-                    reference::swap(n, Uk.data() + (k + 0 * ldu), ldu, Uk.data() + (piv + 0 * ldu),
-                                    ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + k * ldd] = A[k + k * lda];
-                k -= 1;
-            }
-            else { /* 2x2 block case */
-
-                auto piv = -ipiv[k] - 1;
-                for (int64_t i = 0; i < k - 1; i++) {
-                    Uk[i + k * ldu] = A[i + k * lda];
-                    Uk[i + (k - 1) * ldu] = A[i + (k - 1) * lda];
-                }
-                if (piv != k - 1)
-                    reference::swap(n, Uk.data() + (k - 1 + 0 * ldu), ldu,
-                                    Uk.data() + (piv + 0 * ldu), ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + k * ldd] = A[k + k * lda];
-                D[k - 1 + (k - 1) * ldd] = A[k - 1 + (k - 1) * lda];
-                D[k - 1 + k * ldd] = A[k - 1 + k * lda];
-                D[k + (k - 1) * ldd] = get_conj(A[k - 1 + k * lda]);
-                k -= 2;
-            }
-        }
-    }
-    else {
-        int64_t k = 0;
-        while (k < n) {
-            reference::laset('A', n, n, 0.0, 1.0, Uk.data(), ldu);
-            if (ipiv[k] > 0) { /* 1x1 block case */
-
-                auto piv = ipiv[k] - 1;
-                for (int64_t i = k + 1; i < n; i++)
-                    Uk[i + k * ldu] = A[i + k * lda];
-                if (piv != k)
-                    reference::swap(n, Uk.data() + (k + 0 * lda), ldu, Uk.data() + (piv + 0 * ldu),
-                                    ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + (k)*ldd] = A[k + (k)*lda];
-                k += 1;
-            }
-            else { /* 2x2 block case */
-
-                auto piv = -ipiv[k] - 1;
-                for (int64_t i = k + 2; i < n; i++) {
-                    Uk[i + k * ldu] = A[i + k * lda];
-                    Uk[i + (k + 1) * ldu] = A[i + (k + 1) * lda];
-                }
-                if (piv != k)
-                    reference::swap(n, Uk.data() + (k + 1 + 0 * ldu), ldu,
-                                    Uk.data() + (piv + 0 * ldu), ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + k * ldd] = A[k + k * lda];
-                D[k + 1 + (k + 1) * ldd] = A[k + 1 + (k + 1) * lda];
-                D[k + 1 + k * ldd] = A[k + 1 + k * lda];
-                D[k + (k + 1) * ldd] = get_conj(A[k + 1 + k * lda]);
-                k += 2;
-            }
-        }
-    }
-
-    /* |A - UDU'| < |A| O(eps) */
-    std::vector<fp> UD(n * n);
-    int64_t ldud = n;
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n, n, n,
-                    1.0, U.data(), ldu, D.data(), ldd, 0.0, UD.data(), ldud);
-
-    std::vector<fp> UDU(n * n);
-    int64_t ldudu = n;
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::conjtrans, n, n, n,
-                    1.0, UD.data(), ldud, U.data(), ldu, 0.0, UDU.data(), ldudu);
-
-    if (!rel_mat_err_check(n, n, UDU, ldudu, A_initial, lda)) {
-        test_log::lout << "Factorization check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<int64_t> ipiv(n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::hetrf_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::hetrf_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::hetrf(queue, uplo, n, A_dev, lda, ipiv_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::hetrf, uplo, n, A_dev,
-                                  lda, ipiv_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Hetrf);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Hetrf);
diff --git a/tests/unit_tests/lapack/source/orgbr.cpp b/tests/unit_tests/lapack/source/orgbr.cpp
deleted file mode 100644
index 274cafce0..000000000
--- a/tests/unit_tests/lapack/source/orgbr.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 29 25 25 30 27182
-0 29 25 20 30 27182
-1 25 29 25 30 27182
-1 25 29 20 30 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k,
-              int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t m_A = m;
-    int64_t n_A = n;
-
-    if (vect == oneapi::mkl::generate::Q)
-        n_A = k;
-    else /* vect == oneapi::mkl::generate::P */
-        m_A = k;
-
-    int64_t min_mn_A = std::min<int64_t>(m_A, n_A);
-
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(min_mn_A);
-    std::vector<fp_real> e(std::max<int64_t>(min_mn_A - 1, 1));
-    std::vector<fp> tauq(min_mn_A);
-    std::vector<fp> taup(min_mn_A);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m_A, n_A, A, lda);
-    reference::gebrd(m_A, n_A, A.data(), lda, d.data(), e.data(), tauq.data(), taup.data());
-
-    auto& tau = (vect == oneapi::mkl::generate::Q) ? tauq : taup;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::orgbr_scratchpad_size<fp>(queue, vect, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::orgbr_scratchpad_size<fp>,
-                                  vect, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::orgbr(queue, vect, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::orgbr, vect, m, n, k, A_dev, lda,
-                                  tau_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_or_un_gbr_accuracy(vect, m, n, k, A, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::generate vect, int64_t m, int64_t n,
-                    int64_t k, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t m_A = m;
-    int64_t n_A = n;
-
-    if (vect == oneapi::mkl::generate::Q)
-        n_A = k;
-    else /* vect == oneapi::mkl::generate::P */
-        m_A = k;
-
-    int64_t min_mn_A = std::min<int64_t>(m_A, n_A);
-
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(min_mn_A);
-    std::vector<fp_real> e(std::max<int64_t>(min_mn_A - 1, 1));
-    std::vector<fp> tauq(min_mn_A);
-    std::vector<fp> taup(min_mn_A);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m_A, n_A, A, lda);
-    reference::gebrd(m_A, n_A, A.data(), lda, d.data(), e.data(), tauq.data(), taup.data());
-
-    auto& tau = (vect == oneapi::mkl::generate::Q) ? tauq : taup;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::orgbr_scratchpad_size<fp>(queue, vect, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::orgbr_scratchpad_size<fp>,
-                                  vect, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::orgbr(queue, vect, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::orgbr, vect, m, n, k,
-                                  A_dev, lda, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Orgbr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Orgbr);
diff --git a/tests/unit_tests/lapack/source/orgqr.cpp b/tests/unit_tests/lapack/source/orgqr.cpp
deleted file mode 100644
index 9d62daf5f..000000000
--- a/tests/unit_tests/lapack/source/orgqr.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-50  50  30  70 27182
-50  30  30  70 27182
-50  30  10  70 27182
-200 200 180 220 27182
-200 180 180 220 27182
-200 180 160 220 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp> tau(k);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-    auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::orgqr_scratchpad_size<fp>(queue, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgqr_scratchpad_size<fp>, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::orgqr(queue, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::orgqr, m, n, k, A_dev, lda, tau_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_or_un_gqr_accuracy(m, n, A, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::orgqr_scratchpad_size<fp>(queue, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgqr_scratchpad_size<fp>, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::orgqr(queue, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::orgqr, m, n, k, A_dev,
-                                  lda, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Orgqr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Orgqr);
diff --git a/tests/unit_tests/lapack/source/orgqr_batch_group.cpp b/tests/unit_tests/lapack/source/orgqr_batch_group.cpp
deleted file mode 100644
index 3af796e7d..000000000
--- a/tests/unit_tests/lapack/source/orgqr_batch_group.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 5, 5 };
-    std::vector<int64_t> n_vec = { 3, 4 };
-    std::vector<int64_t> k_vec = { 2, 4 };
-    std::vector<int64_t> lda_vec = { 5, 6 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> tau_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto k = k_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_list.emplace_back(lda * n);
-            auto& A = A_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-
-            tau_list.emplace_back(k);
-            auto& tau = tau_list.back();
-            auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-            if (0 != info) {
-                test_log::lout << "reference geqrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> tau_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** tau_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            tau_dev_list.emplace_back(tau_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto tau_dev_iter = tau_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, tau_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            tau_dev_ptrs[global_id] = tau_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, tau_iter->data(), tau_dev_ptrs[global_id], tau_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::orgqr_batch(queue, m_vec.data(), n_vec.data(), k_vec.data(),
-                                         A_dev_ptrs, lda_vec.data(), tau_dev_ptrs, group_count,
-                                         group_sizes_vec.data(), scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::orgqr_batch, m_vec.data(),
-                                  n_vec.data(), k_vec.data(), A_dev_ptrs, lda_vec.data(),
-                                  tau_dev_ptrs, group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            device_to_host_copy(queue, A_dev_ptrs[global_id], A_iter->data(), A_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (tau_dev_ptrs) {
-            sycl::free(tau_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto tau_iter = tau_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size;
-             local_id++, global_id++, A_iter++, tau_iter++) {
-            if (!check_or_un_gqr_accuracy(m, n, *A_iter, lda)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 1 };
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> k_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> tau_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto k = k_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_list.emplace_back(lda * n);
-            auto& A = A_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-
-            tau_list.emplace_back(k);
-            auto& tau = tau_list.back();
-            auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-            if (0 != info) {
-                test_log::lout << "reference geqrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> tau_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** tau_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            tau_dev_list.emplace_back(tau_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto tau_dev_iter = tau_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, tau_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            tau_dev_ptrs[global_id] = tau_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, tau_iter->data(), tau_dev_ptrs[global_id], tau_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::orgqr_batch(
-            queue, m_vec.data(), n_vec.data(), k_vec.data(), A_dev_ptrs, lda_vec.data(),
-            tau_dev_ptrs, group_count, group_sizes_vec.data(), scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::orgqr_batch,
-                                  m_vec.data(), n_vec.data(), k_vec.data(), A_dev_ptrs,
-                                  lda_vec.data(), tau_dev_ptrs, group_count, group_sizes_vec.data(),
-                                  scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (tau_dev_ptrs) {
-            sycl::free(tau_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM_REAL(OrgqrBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(OrgqrBatchGroup);
diff --git a/tests/unit_tests/lapack/source/orgqr_batch_stride.cpp b/tests/unit_tests/lapack/source/orgqr_batch_stride.cpp
deleted file mode 100644
index 1cf3471c5..000000000
--- a/tests/unit_tests/lapack/source/orgqr_batch_stride.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-29 23 18 37 1024 40 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-              int64_t stride_a, int64_t stride_tau, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(stride_a * batch_size);
-    std::vector<fp> tau(stride_tau * batch_size);
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda, i * stride_a);
-        auto info =
-            reference::geqrf(m, k, A.data() + i * stride_a, lda, tau.data() + i * stride_tau);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference geqrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>(
-            queue, m, n, k, lda, stride_a, stride_tau, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>, m, n, k,
-            lda, stride_a, stride_tau, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, A_dev, lda, stride_a, tau_dev, stride_tau,
-                                         batch_size, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::orgqr_batch, m, n, k, A_dev, lda,
-                                  stride_a, tau_dev, stride_tau, batch_size, scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto A_ = copy_vector(A, lda * n, i * stride_a);
-        if (!check_or_un_gqr_accuracy(m, n, A_, lda)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    int64_t stride_a, int64_t stride_tau, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(stride_a * batch_size);
-    std::vector<fp> tau(stride_tau * batch_size);
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda, i * stride_a);
-        auto info =
-            reference::geqrf(m, k, A.data() + i * stride_a, lda, tau.data() + i * stride_tau);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference geqrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>(
-            queue, m, n, k, lda, stride_a, stride_tau, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgqr_batch_scratchpad_size<fp>, m, n, k,
-            lda, stride_a, stride_tau, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::orgqr_batch(
-            queue, m, n, k, A_dev, lda, stride_a, tau_dev, stride_tau, batch_size, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::orgqr_batch, m, n, k,
-                                  A_dev, lda, stride_a, tau_dev, stride_tau, batch_size,
-                                  scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(OrgqrBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(OrgqrBatchStride);
diff --git a/tests/unit_tests/lapack/source/orgtr.cpp b/tests/unit_tests/lapack/source/orgtr.cpp
deleted file mode 100644
index 5a01745d5..000000000
--- a/tests/unit_tests/lapack/source/orgtr.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 41 59 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    auto info = reference::sytrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference sytrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::orgtr_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgtr_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::orgtr(queue, uplo, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::orgtr, uplo, n, A_dev, lda, tau_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_or_un_gtr_accuracy(n, A, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    auto info = reference::sytrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference sytrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::orgtr_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::orgtr_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::orgtr(queue, uplo, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::orgtr, uplo, n, A_dev,
-                                  lda, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-        queue.wait_and_throw();
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Orgtr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Orgtr);
diff --git a/tests/unit_tests/lapack/source/ormqr.cpp b/tests/unit_tests/lapack/source/ormqr.cpp
deleted file mode 100644
index e2ed49b96..000000000
--- a/tests/unit_tests/lapack/source/ormqr.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 0 50 70 50 70 70 27182
-0 0 20 22 20 22 22 27182
-0 1 50 70 50 70 70 27182
-0 1 20 22 20 22 22 27182
-1 0 50 70 70 90 70 27182
-1 0 20 22 22 24 22 27182
-1 1 50 70 70 90 70 27182
-1 1 20 22 22 24 22 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::side left_right, oneapi::mkl::transpose trans,
-              int64_t m, int64_t n, int64_t k, int64_t lda, int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq = (left_right == oneapi::mkl::side::left) ? m : n;
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::geqrf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ormqr_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ormqr_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ormqr(queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev,
-                                   ldc, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ormqr, left_right, trans, m, n, k,
-                                  A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, C_dev, C.data(), C.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-    bool result = true;
-
-    /* |Q C - QC| < |QC| O(eps) */
-    const auto& QC = C;
-    auto& QC_ref = C_initial;
-    auto ldqc = ldc;
-    info = reference::or_un_mqr(left_right, trans, m, n, k, A.data(), lda, tau.data(),
-                                QC_ref.data(), ldqc);
-    if (0 != info) {
-        test_log::lout << "reference ormqr failed with info: " << info << std::endl;
-        return false;
-    }
-    if (!rel_mat_err_check(m, n, QC, ldqc, QC_ref, ldqc, 1.0)) {
-        test_log::lout << "Multiplication check failed" << std::endl;
-        result = false;
-    }
-    return result;
-}
-
-const char* dependency_input = R"(
-0 0 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::side left_right,
-                    oneapi::mkl::transpose trans, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq = (left_right == oneapi::mkl::side::left) ? m : n;
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::geqrf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ormqr_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ormqr_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::ormqr(
-            queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ormqr, left_right, trans,
-                                  m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Ormqr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Ormqr);
diff --git a/tests/unit_tests/lapack/source/ormrq.cpp b/tests/unit_tests/lapack/source/ormrq.cpp
deleted file mode 100644
index 4882e5bc7..000000000
--- a/tests/unit_tests/lapack/source/ormrq.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 0 30 30 30 33 31 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::side left_right, oneapi::mkl::transpose trans,
-              int64_t m, int64_t n, int64_t k, int64_t lda, int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq;
-    if (left_right == oneapi::mkl::side::left) {
-        if (k > m) {
-            test_log::lout << "Bad test input, side == left and k > m (" << k << " > " << m << ")"
-                           << std::endl;
-            return false;
-        }
-        nq = m;
-    }
-    else {
-        if (k > n) {
-            test_log::lout << "Bad test input, side == right and k > n (" << k << " > " << n << ")"
-                           << std::endl;
-            return false;
-        }
-        nq = n;
-    }
-
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::gerqf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference gerqf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ormrq_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ormrq_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ormrq(queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev,
-                                   ldc, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ormrq, left_right, trans, m, n, k,
-                                  A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, C_dev, C.data(), C.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-    bool result = true;
-
-    /* |Q C - QC| < |QC| O(eps) */
-    const auto& QC = C;
-    auto& QC_ref = C_initial;
-    auto ldqc = ldc;
-    info = reference::or_un_mrq(left_right, trans, m, n, k, A.data(), lda, tau.data(),
-                                QC_ref.data(), ldqc);
-    if (0 != info) {
-        test_log::lout << "reference ormrq failed with info = " << info << std::endl;
-        return false;
-    }
-    if (!rel_mat_err_check(m, n, QC, ldqc, QC_ref, ldqc, 1.0)) {
-        test_log::lout << "Multiplication check failed" << std::endl;
-        result = false;
-    }
-    return result;
-}
-
-const char* dependency_input = R"(
-0 0 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::side left_right,
-                    oneapi::mkl::transpose trans, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq = (left_right == oneapi::mkl::side::left) ? m : n;
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::gerqf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference gerqf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ormrq_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ormrq_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::ormrq(
-            queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ormrq, left_right, trans,
-                                  m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Ormrq);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Ormrq);
diff --git a/tests/unit_tests/lapack/source/ormtr.cpp b/tests/unit_tests/lapack/source/ormtr.cpp
deleted file mode 100644
index 4e8dd95b9..000000000
--- a/tests/unit_tests/lapack/source/ormtr.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 31 33 35 37 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t m, int64_t n, int64_t lda,
-              int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    oneapi::mkl::side side = oneapi::mkl::side::right;
-    oneapi::mkl::transpose trans = oneapi::mkl::transpose::nontrans;
-
-    std::vector<fp> A(n * lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-
-    std::vector<fp> tau(n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    auto info = reference::sytrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference sytrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    std::vector<fp> C(n * ldc);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C, ldc);
-    std::vector<fp> C_initial = C;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ormtr_scratchpad_size<fp>(
-            queue, side, uplo, trans, m, n, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ormtr_scratchpad_size<fp>,
-                                  side, uplo, trans, m, n, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, A_dev, lda, tau_dev, C_dev, ldc,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ormtr, side, uplo, trans, m, n, A_dev,
-                                  lda, tau_dev, C_dev, ldc, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        device_to_host_copy(queue, C_dev, C.data(), C.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-    }
-    bool result = true;
-
-    auto& C_ref = C_initial;
-    info = reference::ormtr(side, uplo, trans, m, n, A.data(), lda, tau.data(), C_ref.data(), ldc);
-    if (0 != info) {
-        test_log::lout << "reference ormtr failed with info = " << info << std::endl;
-        return false;
-    }
-    if (!rel_mat_err_check(m, n, C, ldc, C_ref, ldc)) {
-        test_log::lout << "Multiplication check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t m, int64_t n,
-                    int64_t lda, int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    oneapi::mkl::side side = oneapi::mkl::side::right;
-    oneapi::mkl::transpose trans = oneapi::mkl::transpose::nontrans;
-
-    std::vector<fp> A(n * lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-
-    std::vector<fp> tau(n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    auto info = reference::sytrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference sytrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    std::vector<fp> C(n * ldc);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C, ldc);
-    std::vector<fp> C_initial = C;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ormtr_scratchpad_size<fp>(
-            queue, side, uplo, trans, m, n, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ormtr_scratchpad_size<fp>,
-                                  side, uplo, trans, m, n, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::ormtr(
-            queue, side, uplo, trans, m, n, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ormtr, side, uplo, trans,
-                                  m, n, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-        queue.wait_and_throw();
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Ormtr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Ormtr);
diff --git a/tests/unit_tests/lapack/source/potrf.cpp b/tests/unit_tests/lapack/source/potrf.cpp
deleted file mode 100644
index 7d2df8ea9..000000000
--- a/tests/unit_tests/lapack/source/potrf.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 31 33 27182
-0 43 43 27182
-1 23 25 27182
-1 52 52 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::potrf_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrf_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::potrf(queue, uplo, n, A_dev, lda, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::potrf, uplo, n, A_dev, lda,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_potrf_accuracy(A_initial, A, uplo, n, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-
-    std::vector<fp> A = A_initial;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::potrf_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrf_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::potrf(queue, uplo, n, A_dev, lda, scratchpad_dev, scratchpad_size,
-                                       std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::potrf, uplo, n, A_dev,
-                                  lda, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Potrf);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Potrf);
diff --git a/tests/unit_tests/lapack/source/potrf_batch_group.cpp b/tests/unit_tests/lapack/source/potrf_batch_group.cpp
deleted file mode 100644
index 4a5b8dd58..000000000
--- a/tests/unit_tests/lapack/source/potrf_batch_group.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<oneapi::mkl::uplo> uplo_vec = { oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::uplo::lower };
-    std::vector<int64_t> n_vec = { 4, 4 };
-    std::vector<int64_t> lda_vec = { 5, 5 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto uplo = uplo_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>(
-            queue, uplo_vec.data(), n_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>,
-            uplo_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::potrf_batch(queue, uplo_vec.data(), n_vec.data(), A_dev_ptrs,
-                                         lda_vec.data(), group_count, group_sizes_vec.data(),
-                                         scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::potrf_batch, uplo_vec.data(),
-                                  n_vec.data(), A_dev_ptrs, lda_vec.data(), group_count,
-                                  group_sizes_vec.data(), scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            device_to_host_copy(queue, A_dev_ptrs[global_id], A_iter->data(), A_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto A_initial_iter = A_initial_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto uplo = uplo_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size;
-             local_id++, global_id++, A_iter++, A_initial_iter++) {
-            if (!check_potrf_accuracy(*A_initial_iter, *A_iter, uplo, n, lda)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<oneapi::mkl::uplo> uplo_vec = { oneapi::mkl::uplo::upper };
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> A_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto uplo = uplo_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>(
-            queue, uplo_vec.data(), n_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>,
-            uplo_vec.data(), n_vec.data(), lda_vec.data(), group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::potrf_batch(
-            queue, uplo_vec.data(), n_vec.data(), A_dev_ptrs, lda_vec.data(), group_count,
-            group_sizes_vec.data(), scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::potrf_batch,
-                                  uplo_vec.data(), n_vec.data(), A_dev_ptrs, lda_vec.data(),
-                                  group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM(PotrfBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(PotrfBatchGroup);
diff --git a/tests/unit_tests/lapack/source/potrf_batch_stride.cpp b/tests/unit_tests/lapack/source/potrf_batch_stride.cpp
deleted file mode 100644
index fae4f0bcc..000000000
--- a/tests/unit_tests/lapack/source/potrf_batch_stride.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 4 15 123 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              int64_t stride_a, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_pos_def_matrix(seed, uplo, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>(
-            queue, uplo, n, lda, stride_a, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>, uplo, n,
-            lda, stride_a, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::potrf_batch(queue, uplo, n, A_dev, lda, stride_a, batch_size,
-                                         scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::potrf_batch, uplo, n, A_dev, lda,
-                                  stride_a, batch_size, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto A_ = copy_vector(A, lda * n, i * stride_a);
-        auto A_initial_ = copy_vector(A_initial, lda * n, i * stride_a);
-        if (!check_potrf_accuracy(A_initial_, A_, uplo, n, lda)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    int64_t stride_a, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    for (int64_t i = 0; i < batch_size; i++)
-        rand_pos_def_matrix(seed, uplo, n, A_initial, lda, i * stride_a);
-
-    std::vector<fp> A = A_initial;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>(
-            queue, uplo, n, lda, stride_a, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<fp>, uplo, n,
-            lda, stride_a, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::potrf_batch(
-            queue, uplo, n, A_dev, lda, stride_a, batch_size, scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::potrf_batch, uplo, n,
-                                  A_dev, lda, stride_a, batch_size, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(PotrfBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(PotrfBatchStride);
diff --git a/tests/unit_tests/lapack/source/potri.cpp b/tests/unit_tests/lapack/source/potri.cpp
deleted file mode 100644
index cd2f86449..000000000
--- a/tests/unit_tests/lapack/source/potri.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 30 42 27182
-0 33 33 27182
-1 31 41 27182
-1 45 45 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(n * lda);
-    rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-    std::vector<fp> A = A_initial;
-
-    auto info = reference::potrf(uplo, n, A.data(), lda);
-    if (0 != info) {
-        test_log::lout << "Reference potrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::potri_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potri_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::potri(queue, uplo, n, A_dev, lda, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::potri, uplo, n, A_dev, lda,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    hermitian_to_full(uplo, n, A, lda);
-    hermitian_to_full(uplo, n, A_initial, lda);
-
-    auto norm_A = reference::lange('1', n, n, A_initial.data(), lda);
-    auto norm_Ainv = reference::lange('1', n, n, A.data(), lda);
-    auto ulp = reference::lamch<fp_real>('P');
-
-    std::vector<fp> resid(n * n);
-    int64_t ldr = n;
-    for (int64_t diag = 0; diag < n; diag++)
-        resid[diag + diag * ldr] = static_cast<fp>(1.0);
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n, n, n,
-                    1.0, A_initial.data(), lda, A.data(), lda, -1.0, resid.data(), ldr);
-    auto norm_resid = reference::lange('1', n, n, resid.data(), ldr);
-    auto rel_err = norm_resid / (norm_A * norm_Ainv * n * ulp);
-    fp_real threshold = 30.0;
-
-    bool result = rel_err < threshold;
-    if (!result) {
-        snprintf(test_log::buffer.data(), test_log::buffer.size(),
-                 "|A inv(A) - I| / ( |A| |inv(A)| n ulp ) = |%e|/(|%e|*|%e|*%d*%e) = %e\n",
-                 norm_resid, norm_A, norm_Ainv, static_cast<int>(n), ulp, rel_err);
-        test_log::lout << test_log::buffer.data();
-        test_log::lout << "threshold = " << threshold << std::endl;
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(n * lda);
-    rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-    std::vector<fp> A = A_initial;
-
-    auto info = reference::potrf(uplo, n, A.data(), lda);
-    if (0 != info) {
-        test_log::lout << "Reference potrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::potri_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potri_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::potri(queue, uplo, n, A_dev, lda, scratchpad_dev, scratchpad_size,
-                                       std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::potri, uplo, n, A_dev,
-                                  lda, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Potri);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Potri);
diff --git a/tests/unit_tests/lapack/source/potrs.cpp b/tests/unit_tests/lapack/source/potrs.cpp
deleted file mode 100644
index c534ec8ba..000000000
--- a/tests/unit_tests/lapack/source/potrs.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 30 4 42 31 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t nrhs, int64_t lda,
-              int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    std::vector<fp> B_initial(ldb * nrhs);
-    rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, ldb);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-
-    auto info = reference::potrf(uplo, n, A.data(), lda);
-    if (0 != info) {
-        test_log::lout << "reference potrf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::potrs_scratchpad_size<fp>(queue, uplo, n, nrhs, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::potrs_scratchpad_size<fp>,
-                                  uplo, n, nrhs, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, A_dev, lda, B_dev, ldb, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::potrs, uplo, n, nrhs, A_dev, lda,
-                                  B_dev, ldb, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, B_dev, B.data(), B.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_potrs_accuracy(uplo, n, nrhs, B, ldb, A_initial, lda, B_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t nrhs,
-                    int64_t lda, int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(lda * n);
-    std::vector<fp> B_initial(ldb * nrhs);
-    rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, ldb);
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-
-    auto info = reference::potrf(uplo, n, A.data(), lda);
-    if (0 != info) {
-        test_log::lout << "reference potrf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::potrs_scratchpad_size<fp>(queue, uplo, n, nrhs, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::potrs_scratchpad_size<fp>,
-                                  uplo, n, nrhs, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, A_dev, lda, B_dev, ldb, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::potrs, uplo, n, nrhs,
-                                  A_dev, lda, B_dev, ldb, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Potrs);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Potrs);
diff --git a/tests/unit_tests/lapack/source/potrs_batch_group.cpp b/tests/unit_tests/lapack/source/potrs_batch_group.cpp
deleted file mode 100644
index 35c5ead0c..000000000
--- a/tests/unit_tests/lapack/source/potrs_batch_group.cpp
+++ /dev/null
@@ -1,347 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<oneapi::mkl::uplo> uplo_vec = { oneapi::mkl::uplo::upper,
-                                                oneapi::mkl::uplo::lower };
-    std::vector<int64_t> n_vec = { 4, 5 };
-    std::vector<int64_t> nrhs_vec = { 9, 6 };
-    std::vector<int64_t> lda_vec = { 6, 6 };
-    std::vector<int64_t> ldb_vec = { 9, 9 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> B_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> B_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto uplo = uplo_vec[group_id];
-        auto n = n_vec[group_id];
-        auto nrhs = nrhs_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto ldb = ldb_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            auto& A = A_list.back();
-
-            B_initial_list.emplace_back(ldb * nrhs);
-            auto& B_initial = B_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, lda);
-
-            B_list.emplace_back(B_initial);
-            auto& B = B_list.back();
-
-            auto info = reference::potrf(uplo, n, A.data(), lda);
-            if (info != 0) {
-                test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> B_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** B_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto B_iter = B_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, B_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            B_dev_list.emplace_back(B_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>(
-            queue, uplo_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>,
-            uplo_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto B_dev_iter = B_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, B_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            B_dev_ptrs[global_id] = B_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        B_iter = B_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, B_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, B_iter->data(), B_dev_ptrs[global_id], B_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::potrs_batch(queue, uplo_vec.data(), n_vec.data(), nrhs_vec.data(),
-                                         A_dev_ptrs, lda_vec.data(), B_dev_ptrs, ldb_vec.data(),
-                                         group_count, group_sizes_vec.data(), scratchpad_dev,
-                                         scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::potrs_batch, uplo_vec.data(),
-                                  n_vec.data(), nrhs_vec.data(), A_dev_ptrs, lda_vec.data(),
-                                  B_dev_ptrs, ldb_vec.data(), group_count, group_sizes_vec.data(),
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        B_iter = B_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, B_iter++) {
-            device_to_host_copy(queue, B_dev_ptrs[global_id], B_iter->data(), B_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (B_dev_ptrs) {
-            sycl::free(B_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto B_iter = B_list.begin();
-    auto A_initial_iter = A_initial_list.begin();
-    auto B_initial_iter = B_initial_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto uplo = uplo_vec[group_id];
-        auto n = n_vec[group_id];
-        auto nrhs = nrhs_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto ldb = ldb_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size;
-             local_id++, global_id++, A_iter++, B_iter++, A_initial_iter++, B_initial_iter++) {
-            if (!check_potrs_accuracy(uplo, n, nrhs, *B_iter, ldb, *A_initial_iter, lda,
-                                      *B_initial_iter)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<oneapi::mkl::uplo> uplo_vec = { oneapi::mkl::uplo::upper };
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> nrhs_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> ldb_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_initial_list;
-    std::list<std::vector<fp>> B_initial_list;
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> B_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto uplo = uplo_vec[group_id];
-        auto n = n_vec[group_id];
-        auto nrhs = nrhs_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto ldb = ldb_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_initial_list.emplace_back(lda * n);
-            auto& A_initial = A_initial_list.back();
-            rand_pos_def_matrix(seed, uplo, n, A_initial, lda);
-
-            A_list.emplace_back(A_initial);
-            auto& A = A_list.back();
-
-            B_initial_list.emplace_back(ldb * nrhs);
-            auto& B_initial = B_initial_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, lda);
-
-            B_list.emplace_back(B_initial);
-            auto& B = B_list.back();
-
-            auto info = reference::potrf(uplo, n, A.data(), lda);
-            if (info != 0) {
-                test_log::lout << "Reference getrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> B_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** B_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto B_iter = B_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, B_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            B_dev_list.emplace_back(B_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>(
-            queue, uplo_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>,
-            uplo_vec.data(), n_vec.data(), nrhs_vec.data(), lda_vec.data(), ldb_vec.data(),
-            group_count, group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto B_dev_iter = B_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, B_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            B_dev_ptrs[global_id] = B_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        B_iter = B_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, B_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, B_iter->data(), B_dev_ptrs[global_id], B_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::potrs_batch(
-            queue, uplo_vec.data(), n_vec.data(), nrhs_vec.data(), A_dev_ptrs, lda_vec.data(),
-            B_dev_ptrs, ldb_vec.data(), group_count, group_sizes_vec.data(), scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::potrs_batch,
-                                  uplo_vec.data(), n_vec.data(), nrhs_vec.data(), A_dev_ptrs,
-                                  lda_vec.data(), B_dev_ptrs, ldb_vec.data(), group_count,
-                                  group_sizes_vec.data(), scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (B_dev_ptrs) {
-            sycl::free(B_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM(PotrsBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(PotrsBatchGroup);
diff --git a/tests/unit_tests/lapack/source/potrs_batch_stride.cpp b/tests/unit_tests/lapack/source/potrs_batch_stride.cpp
deleted file mode 100644
index de2568e86..000000000
--- a/tests/unit_tests/lapack/source/potrs_batch_stride.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 4 15 5 123 22 400 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t nrhs, int64_t lda,
-              int64_t stride_a, int64_t ldb, int64_t stride_b, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<fp> B_initial(stride_b * batch_size);
-    for (int64_t i = 0; i < batch_size; i++) {
-        rand_pos_def_matrix(seed, uplo, n, A_initial, lda, i * stride_a);
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, ldb, i * stride_b);
-    }
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto info = reference::potrf(uplo, n, A.data() + i * stride_a, lda);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference potrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>(
-            queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>, uplo, n,
-            nrhs, lda, stride_a, ldb, stride_b, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, A_dev, lda, stride_a, B_dev, ldb,
-                                         stride_b, batch_size, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::potrs_batch, uplo, n, nrhs, A_dev,
-                                  lda, stride_a, B_dev, ldb, stride_b, batch_size, scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, B_dev, B.data(), B.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto B_ = copy_vector(B, ldb * nrhs, i * stride_b);
-        auto A_initial_ = copy_vector(A_initial, lda * n, i * stride_a);
-        auto B_initial_ = copy_vector(B_initial, ldb * nrhs, i * stride_b);
-        if (!check_potrs_accuracy(uplo, n, nrhs, B_, ldb, A_initial_, lda, B_initial_)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t nrhs,
-                    int64_t lda, int64_t stride_a, int64_t ldb, int64_t stride_b,
-                    int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A_initial(stride_a * batch_size);
-    std::vector<fp> B_initial(stride_b * batch_size);
-    for (int64_t i = 0; i < batch_size; i++) {
-        rand_pos_def_matrix(seed, uplo, n, A_initial, lda, i * stride_a);
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B_initial, ldb, i * stride_b);
-    }
-
-    std::vector<fp> A = A_initial;
-    std::vector<fp> B = B_initial;
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto info = reference::potrf(uplo, n, A.data() + i * stride_a, lda);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference potrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>(
-            queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<fp>, uplo, n,
-            nrhs, lda, stride_a, ldb, stride_b, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::potrs_batch(
-            queue, uplo, n, nrhs, A_dev, lda, stride_a, B_dev, ldb, stride_b, batch_size,
-            scratchpad_dev, scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::potrs_batch, uplo, n,
-                                  nrhs, A_dev, lda, stride_a, B_dev, ldb, stride_b, batch_size,
-                                  scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(PotrsBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(PotrsBatchStride);
diff --git a/tests/unit_tests/lapack/source/syevd.cpp b/tests/unit_tests/lapack/source/syevd.cpp
deleted file mode 100644
index 291713354..000000000
--- a/tests/unit_tests/lapack/source/syevd.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 0 27 33 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, int64_t n,
-              int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::syevd_scratchpad_size<fp>(queue, jobz, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::syevd_scratchpad_size<fp>,
-                                  jobz, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, A_dev, lda, w_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::syevd, jobz, uplo, n, A_dev, lda,
-                                  w_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, w_dev, w.data(), w.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_sy_he_evd_accuracy(jobz, uplo, n, A, lda, w, A_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                    int64_t n, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::syevd_scratchpad_size<fp>(queue, jobz, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::syevd_scratchpad_size<fp>,
-                                  jobz, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, A_dev, lda, w_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::syevd, jobz, uplo, n,
-                                  A_dev, lda, w_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Syevd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Syevd);
diff --git a/tests/unit_tests/lapack/source/sygvd.cpp b/tests/unit_tests/lapack/source/sygvd.cpp
deleted file mode 100644
index f800b03dd..000000000
--- a/tests/unit_tests/lapack/source/sygvd.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 1 0 27 33 31 27182
-2 1 0 27 33 31 27182
-3 1 0 27 33 31 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-              int64_t n, int64_t lda, int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-    std::vector<fp> B(ldb * n);
-    rand_pos_def_matrix(seed, uplo, n, B, ldb);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp> B_initial = B;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::sygvd_scratchpad_size<fp>(queue, itype, jobz, uplo, n, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::sygvd_scratchpad_size<fp>,
-                                  itype, jobz, uplo, n, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, A_dev, lda, B_dev, ldb, w_dev,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::sygvd, itype, jobz, uplo, n, A_dev,
-                                  lda, B_dev, ldb, w_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, B_dev, B.data(), B.size());
-        device_to_host_copy(queue, w_dev, w.data(), w.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    const auto& Z = A;
-    auto ldz = lda;
-    const auto& D = w;
-    hermitian_to_full(uplo, n, A_initial, lda);
-    hermitian_to_full(uplo, n, B_initial, ldb);
-    bool result = true;
-
-    /* |D_ref - D| < |D_ref| O(eps) */
-    std::vector<fp_real> D_ref(n);
-    auto info =
-        reference::sygvd(itype, oneapi::mkl::job::novec, uplo, n, std::vector<fp>(A_initial).data(),
-                         lda, std::vector<fp>(B_initial).data(), ldb, D_ref.data());
-    if (0 != info) {
-        test_log::lout << "reference sygvd failed with info = " << info << std::endl;
-        return false;
-    }
-    if (!rel_vec_err_check(n, D_ref, D, 10.0)) {
-        test_log::lout << "Eigenvalue check failed" << std::endl;
-        result = false;
-    }
-
-    if (oneapi::mkl::job::vec == jobz) {
-        if (itype == 1) {
-            /* |A Z - B Z D| < |A Z| O(eps) */
-            std::vector<fp> AZ(n * n);
-            int64_t ldaz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, A_initial.data(), lda, Z.data(), ldz, 0.0, AZ.data(), ldaz);
-
-            std::vector<fp> BZ(n * n);
-            int64_t ldbz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, B_initial.data(), ldb, Z.data(), ldz, 0.0, BZ.data(), ldbz);
-
-            std::vector<fp> BZD(n * n);
-            int64_t ldbzd = n;
-            for (int64_t col = 0; col < n; col++)
-                for (int64_t row = 0; row < n; row++)
-                    BZD[row + col * ldbzd] = BZ[row + col * ldbz] * D[col];
-
-            if (!rel_mat_err_check(n, n, AZ, ldaz, BZD, ldbzd)) {
-                test_log::lout << "Factorization check failed" << std::endl;
-                result = false;
-            }
-
-            /* |I - Z' B Z| < n O(eps) */
-            std::vector<fp> ZBZ(n * n);
-            int64_t ldzbz = n;
-            reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, Z.data(), ldz, BZ.data(), ldbz, 0.0, ZBZ.data(), ldzbz);
-            if (!rel_id_err_check(n, ZBZ, ldzbz)) {
-                test_log::lout << "Orthogonality check failed" << std::endl;
-                result = false;
-            }
-        }
-        else if (itype == 2) {
-            /* |A B Z - Z D| < |A B Z| O(eps) */
-            std::vector<fp> BZ(n * n);
-            int64_t ldbz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, B_initial.data(), ldb, Z.data(), ldz, 0.0, BZ.data(), ldbz);
-
-            std::vector<fp> ABZ(n * n);
-            int64_t ldabz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, A_initial.data(), lda, BZ.data(), ldbz, 0.0, ABZ.data(),
-                            ldabz);
-
-            std::vector<fp> ZD(n * n);
-            int64_t ldzd = n;
-            for (int64_t col = 0; col < n; col++)
-                for (int64_t row = 0; row < n; row++)
-                    ZD[row + col * ldzd] = Z[row + col * ldz] * D[col];
-
-            if (!rel_mat_err_check(n, n, ABZ, ldabz, ZD, ldbz)) {
-                test_log::lout << "Factorization check failed" << std::endl;
-                result = false;
-            }
-
-            /* |I - Z' B Z| < n O(eps) */
-            std::vector<fp> ZBZ(n * n);
-            int64_t ldzbz = n;
-            reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, Z.data(), ldz, BZ.data(), ldbz, 0.0, ZBZ.data(), ldzbz);
-            if (!rel_id_err_check(n, ZBZ, ldzbz)) {
-                test_log::lout << "Orthogonality check failed" << std::endl;
-                result = false;
-            }
-        }
-        else {
-            /* |A Z - B^-1 Z D| < |A Z| O(eps) */
-            /* C = B^-1 Z */
-            std::vector<fp> AZ(n * n);
-            int64_t ldaz = n;
-            reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, A_initial.data(), lda, Z.data(), ldz, 0.0, AZ.data(), ldaz);
-
-            std::vector<fp> C(n * n);
-            int64_t ldc = n;
-            reference::lacpy('A', n, n, Z.data(), ldz, C.data(), ldc);
-            auto info = reference::potrs(uplo, n, n, B.data(), ldb, C.data(), ldc);
-            if (0 != info) {
-                test_log::lout << "reference potrs failed with info = " << info << std::endl;
-                return false;
-            }
-
-            std::vector<fp> CD(n * n);
-            int64_t ldcd = n;
-            for (int64_t col = 0; col < n; col++)
-                for (int64_t row = 0; row < n; row++)
-                    CD[row + col * ldcd] = C[row + col * ldc] * D[col];
-
-            if (!rel_mat_err_check(n, n, AZ, ldaz, CD, ldcd)) {
-                test_log::lout << "Factorization check failed" << std::endl;
-                result = false;
-            }
-
-            /* |I - Z' B^-1 Z| = |I - Z' C| < n O(eps) */
-            std::vector<fp> ZhC(n * n);
-            int64_t ldzhc = n;
-            reference::gemm(oneapi::mkl::transpose::conjtrans, oneapi::mkl::transpose::nontrans, n,
-                            n, n, 1.0, Z.data(), ldz, C.data(), ldc, 0.0, ZhC.data(), ldzhc);
-            if (!rel_id_err_check(n, ZhC, ldzhc)) {
-                test_log::lout << "Orthogonality check failed" << std::endl;
-                result = false;
-            }
-        }
-    }
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t itype, oneapi::mkl::job jobz,
-                    oneapi::mkl::uplo uplo, int64_t n, int64_t lda, int64_t ldb, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_hermitian_matrix(seed, uplo, n, A, lda);
-    std::vector<fp> B(ldb * n);
-    rand_pos_def_matrix(seed, uplo, n, B, ldb);
-
-    std::vector<fp> A_initial = A;
-    std::vector<fp> B_initial = B;
-    std::vector<fp_real> w(n);
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-        auto w_dev = device_alloc<data_T, fp_real>(queue, w.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::sygvd_scratchpad_size<fp>(queue, itype, jobz, uplo, n, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::sygvd_scratchpad_size<fp>,
-                                  itype, jobz, uplo, n, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::sygvd(
-            queue, itype, jobz, uplo, n, A_dev, lda, B_dev, ldb, w_dev, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::sygvd, itype, jobz, uplo,
-                                  n, A_dev, lda, B_dev, ldb, w_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, w_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Sygvd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Sygvd);
diff --git a/tests/unit_tests/lapack/source/sytrd.cpp b/tests/unit_tests/lapack/source/sytrd.cpp
deleted file mode 100644
index 01ffe0dff..000000000
--- a/tests/unit_tests/lapack/source/sytrd.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 33 35 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto d_dev = device_alloc<data_T, fp_real>(queue, d.size());
-        auto e_dev = device_alloc<data_T, fp_real>(queue, e.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::sytrd_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::sytrd_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, d.data(), d_dev, d.size());
-        host_to_device_copy(queue, e.data(), e_dev, e.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::sytrd(queue, uplo, n, A_dev, lda, d_dev, e_dev, tau_dev,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::sytrd, uplo, n, A_dev, lda, d_dev,
-                                  e_dev, tau_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, d_dev, d.data(), d.size());
-        device_to_host_copy(queue, e_dev, e.data(), e.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, d_dev);
-        device_free(queue, e_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    hermitian_to_full(uplo, n, A_initial, lda);
-    bool result = true;
-
-    /* |A - Q T Q'| < |A| O(eps) */
-    std::vector<fp> T(n * n);
-    int64_t ldt = n;
-    for (int64_t diag = 0; diag < n; diag++)
-        T[diag + diag * ldt] = d[diag];
-    for (int64_t diag = 0; diag < n - 1; diag++) {
-        T[diag + (diag + 1) * ldt] = e[diag];
-        T[diag + 1 + (diag)*ldt] = e[diag];
-    }
-
-    std::vector<fp> QTQ{ T };
-    int64_t ldqtq = n;
-    reference::or_un_mtr(oneapi::mkl::side::left, uplo, oneapi::mkl::transpose::nontrans, n, n,
-                         A.data(), lda, tau.data(), QTQ.data(), ldqtq);
-    reference::or_un_mtr(oneapi::mkl::side::right, uplo, oneapi::mkl::transpose::trans, n, n,
-                         A.data(), lda, tau.data(), QTQ.data(), ldqtq);
-
-    if (!rel_mat_err_check(n, n, QTQ, ldqtq, A_initial, lda)) {
-        test_log::lout << "Factorization check failed" << std::endl;
-        result = false;
-    }
-
-    /* A[i, i] = d[i] */
-    for (int64_t diag = 0; diag < n; diag++)
-        d[diag] -= A[diag + diag * lda];
-    if (uplo == oneapi::mkl::uplo::upper)
-        for (int64_t diag = 0; diag < n - 1; diag++)
-            e[diag] -= A[diag + (diag + 1) * lda];
-    else
-        for (int64_t diag = 0; diag < n - 1; diag++)
-            e[diag] -= A[diag + 1 + (diag)*lda];
-
-    auto ulp = reference::lamch<fp_real>('P');
-    if (reference::lange('I', n, 1, d.data(), n) > 10.0 * ulp) {
-        test_log::lout << "Diagonal check failed" << std::endl;
-        result = false;
-    }
-    if (reference::lange('I', n - 1, 1, e.data(), n - 1) > 10.0 * ulp) {
-        test_log::lout << "Off-diagonal check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto d_dev = device_alloc<data_T, fp_real>(queue, d.size());
-        auto e_dev = device_alloc<data_T, fp_real>(queue, e.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::sytrd_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::sytrd_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, d.data(), d_dev, d.size());
-        host_to_device_copy(queue, e.data(), e_dev, e.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::sytrd(
-            queue, uplo, n, A_dev, lda, d_dev, e_dev, tau_dev, scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::sytrd, uplo, n, A_dev,
-                                  lda, d_dev, e_dev, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, d_dev);
-        device_free(queue, e_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_REAL(Sytrd);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_REAL(Sytrd);
diff --git a/tests/unit_tests/lapack/source/sytrf.cpp b/tests/unit_tests/lapack/source/sytrf.cpp
deleted file mode 100644
index 81d7fdb2d..000000000
--- a/tests/unit_tests/lapack/source/sytrf.cpp
+++ /dev/null
@@ -1,278 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 29 36 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<int64_t> ipiv(n);
-    rand_symmetric_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::sytrf_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::sytrf_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::sytrf(queue, uplo, n, A_dev, lda, ipiv_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::sytrf, uplo, n, A_dev, lda, ipiv_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, ipiv_dev, ipiv.data(), ipiv.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    std::vector<fp> U(n * n);
-    std::vector<fp> Uk(n * n);
-    int64_t ldu = n;
-    std::vector<fp> D(n * n);
-    int64_t ldd = n;
-    symmetric_to_full(uplo, n, A_initial, lda);
-    bool result = true;
-
-    for (int64_t d = 0; d < n; d++)
-        U[d + d * ldu] = 1.0;
-
-    if (uplo == oneapi::mkl::uplo::upper) {
-        int64_t k = n - 1;
-        while (k >= 0) {
-            reference::laset('A', n, n, 0.0, 1.0, Uk.data(), ldu);
-            if (ipiv[k] > 0) { /* 1x1 block case */
-
-                auto piv = ipiv[k] - 1;
-                for (int64_t i = 0; i < k; i++)
-                    Uk[i + k * ldu] = A[i + k * lda];
-                if (piv != k)
-                    reference::swap(n, Uk.data() + (k + 0 * ldu), ldu, Uk.data() + (piv + 0 * ldu),
-                                    ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + k * ldd] = A[k + k * lda];
-                k -= 1;
-            }
-            else { /* 2x2 block case */
-
-                auto piv = -ipiv[k] - 1;
-                for (int64_t i = 0; i < k - 1; i++) {
-                    Uk[i + k * ldu] = A[i + k * lda];
-                    Uk[i + (k - 1) * ldu] = A[i + (k - 1) * lda];
-                }
-                if (piv != k - 1)
-                    reference::swap(n, Uk.data() + (k - 1 + 0 * ldu), ldu,
-                                    Uk.data() + (piv + 0 * ldu), ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + k * ldd] = A[k + k * lda];
-                D[k - 1 + (k - 1) * ldd] = A[k - 1 + (k - 1) * lda];
-                D[k - 1 + k * ldd] = A[k - 1 + k * lda];
-                D[k + (k - 1) * ldd] = A[k - 1 + k * lda];
-                k -= 2;
-            }
-        }
-    }
-    else {
-        int64_t k = 0;
-        while (k < n) {
-            reference::laset('A', n, n, 0.0, 1.0, Uk.data(), ldu);
-            if (ipiv[k] > 0) { /* 1x1 block case */
-
-                auto piv = ipiv[k] - 1;
-                for (int64_t i = k + 1; i < n; i++)
-                    Uk[i + k * ldu] = A[i + k * lda];
-                if (piv != k)
-                    reference::swap(n, Uk.data() + (k + 0 * lda), ldu, Uk.data() + (piv + 0 * ldu),
-                                    ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + (k)*ldd] = A[k + (k)*lda];
-                k += 1;
-            }
-            else { /* 2x2 block case */
-
-                auto piv = -ipiv[k] - 1;
-                for (int64_t i = k + 2; i < n; i++) {
-                    Uk[i + k * ldu] = A[i + k * lda];
-                    Uk[i + (k + 1) * ldu] = A[i + (k + 1) * lda];
-                }
-                if (piv != k)
-                    reference::swap(n, Uk.data() + (k + 1 + 0 * ldu), ldu,
-                                    Uk.data() + (piv + 0 * ldu), ldu);
-                auto U_temp = U;
-                reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans,
-                                n, n, n, 1.0, U_temp.data(), ldu, Uk.data(), ldu, 0.0, U.data(),
-                                ldu);
-
-                D[k + k * ldd] = A[k + k * lda];
-                D[k + 1 + (k + 1) * ldd] = A[k + 1 + (k + 1) * lda];
-                D[k + 1 + k * ldd] = A[k + 1 + k * lda];
-                D[k + (k + 1) * ldd] = A[k + 1 + k * lda];
-                k += 2;
-            }
-        }
-    }
-
-    /* |A - UDU'| < |A| O(eps) */
-    std::vector<fp> UD(n * n);
-    int64_t ldud = n;
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::nontrans, n, n, n,
-                    1.0, U.data(), ldu, D.data(), ldd, 0.0, UD.data(), ldud);
-
-    std::vector<fp> UDU(n * n);
-    int64_t ldudu = n;
-    reference::gemm(oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, n, n, n, 1.0,
-                    UD.data(), ldud, U.data(), ldu, 0.0, UDU.data(), ldudu);
-
-    if (!rel_mat_err_check(n, n, UDU, ldudu, A_initial, lda)) {
-        test_log::lout << "Factorization check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<int64_t> ipiv(n);
-    rand_symmetric_matrix(seed, uplo, n, A, lda);
-
-    std::vector<fp> A_initial = A;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto ipiv_dev = device_alloc<data_T, int64_t>(queue, ipiv.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::sytrf_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::sytrf_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, ipiv.data(), ipiv_dev, ipiv.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::sytrf(queue, uplo, n, A_dev, lda, ipiv_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::sytrf, uplo, n, A_dev,
-                                  lda, ipiv_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, ipiv_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Sytrf);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Sytrf);
diff --git a/tests/unit_tests/lapack/source/trtrs.cpp b/tests/unit_tests/lapack/source/trtrs.cpp
deleted file mode 100644
index 4018a2c51..000000000
--- a/tests/unit_tests/lapack/source/trtrs.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 0 0 25 79 66 38 27182
-1 0 1 32 34 92 39 27182
-1 3 0 76 61 87 82 27182
-1 3 1 89 92 89 99 27182
-0 0 0 25 79 66 38 27182
-0 0 1 32 34 92 39 27182
-0 3 0 76 61 87 82 27182
-0 3 1 89 92 89 99 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-              oneapi::mkl::diag diag, int64_t n, int64_t nrhs, int64_t lda, int64_t ldb,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp> B(ldb * nrhs);
-
-    /* Initialize input data */
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B, ldb);
-    std::vector<fp> B_initial = B;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::trtrs_scratchpad_size<fp>(
-            queue, uplo, trans, diag, n, nrhs, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::trtrs_scratchpad_size<fp>,
-                                  uplo, trans, diag, n, nrhs, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, A_dev, lda, B_dev, ldb,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::trtrs, uplo, trans, diag, n, nrhs,
-                                  A_dev, lda, B_dev, ldb, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, B_dev, B.data(), B.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_trtrs_accuracy(uplo, trans, diag, n, nrhs, A, lda, B, ldb, B_initial);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans,
-                    oneapi::mkl::diag diag, int64_t n, int64_t nrhs, int64_t lda, int64_t ldb,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp> B(ldb * nrhs);
-
-    /* Initialize input data */
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, nrhs, B, ldb);
-    std::vector<fp> B_initial = B;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto B_dev = device_alloc<data_T>(queue, B.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::trtrs_scratchpad_size<fp>(
-            queue, uplo, trans, diag, n, nrhs, lda, ldb);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::trtrs_scratchpad_size<fp>,
-                                  uplo, trans, diag, n, nrhs, lda, ldb);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, B.data(), B_dev, B.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::trtrs(
-            queue, uplo, trans, diag, n, nrhs, A_dev, lda, B_dev, ldb, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::trtrs, uplo, trans, diag,
-                                  n, nrhs, A_dev, lda, B_dev, ldb, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, B_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY(Trtrs);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY(Trtrs);
diff --git a/tests/unit_tests/lapack/source/ungbr.cpp b/tests/unit_tests/lapack/source/ungbr.cpp
deleted file mode 100644
index 7cdf8e52a..000000000
--- a/tests/unit_tests/lapack/source/ungbr.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 29 25 25 30 27182
-0 29 25 20 30 27182
-1 25 29 25 30 27182
-1 25 29 20 30 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k,
-              int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t m_A = m;
-    int64_t n_A = n;
-
-    if (vect == oneapi::mkl::generate::Q)
-        n_A = k;
-    else /* vect == oneapi::mkl::generate::P */
-        m_A = k;
-
-    int64_t min_mn_A = std::min<int64_t>(m_A, n_A);
-
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(min_mn_A);
-    std::vector<fp_real> e(std::max<int64_t>(min_mn_A - 1, 1));
-    std::vector<fp> tauq(min_mn_A);
-    std::vector<fp> taup(min_mn_A);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m_A, n_A, A, lda);
-    reference::gebrd(m_A, n_A, A.data(), lda, d.data(), e.data(), tauq.data(), taup.data());
-
-    auto& tau = (vect == oneapi::mkl::generate::Q) ? tauq : taup;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::ungbr_scratchpad_size<fp>(queue, vect, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ungbr_scratchpad_size<fp>,
-                                  vect, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ungbr(queue, vect, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ungbr, vect, m, n, k, A_dev, lda,
-                                  tau_dev, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_or_un_gbr_accuracy(vect, m, n, k, A, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::generate vect, int64_t m, int64_t n,
-                    int64_t k, int64_t lda, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    int64_t m_A = m;
-    int64_t n_A = n;
-
-    if (vect == oneapi::mkl::generate::Q)
-        n_A = k;
-    else /* vect == oneapi::mkl::generate::P */
-        m_A = k;
-
-    int64_t min_mn_A = std::min<int64_t>(m_A, n_A);
-
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(min_mn_A);
-    std::vector<fp_real> e(std::max<int64_t>(min_mn_A - 1, 1));
-    std::vector<fp> tauq(min_mn_A);
-    std::vector<fp> taup(min_mn_A);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m_A, n_A, A, lda);
-    reference::gebrd(m_A, n_A, A.data(), lda, d.data(), e.data(), tauq.data(), taup.data());
-
-    auto& tau = (vect == oneapi::mkl::generate::Q) ? tauq : taup;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::ungbr_scratchpad_size<fp>(queue, vect, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::ungbr_scratchpad_size<fp>,
-                                  vect, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::ungbr(queue, vect, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ungbr, vect, m, n, k,
-                                  A_dev, lda, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Ungbr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Ungbr);
diff --git a/tests/unit_tests/lapack/source/ungqr.cpp b/tests/unit_tests/lapack/source/ungqr.cpp
deleted file mode 100644
index 08b8b1192..000000000
--- a/tests/unit_tests/lapack/source/ungqr.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-50  50  30  70 27182
-50  30  30  70 27182
-50  30  10  70 27182
-200 200 180 220 27182
-200 180 180 220 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::ungqr_scratchpad_size<fp>(queue, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungqr_scratchpad_size<fp>, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ungqr(queue, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ungqr, m, n, k, A_dev, lda, tau_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_or_un_gqr_accuracy(m, n, A, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::ungqr_scratchpad_size<fp>(queue, m, n, k, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungqr_scratchpad_size<fp>, m, n, k, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::ungqr(queue, m, n, k, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ungqr, m, n, k, A_dev,
-                                  lda, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Ungqr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Ungqr);
diff --git a/tests/unit_tests/lapack/source/ungqr_batch_group.cpp b/tests/unit_tests/lapack/source/ungqr_batch_group.cpp
deleted file mode 100644
index ddb350828..000000000
--- a/tests/unit_tests/lapack/source/ungqr_batch_group.cpp
+++ /dev/null
@@ -1,316 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <list>
-#include <numeric>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-27182
-)";
-
-template <typename fp>
-bool accuracy(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 5, 5 };
-    std::vector<int64_t> n_vec = { 3, 4 };
-    std::vector<int64_t> k_vec = { 2, 4 };
-    std::vector<int64_t> lda_vec = { 5, 6 };
-    std::vector<int64_t> group_sizes_vec = { 2, 2 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> tau_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto k = k_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_list.emplace_back(lda * n);
-            auto& A = A_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-
-            tau_list.emplace_back(k);
-            auto& tau = tau_list.back();
-            auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-            if (0 != info) {
-                test_log::lout << "reference geqrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> tau_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** tau_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            tau_dev_list.emplace_back(tau_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto tau_dev_iter = tau_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, tau_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            tau_dev_ptrs[global_id] = tau_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, tau_iter->data(), tau_dev_ptrs[global_id], tau_iter->size());
-        }
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ungqr_batch(queue, m_vec.data(), n_vec.data(), k_vec.data(),
-                                         A_dev_ptrs, lda_vec.data(), tau_dev_ptrs, group_count,
-                                         group_sizes_vec.data(), scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ungqr_batch, m_vec.data(),
-                                  n_vec.data(), k_vec.data(), A_dev_ptrs, lda_vec.data(),
-                                  tau_dev_ptrs, group_count, group_sizes_vec.data(), scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        A_iter = A_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++) {
-            device_to_host_copy(queue, A_dev_ptrs[global_id], A_iter->data(), A_iter->size());
-        }
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (tau_dev_ptrs) {
-            sycl::free(tau_dev_ptrs, queue);
-        }
-    }
-
-    bool result = true;
-
-    int64_t global_id = 0;
-    auto A_iter = A_list.begin();
-    auto tau_iter = tau_list.begin();
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-        for (int64_t local_id = 0; local_id < group_size;
-             local_id++, global_id++, A_iter++, tau_iter++) {
-            if (!check_or_un_gqr_accuracy(m, n, *A_iter, lda)) {
-                test_log::lout << "batch routine (" << global_id << ", " << group_id << ", "
-                               << local_id << ") (global_id, group_id, local_id) failed"
-                               << std::endl;
-                result = false;
-            }
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1
-)";
-
-template <typename fp>
-bool usm_dependency(const sycl::device& dev, uint64_t seed) {
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Test Parameters */
-    std::vector<int64_t> m_vec = { 1 };
-    std::vector<int64_t> n_vec = { 1 };
-    std::vector<int64_t> k_vec = { 1 };
-    std::vector<int64_t> lda_vec = { 1 };
-    std::vector<int64_t> group_sizes_vec = { 1 };
-
-    int64_t group_count = group_sizes_vec.size();
-    int64_t batch_size = std::accumulate(group_sizes_vec.begin(), group_sizes_vec.end(), 0);
-
-    std::list<std::vector<fp>> A_list;
-    std::list<std::vector<fp>> tau_list;
-
-    for (int64_t group_id = 0; group_id < group_count; group_id++) {
-        auto m = m_vec[group_id];
-        auto n = n_vec[group_id];
-        auto k = k_vec[group_id];
-        auto lda = lda_vec[group_id];
-        auto group_size = group_sizes_vec[group_id];
-
-        /* Allocate and Initialize on host */
-        for (int64_t local_id = 0; local_id < group_size; local_id++) {
-            A_list.emplace_back(lda * n);
-            auto& A = A_list.back();
-            rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda);
-
-            tau_list.emplace_back(k);
-            auto& tau = tau_list.back();
-            auto info = reference::geqrf(m, k, A.data(), lda, tau.data());
-            if (0 != info) {
-                test_log::lout << "reference geqrf failed with info = " << info << std::endl;
-                return false;
-            }
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> A_dev_list;
-        std::list<std::vector<fp, sycl::usm_allocator<fp, sycl::usm::alloc::shared>>> tau_dev_list;
-        fp** A_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-        fp** tau_dev_ptrs = sycl::malloc_shared<fp*>(batch_size, queue);
-
-        /* Allocate on device */
-        sycl::usm_allocator<fp, sycl::usm::alloc::shared> usm_fp_allocator{ queue.get_context(),
-                                                                            dev };
-        auto A_iter = A_list.begin();
-        auto tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            A_dev_list.emplace_back(A_iter->size(), usm_fp_allocator);
-            tau_dev_list.emplace_back(tau_iter->size(), usm_fp_allocator);
-        }
-
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>(
-            queue, m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>,
-            m_vec.data(), n_vec.data(), k_vec.data(), lda_vec.data(), group_count,
-            group_sizes_vec.data());
-#endif
-        auto scratchpad_dev = device_alloc<fp>(queue, scratchpad_size);
-
-        auto A_dev_iter = A_dev_list.begin();
-        auto tau_dev_iter = tau_dev_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size;
-             global_id++, A_dev_iter++, tau_dev_iter++) {
-            A_dev_ptrs[global_id] = A_dev_iter->data();
-            tau_dev_ptrs[global_id] = tau_dev_iter->data();
-        }
-
-        A_iter = A_list.begin();
-        tau_iter = tau_list.begin();
-        for (int64_t global_id = 0; global_id < batch_size; global_id++, A_iter++, tau_iter++) {
-            host_to_device_copy(queue, A_iter->data(), A_dev_ptrs[global_id], A_iter->size());
-            host_to_device_copy(queue, tau_iter->data(), tau_dev_ptrs[global_id], tau_iter->size());
-        }
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::ungqr_batch(
-            queue, m_vec.data(), n_vec.data(), k_vec.data(), A_dev_ptrs, lda_vec.data(),
-            tau_dev_ptrs, group_count, group_sizes_vec.data(), scratchpad_dev, scratchpad_size,
-            std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ungqr_batch,
-                                  m_vec.data(), n_vec.data(), k_vec.data(), A_dev_ptrs,
-                                  lda_vec.data(), tau_dev_ptrs, group_count, group_sizes_vec.data(),
-                                  scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        if (scratchpad_dev) {
-            sycl::free(scratchpad_dev, queue);
-        }
-        if (A_dev_ptrs) {
-            sycl::free(A_dev_ptrs, queue);
-        }
-        if (tau_dev_ptrs) {
-            sycl::free(tau_dev_ptrs, queue);
-        }
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_USM_COMPLEX(UngqrBatchGroup);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(UngqrBatchGroup);
diff --git a/tests/unit_tests/lapack/source/ungqr_batch_stride.cpp b/tests/unit_tests/lapack/source/ungqr_batch_stride.cpp
deleted file mode 100644
index e656b9fb7..000000000
--- a/tests/unit_tests/lapack/source/ungqr_batch_stride.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-29 23 18 37 1024 40 3 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-              int64_t stride_a, int64_t stride_tau, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(stride_a * batch_size);
-    std::vector<fp> tau(stride_tau * batch_size);
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda, i * stride_a);
-        auto info =
-            reference::geqrf(m, k, A.data() + i * stride_a, lda, tau.data() + i * stride_tau);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference geqrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>(
-            queue, m, n, k, lda, stride_a, stride_tau, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>, m, n, k,
-            lda, stride_a, stride_tau, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, A_dev, lda, stride_a, tau_dev, stride_tau,
-                                         batch_size, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ungqr_batch, m, n, k, A_dev, lda,
-                                  stride_a, tau_dev, stride_tau, batch_size, scratchpad_dev,
-                                  scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    bool result = true;
-    for (int64_t i = 0; i < batch_size; i++) {
-        auto A_ = copy_vector(A, lda * n, i * stride_a);
-        if (!check_or_un_gqr_accuracy(m, n, A_, lda)) {
-            test_log::lout << "batch routine index " << i << " failed" << std::endl;
-            result = false;
-        }
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    int64_t stride_a, int64_t stride_tau, int64_t batch_size, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(stride_a * batch_size);
-    std::vector<fp> tau(stride_tau * batch_size);
-
-    for (int64_t i = 0; i < batch_size; i++) {
-        rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, A, lda, i * stride_a);
-        auto info =
-            reference::geqrf(m, k, A.data() + i * stride_a, lda, tau.data() + i * stride_tau);
-        if (0 != info) {
-            test_log::lout << "batch routine index " << i
-                           << ": reference geqrf failed with info: " << info << std::endl;
-            return false;
-        }
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>(
-            queue, m, n, k, lda, stride_a, stride_tau, batch_size);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungqr_batch_scratchpad_size<fp>, m, n, k,
-            lda, stride_a, stride_tau, batch_size);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::ungqr_batch(
-            queue, m, n, k, A_dev, lda, stride_a, tau_dev, stride_tau, batch_size, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ungqr_batch, m, n, k,
-                                  A_dev, lda, stride_a, tau_dev, stride_tau, batch_size,
-                                  scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(UngqrBatchStride);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(UngqrBatchStride);
diff --git a/tests/unit_tests/lapack/source/ungtr.cpp b/tests/unit_tests/lapack/source/ungtr.cpp
deleted file mode 100644
index b0ad8e8f2..000000000
--- a/tests/unit_tests/lapack/source/ungtr.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 41 59 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-              uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    auto info = reference::hetrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference hetrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::ungtr_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungtr_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::ungtr(queue, uplo, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                   scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::ungtr, uplo, n, A_dev, lda, tau_dev,
-                                  scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return check_or_un_gtr_accuracy(n, A, lda);
-}
-
-const char* dependency_input = R"(
-1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t n, int64_t lda,
-                    uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> A(lda * n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    std::vector<fp> tau(n);
-
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-    auto info = reference::hetrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference hetrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size =
-            oneapi::mkl::lapack::ungtr_scratchpad_size<fp>(queue, uplo, n, lda);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(
-            queue, scratchpad_size = oneapi::mkl::lapack::ungtr_scratchpad_size<fp>, uplo, n, lda);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event =
-            oneapi::mkl::lapack::ungtr(queue, uplo, n, A_dev, lda, tau_dev, scratchpad_dev,
-                                       scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::ungtr, uplo, n, A_dev,
-                                  lda, tau_dev, scratchpad_dev, scratchpad_size,
-                                  std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-        queue.wait_and_throw();
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Ungtr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Ungtr);
diff --git a/tests/unit_tests/lapack/source/unmqr.cpp b/tests/unit_tests/lapack/source/unmqr.cpp
deleted file mode 100644
index 2f555c1ca..000000000
--- a/tests/unit_tests/lapack/source/unmqr.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-0 0 50 70 50 70 70 27182
-0 0 20 22 20 22 22 27182
-0 3 50 70 50 70 70 27182
-0 3 20 22 20 22 22 27182
-1 0 50 70 70 90 70 27182
-1 0 20 22 22 24 22 27182
-1 3 50 70 70 90 70 27182
-1 3 20 22 22 24 22 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::side left_right, oneapi::mkl::transpose trans,
-              int64_t m, int64_t n, int64_t k, int64_t lda, int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq = (left_right == oneapi::mkl::side::left) ? m : n;
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::geqrf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::unmqr_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::unmqr_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::unmqr(queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev,
-                                   ldc, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::unmqr, left_right, trans, m, n, k,
-                                  A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, C_dev, C.data(), C.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-    bool result = true;
-
-    /* |Q C - QC| < |QC| O(eps) */
-    const auto& QC = C;
-    auto& QC_ref = C_initial;
-    auto ldqc = ldc;
-    info = reference::or_un_mqr(left_right, trans, m, n, k, A.data(), lda, tau.data(),
-                                QC_ref.data(), ldqc);
-    if (0 != info) {
-        test_log::lout << "reference unmqr failed with info: " << info << std::endl;
-        return false;
-    }
-    if (!rel_mat_err_check(m, n, QC, ldqc, QC_ref, ldqc, 1.0)) {
-        test_log::lout << "Multiplication check failed" << std::endl;
-        result = false;
-    }
-    return result;
-}
-
-const char* dependency_input = R"(
-0 0 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::side left_right,
-                    oneapi::mkl::transpose trans, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq = (left_right == oneapi::mkl::side::left) ? m : n;
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::geqrf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference geqrf failed with info: " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::unmqr_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::unmqr_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::unmqr(
-            queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::unmqr, left_right, trans,
-                                  m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Unmqr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Unmqr);
diff --git a/tests/unit_tests/lapack/source/unmrq.cpp b/tests/unit_tests/lapack/source/unmrq.cpp
deleted file mode 100644
index 628063837..000000000
--- a/tests/unit_tests/lapack/source/unmrq.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 0 30 30 30 33 31 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::side left_right, oneapi::mkl::transpose trans,
-              int64_t m, int64_t n, int64_t k, int64_t lda, int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq;
-    if (left_right == oneapi::mkl::side::left) {
-        if (k > m) {
-            test_log::lout << "Bad test input, side == left and k > m (" << k << " > " << m << ")"
-                           << std::endl;
-            return false;
-        }
-        nq = m;
-    }
-    else {
-        if (k > n) {
-            test_log::lout << "Bad test input, side == right and k > n (" << k << " > " << n << ")"
-                           << std::endl;
-            return false;
-        }
-        nq = n;
-    }
-
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::gerqf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference gerqf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::unmrq_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::unmrq_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::unmrq(queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev,
-                                   ldc, scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::unmrq, left_right, trans, m, n, k,
-                                  A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, C_dev, C.data(), C.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-    bool result = true;
-
-    /* |Q C - QC| < |QC| O(eps) */
-    const auto& QC = C;
-    auto& QC_ref = C_initial;
-    auto ldqc = ldc;
-    info = reference::or_un_mrq(left_right, trans, m, n, k, A.data(), lda, tau.data(),
-                                QC_ref.data(), ldqc);
-    if (0 != info) {
-        test_log::lout << "reference unmrq failed with info = " << info << std::endl;
-        return false;
-    }
-    if (!rel_mat_err_check(m, n, QC, ldqc, QC_ref, ldqc, 1.0)) {
-        test_log::lout << "Multiplication check failed" << std::endl;
-        result = false;
-    }
-    return result;
-}
-
-const char* dependency_input = R"(
-0 0 1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::side left_right,
-                    oneapi::mkl::transpose trans, int64_t m, int64_t n, int64_t k, int64_t lda,
-                    int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    std::vector<fp> C_initial(ldc * n);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C_initial, ldc);
-    std::vector<fp> C = C_initial;
-
-    int64_t nq = (left_right == oneapi::mkl::side::left) ? m : n;
-    std::vector<fp> A(lda * k);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, nq, k, A, lda);
-    std::vector<fp> tau(k);
-
-    auto info = reference::gerqf(nq, k, A.data(), lda, tau.data());
-    if (0 != info) {
-        test_log::lout << "reference gerqf failed with info = " << info << std::endl;
-        return false;
-    }
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::unmrq_scratchpad_size<fp>(
-            queue, left_right, trans, m, n, k, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::unmrq_scratchpad_size<fp>,
-                                  left_right, trans, m, n, k, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::unmrq(
-            queue, left_right, trans, m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::unmrq, left_right, trans,
-                                  m, n, k, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-        device_free(queue, scratchpad_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Unmrq);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Unmrq);
diff --git a/tests/unit_tests/lapack/source/unmtr.cpp b/tests/unit_tests/lapack/source/unmtr.cpp
deleted file mode 100644
index 8148c644d..000000000
--- a/tests/unit_tests/lapack/source/unmtr.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "lapack_common.hpp"
-#include "lapack_test_controller.hpp"
-#include "lapack_accuracy_checks.hpp"
-#include "lapack_reference_wrappers.hpp"
-#include "test_helper.hpp"
-
-namespace {
-
-const char* accuracy_input = R"(
-1 31 33 35 37 27182
-)";
-
-template <typename data_T>
-bool accuracy(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t m, int64_t n, int64_t lda,
-              int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    oneapi::mkl::side side = oneapi::mkl::side::right;
-    oneapi::mkl::transpose trans = oneapi::mkl::transpose::nontrans;
-
-    std::vector<fp> A(n * lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-
-    std::vector<fp> tau(n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    auto info = reference::hetrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference hetrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    std::vector<fp> C(n * ldc);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C, ldc);
-    std::vector<fp> C_initial = C;
-
-    /* Compute on device */
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::unmtr_scratchpad_size<fp>(
-            queue, side, uplo, trans, m, n, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::unmtr_scratchpad_size<fp>,
-                                  side, uplo, trans, m, n, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-#ifdef CALL_RT_API
-        oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, A_dev, lda, tau_dev, C_dev, ldc,
-                                   scratchpad_dev, scratchpad_size);
-#else
-        TEST_RUN_LAPACK_CT_SELECT(queue, oneapi::mkl::lapack::unmtr, side, uplo, trans, m, n, A_dev,
-                                  lda, tau_dev, C_dev, ldc, scratchpad_dev, scratchpad_size);
-#endif
-        queue.wait_and_throw();
-
-        device_to_host_copy(queue, A_dev, A.data(), A.size());
-        device_to_host_copy(queue, tau_dev, tau.data(), tau.size());
-        device_to_host_copy(queue, C_dev, C.data(), C.size());
-        queue.wait_and_throw();
-
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-    }
-    bool result = true;
-
-    auto& C_ref = C_initial;
-    info = reference::unmtr(side, uplo, trans, m, n, A.data(), lda, tau.data(), C_ref.data(), ldc);
-    if (0 != info) {
-        test_log::lout << "reference unmtr failed with info = " << info << std::endl;
-        return false;
-    }
-    if (!rel_mat_err_check(m, n, C, ldc, C_ref, ldc)) {
-        test_log::lout << "Multiplication check failed" << std::endl;
-        result = false;
-    }
-
-    return result;
-}
-
-const char* dependency_input = R"(
-1 1 1 1 1 1
-)";
-
-template <typename data_T>
-bool usm_dependency(const sycl::device& dev, oneapi::mkl::uplo uplo, int64_t m, int64_t n,
-                    int64_t lda, int64_t ldc, uint64_t seed) {
-    using fp = typename data_T_info<data_T>::value_type;
-    using fp_real = typename complex_info<fp>::real_type;
-
-    /* Initialize */
-    oneapi::mkl::side side = oneapi::mkl::side::right;
-    oneapi::mkl::transpose trans = oneapi::mkl::transpose::nontrans;
-
-    std::vector<fp> A(n * lda);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, n, n, A, lda);
-
-    std::vector<fp> tau(n);
-    std::vector<fp_real> d(n);
-    std::vector<fp_real> e(n);
-    auto info = reference::hetrd(uplo, n, A.data(), lda, d.data(), e.data(), tau.data());
-    if (0 != info) {
-        test_log::lout << "reference hetrd failed with info = " << info << std::endl;
-        return false;
-    }
-
-    std::vector<fp> C(n * ldc);
-    rand_matrix(seed, oneapi::mkl::transpose::nontrans, m, n, C, ldc);
-    std::vector<fp> C_initial = C;
-
-    /* Compute on device */
-    bool result;
-    {
-        sycl::queue queue{ dev, async_error_handler };
-
-        auto A_dev = device_alloc<data_T>(queue, A.size());
-        auto tau_dev = device_alloc<data_T>(queue, tau.size());
-        auto C_dev = device_alloc<data_T>(queue, C.size());
-#ifdef CALL_RT_API
-        const auto scratchpad_size = oneapi::mkl::lapack::unmtr_scratchpad_size<fp>(
-            queue, side, uplo, trans, m, n, lda, ldc);
-#else
-        int64_t scratchpad_size;
-        TEST_RUN_LAPACK_CT_SELECT(queue,
-                                  scratchpad_size = oneapi::mkl::lapack::unmtr_scratchpad_size<fp>,
-                                  side, uplo, trans, m, n, lda, ldc);
-#endif
-        auto scratchpad_dev = device_alloc<data_T>(queue, scratchpad_size);
-
-        host_to_device_copy(queue, A.data(), A_dev, A.size());
-        host_to_device_copy(queue, tau.data(), tau_dev, tau.size());
-        host_to_device_copy(queue, C.data(), C_dev, C.size());
-        queue.wait_and_throw();
-
-        /* Check dependency handling */
-        auto in_event = create_dependency(queue);
-#ifdef CALL_RT_API
-        sycl::event func_event = oneapi::mkl::lapack::unmtr(
-            queue, side, uplo, trans, m, n, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-            scratchpad_size, std::vector<sycl::event>{ in_event });
-#else
-        sycl::event func_event;
-        TEST_RUN_LAPACK_CT_SELECT(queue, func_event = oneapi::mkl::lapack::unmtr, side, uplo, trans,
-                                  m, n, A_dev, lda, tau_dev, C_dev, ldc, scratchpad_dev,
-                                  scratchpad_size, std::vector<sycl::event>{ in_event });
-#endif
-        result = check_dependency(queue, in_event, func_event);
-        queue.wait_and_throw();
-
-        queue.wait_and_throw();
-        device_free(queue, A_dev);
-        device_free(queue, tau_dev);
-        device_free(queue, C_dev);
-    }
-
-    return result;
-}
-
-InputTestController<decltype(::accuracy<void>)> accuracy_controller{ accuracy_input };
-InputTestController<decltype(::usm_dependency<void>)> dependency_controller{ dependency_input };
-
-} /* anonymous namespace */
-
-#include "lapack_gtest_suite.hpp"
-INSTANTIATE_GTEST_SUITE_ACCURACY_COMPLEX(Unmtr);
-INSTANTIATE_GTEST_SUITE_DEPENDENCY_COMPLEX(Unmtr);
diff --git a/tests/unit_tests/main_test.cpp b/tests/unit_tests/main_test.cpp
deleted file mode 100644
index bac3f8c83..000000000
--- a/tests/unit_tests/main_test.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <gtest/gtest.h>
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include <string>
-#include "test_helper.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "oneapi/mkl.hpp"
-
-#define MAX_STR 128
-
-using ::testing::EmptyTestEventListener;
-using ::testing::InitGoogleTest;
-using ::testing::Test;
-using ::testing::TestCase;
-using ::testing::TestEventListeners;
-using ::testing::TestInfo;
-using ::testing::TestPartResult;
-using ::testing::UnitTest;
-
-std::vector<sycl::device*> devices;
-
-std::string gtestInFile;
-
-namespace {
-// Provides alternative output mode which produces minimal amount of
-// information about tests.
-class TersePrinter : public EmptyTestEventListener {
-private:
-    // Called before any test activity starts.
-    void OnTestProgramStart(const UnitTest& /* unit_test */) override {}
-
-    // Called after all test activities have ended.
-    void OnTestProgramEnd(const UnitTest& unit_test) override {
-        fprintf(stdout, "TEST %s\n", unit_test.Passed() ? "PASSED" : "FAILED");
-        fflush(stdout);
-    }
-
-    // Called before a test starts.
-    void OnTestStart(const TestInfo& test_info) override {
-        fprintf(stdout, "*** Test %s.%s starting.\n", test_info.test_case_name(), test_info.name());
-        fflush(stdout);
-    }
-
-    // Called after a failed assertion or a SUCCEED() invocation.
-    void OnTestPartResult(const TestPartResult& test_part_result) override {
-        const char* file_name = test_part_result.file_name();
-        fprintf(stdout, "%s in %s:%d\n%s\n", test_part_result.failed() ? "*** Failure" : "Success",
-                file_name ? file_name : "unknown file", test_part_result.line_number(),
-                test_part_result.summary());
-        fflush(stdout);
-    }
-
-    // Called after a test ends.
-    void OnTestEnd(const TestInfo& test_info) override {
-        fprintf(stdout, "*** Test %s.%s ending.\n", test_info.test_case_name(), test_info.name());
-        fflush(stdout);
-    }
-}; // class TersePrinter
-
-} // anonymous namespace
-
-void print_error_code(sycl::exception const& e) {
-#ifdef __HIPSYCL__
-    std::cout << "Backend status: " << e.code() << std::endl;
-#else
-    std::cout << "OpenCL status: " << e.code() << std::endl;
-#endif
-}
-
-int main(int argc, char** argv) {
-    std::set<std::string> unique_devices;
-    std::vector<sycl::device> local_devices;
-
-    auto platforms = sycl::platform::get_platforms();
-    for (auto plat : platforms) {
-#ifdef __HIPSYCL__
-        if (!plat.is_host()) {
-#endif
-            auto plat_devs = plat.get_devices();
-            for (auto dev : plat_devs) {
-                try {
-                    /* Do not test for OpenCL backend on GPU */
-                    if (dev.is_gpu() && plat.get_info<sycl::info::platform::name>().find(
-                                            "OpenCL") != std::string::npos)
-                        continue;
-                    if (unique_devices.find(dev.get_info<sycl::info::device::name>()) ==
-                        unique_devices.end()) {
-                        unique_devices.insert(dev.get_info<sycl::info::device::name>());
-                        unsigned int vendor_id = static_cast<unsigned int>(
-                            dev.get_info<sycl::info::device::vendor_id>());
-#if !defined(ENABLE_MKLCPU_BACKEND) && !defined(ENABLE_PORTBLAS_BACKEND_INTEL_CPU) && \
-    !defined(ENABLE_PORTFFT_BACKEND)
-                        if (dev.is_cpu())
-                            continue;
-#endif
-#if !defined(ENABLE_MKLGPU_BACKEND) && !defined(ENABLE_PORTBLAS_BACKEND_INTEL_GPU) && \
-    !defined(ENABLE_PORTFFT_BACKEND)
-                        if (dev.is_gpu() && vendor_id == INTEL_ID)
-                            continue;
-#endif
-#if !defined(ENABLE_CUBLAS_BACKEND) && !defined(ENABLE_CURAND_BACKEND) &&                \
-    !defined(ENABLE_CUSOLVER_BACKEND) && !defined(ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU) && \
-    !defined(ENABLE_CUFFT_BACKEND) && !defined(ENABLE_PORTFFT_BACKEND)
-                        if (dev.is_gpu() && vendor_id == NVIDIA_ID)
-                            continue;
-#endif
-#if !defined(ENABLE_ROCBLAS_BACKEND) && !defined(ENABLE_ROCRAND_BACKEND) &&            \
-    !defined(ENABLE_ROCSOLVER_BACKEND) && !defined(ENABLE_PORTBLAS_BACKEND_AMD_GPU) && \
-    !defined(ENABLE_ROCFFT_BACKEND) && !defined(ENABLE_PORTFFT_BACKEND)
-                        if (dev.is_gpu() && vendor_id == AMD_ID)
-                            continue;
-#endif
-// clang-format off
-#ifdef __HIPSYCL__
-                        if (dev.is_accelerator())
-#else
-                        if (!dev.is_accelerator())
-// clang-format on
-#endif
-                            local_devices.push_back(dev);
-                    }
-                }
-                catch (std::exception const& e) {
-                    std::cout << "Exception while accessing device: " << e.what() << "\n";
-                }
-            }
-#ifdef __HIPSYCL__
-        }
-#endif
-    }
-
-#if defined(ENABLE_MKLCPU_BACKEND) || defined(ENABLE_NETLIB_BACKEND) || \
-    defined(ENABLE_PORTBLAS_BACKEND_INTEL_CPU)
-#ifdef __HIPSYCL__
-    local_devices.push_back(sycl::device(sycl::cpu_selector()));
-#else
-    local_devices.push_back(sycl::device(sycl::cpu_selector_v));
-#endif
-#endif
-#define GET_NAME(d) (d).template get_info<sycl::info::device::name>()
-    for (auto& local_dev : local_devices) {
-        // Test only unique devices
-        if (std::find_if(devices.begin(), devices.end(), [&](sycl::device* dev) {
-                return GET_NAME(*dev) == GET_NAME(local_dev);
-            }) == devices.end())
-            devices.push_back(&local_dev);
-    }
-
-    // start Google Test pickup and output
-    testing::InitGoogleTest(&argc, argv);
-
-    bool terse_output = false;
-    if (argc > 1 && strcmp(argv[1], "--terse_output") == 0)
-        terse_output = true;
-    else
-        printf("%s\n",
-               "Run this program with --terse_output to change the way it prints its output.");
-
-    for (int i = 0; i < argc; i++) {
-        if (strncmp(argv[i], "--input_file=", 13) == 0) {
-            std::string tmp(argv[i]);
-            gtestInFile = tmp.substr(13);
-            break;
-        }
-    }
-
-    UnitTest& unit_test = *UnitTest::GetInstance();
-
-    // If we are given the --terse_output command line flag, suppresses the
-    // standard output and attaches own result printer.
-    if (terse_output) {
-        TestEventListeners& listeners = unit_test.listeners();
-
-        // Removes the default console output listener from the list so it will
-        // not receive events from Google Test and won't print any output. Since
-        // this operation transfers ownership of the listener to the caller we
-        // have to delete it as well.
-        delete listeners.Release(listeners.default_result_printer());
-
-        // Adds the custom output listener to the list. It will now receive
-        // events from Google Test and print the alternative output. We don't
-        // have to worry about deleting it since Google Test assumes ownership
-        // over it after adding it to the list.
-        listeners.Append(new TersePrinter);
-    }
-    int ret_val = RUN_ALL_TESTS();
-
-    // This is an example of using the UnitTest reflection API to inspect test
-    // results. Here we discount failures from the tests we expected to fail.
-    int unexpectedly_failed_tests = 0;
-    for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
-        const TestCase& test_case = *unit_test.GetTestCase(i);
-        for (int j = 0; j < test_case.total_test_count(); ++j) {
-            const TestInfo& test_info = *test_case.GetTestInfo(j);
-            // Counts failed tests that were not meant to fail (those without
-            // 'Fails' in the name).
-            if (test_info.result()->Failed() && strcmp(test_info.name(), "Fails") != 0) {
-                unexpectedly_failed_tests++;
-            }
-        }
-    }
-
-    // Test that were meant to fail should not affect the test program outcome.
-    if (unexpectedly_failed_tests == 0)
-        ret_val = 0;
-
-    return ret_val;
-}
diff --git a/tests/unit_tests/rng/CMakeLists.txt b/tests/unit_tests/rng/CMakeLists.txt
deleted file mode 100644
index a2f077d35..000000000
--- a/tests/unit_tests/rng/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(device)
-add_subdirectory(service)
-add_subdirectory(statistics_check)
diff --git a/tests/unit_tests/rng/device/CMakeLists.txt b/tests/unit_tests/rng/device/CMakeLists.txt
deleted file mode 100644
index e3f36d972..000000000
--- a/tests/unit_tests/rng/device/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(moments)
-add_subdirectory(service)
diff --git a/tests/unit_tests/rng/device/include/moments.hpp b/tests/unit_tests/rng/device/include/moments.hpp
deleted file mode 100644
index 8acf20bf9..000000000
--- a/tests/unit_tests/rng/device/include/moments.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       oneapi::mkl::rng::device:: distributions moments test (SYCL interface)
-*
-*******************************************************************************/
-
-#ifndef _RNG_DEVICE_DISTR_MOMENTS_TEST_HPP_
-#define _RNG_DEVICE_DISTR_MOMENTS_TEST_HPP_
-
-#include <iostream>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/rng/device.hpp"
-
-#include "rng_device_test_common.hpp"
-
-template <class Engine, class Distribution>
-class moments_test {
-public:
-    template <typename Queue>
-    void operator()(Queue queue) {
-        // Note: the following methods of discrete distributions require double precision support
-        if ((std::is_same_v<
-                 Distribution,
-                 oneapi::mkl::rng::device::uniform<
-                     std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>> ||
-             std::is_same_v<
-                 Distribution,
-                 oneapi::mkl::rng::device::uniform<
-                     std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>> ||
-             std::is_same_v<Distribution, oneapi::mkl::rng::device::poisson<
-                                              std::uint32_t,
-                                              oneapi::mkl::rng::device::poisson_method::devroye>> ||
-             std::is_same_v<
-                 Distribution,
-                 oneapi::mkl::rng::device::poisson<
-                     std::int32_t, oneapi::mkl::rng::device::poisson_method::devroye>>)&&!queue
-                .get_device()
-                .has(sycl::aspect::fp64)) {
-            status = test_skipped;
-            return;
-        }
-        using Type = typename Distribution::result_type;
-        // prepare array for random numbers
-        std::vector<Type> r(N_GEN);
-
-        try {
-            sycl::range<1> range(N_GEN / Engine::vec_size);
-
-            sycl::buffer<Type> buf(r);
-            auto event = queue.submit([&](sycl::handler& cgh) {
-                sycl::accessor acc(buf, cgh, sycl::write_only);
-                cgh.parallel_for(range, [=](sycl::item<1> item) {
-                    size_t id = item.get_id(0);
-                    auto multiplier = Engine::vec_size;
-                    if constexpr (std::is_same_v<Distribution,
-                                                 oneapi::mkl::rng::device::uniform_bits<uint64_t>>)
-                        multiplier *= 2;
-                    Engine engine(SEED, id * multiplier);
-                    Distribution distr;
-                    auto res = oneapi::mkl::rng::device::generate(distr, engine);
-                    if constexpr (Engine::vec_size == 1) {
-                        acc[id] = res;
-                    }
-                    else {
-                        res.store(id, get_multi_ptr(acc));
-                    }
-                });
-            });
-            event.wait_and_throw();
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "SYCL exception during generation" << std::endl
-                      << e.what() << std::endl
-                      << "Error code: " << get_error_code(e) << std::endl;
-            status = test_failed;
-            return;
-        }
-
-        // validation (statistics check is turned out for mcg59)
-        if constexpr (!std::is_same<Engine,
-                                    oneapi::mkl::rng::device::mcg59<Engine::vec_size>>::value) {
-            statistics_device<Distribution> stat;
-            status = stat.check(r, Distribution{});
-        }
-        return;
-    }
-
-    int status = test_passed;
-};
-
-#endif // _RNG_DEVICE_DISTR_MOMENTS_TEST_HPP_
diff --git a/tests/unit_tests/rng/device/include/rng_device_test_common.hpp b/tests/unit_tests/rng/device/include/rng_device_test_common.hpp
deleted file mode 100644
index 6b014f0ec..000000000
--- a/tests/unit_tests/rng/device/include/rng_device_test_common.hpp
+++ /dev/null
@@ -1,342 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_DEVICE_TEST_COMMON_HPP__
-#define _RNG_DEVICE_TEST_COMMON_HPP__
-
-#include <iostream>
-#include <limits>
-
-#include "test_helper.hpp"
-
-#define SEED  777
-#define N_GEN 960
-
-// Defines for skip_ahead and leapfrog tests
-#define N_ENGINES     5
-#define N_PORTION     100
-#define N_GEN_SERVICE (N_ENGINES * N_PORTION)
-
-// defines for skip_ahead_ex tests
-#define N_SKIP     ((std::uint64_t)pow(2, 62))
-#define SKIP_TIMES ((std::int32_t)pow(2, 14))
-#define NUM_TO_SKIP \
-    { 0, (std::uint64_t)pow(2, 12) }
-
-// Correctness checking.
-static inline bool check_equal_device(float x, float x_ref) {
-    float bound = std::numeric_limits<float>::epsilon();
-    float aerr = std::abs(x - x_ref);
-    return (aerr <= bound);
-}
-
-static inline bool check_equal_device(double x, double x_ref) {
-    double bound = std::numeric_limits<double>::epsilon();
-    double aerr = std::abs(x - x_ref);
-    return (aerr <= bound);
-}
-
-static inline bool check_equal_device(std::uint32_t x, std::uint32_t x_ref) {
-    return x == x_ref;
-}
-
-static inline bool check_equal_device(std::uint64_t x, std::uint64_t x_ref) {
-    return x == x_ref;
-}
-
-template <typename Fp, typename AllocType>
-static inline bool check_equal_vector_device(std::vector<Fp, AllocType>& r1,
-                                             std::vector<Fp, AllocType>& r2) {
-    bool good = true;
-    for (int i = 0; i < r1.size(); i++) {
-        if (!check_equal_device(r1[i], r2[i])) {
-            good = false;
-            break;
-        }
-    }
-    return good;
-}
-
-template <typename Test>
-class rng_device_test {
-public:
-    // method to call any tests, switch between rt and ct
-    template <typename... Args>
-    int operator()(sycl::device* dev, Args... args) {
-        auto exception_handler = [](sycl::exception_list exceptions) {
-            for (std::exception_ptr const& e : exceptions) {
-                try {
-                    std::rethrow_exception(e);
-                }
-                catch (sycl::exception const& e) {
-                    std::cout << "Caught asynchronous SYCL exception during ASUM:\n"
-                              << e.what() << std::endl;
-                    print_error_code(e);
-                }
-            }
-        };
-
-        sycl::queue queue(*dev, exception_handler);
-
-        test_(queue, args...);
-
-        return test_.status;
-    }
-
-protected:
-    Test test_;
-};
-
-template <typename T, typename = void>
-struct has_member_code_meta : std::false_type {};
-
-template <typename T>
-struct has_member_code_meta<T, std::void_t<decltype(std::declval<T>().get_multi_ptr())>>
-        : std::true_type {};
-
-template <typename T, typename std::enable_if<has_member_code_meta<T>::value>::type* = nullptr>
-auto get_multi_ptr(T acc) {
-#ifndef __HIPSYCL__
-    return acc.get_multi_ptr();
-#else
-    return acc.get_pointer();
-#endif
-};
-
-template <typename T, typename std::enable_if<!has_member_code_meta<T>::value>::type* = nullptr>
-auto get_multi_ptr(T acc) {
-#ifndef __HIPSYCL__
-    return acc.template get_multi_ptr<sycl::access::decorated::yes>();
-#else
-    return acc.get_pointer();
-#endif
-};
-
-template <typename T>
-auto get_error_code(T x) {
-    return x.code().value();
-};
-
-template <typename Fp, typename AllocType>
-bool compare_moments(const std::vector<Fp, AllocType>& r, double tM, double tD, double tQ) {
-    double tD2;
-    double sM, sD;
-    double sum, sum2;
-    double n, s;
-    double DeltaM, DeltaD;
-
-    // sample moments
-    sum = 0.0;
-    sum2 = 0.0;
-    for (int i = 0; i < N_GEN; i++) {
-        sum += (double)r[i];
-        sum2 += (double)r[i] * (double)r[i];
-    }
-    sM = sum / ((double)N_GEN);
-    sD = sum2 / (double)N_GEN - (sM * sM);
-
-    // Comparison of theoretical and sample moments
-    n = (double)N_GEN;
-    tD2 = tD * tD;
-    s = ((tQ - tD2) / n) - (2 * (tQ - 2 * tD2) / (n * n)) + ((tQ - 3 * tD2) / (n * n * n));
-
-    DeltaM = (tM - sM) / std::sqrt(tD / n);
-    DeltaD = (tD - sD) / std::sqrt(s);
-    if (fabs(DeltaM) > 3.0 || fabs(DeltaD) > 10.0) {
-        std::cout << "Error: sample moments (mean=" << sM << ", variance=" << sD
-                  << ") disagree with theory (mean=" << tM << ", variance=" << tD << ")"
-                  << " N_GEN = " << N_GEN << std::endl;
-        return false;
-    }
-    return true;
-}
-
-template <typename Distribution>
-struct statistics_device {};
-
-template <typename Fp, typename Method>
-struct statistics_device<oneapi::mkl::rng::device::uniform<Fp, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::uniform<Fp, Method>& distr) {
-        double tM, tD, tQ;
-        Fp a = distr.a();
-        Fp b = distr.b();
-
-        // Theoretical moments
-        tM = (b + a) / 2.0;
-        tD = ((b - a) * (b - a)) / 12.0;
-        tQ = ((b - a) * (b - a) * (b - a) * (b - a)) / 80.0;
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Method>
-struct statistics_device<oneapi::mkl::rng::device::uniform<std::int32_t, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<int32_t, AllocType>& r,
-               const oneapi::mkl::rng::device::uniform<int32_t, Method>& distr) {
-        double tM, tD, tQ;
-        double a = distr.a();
-        double b = distr.b();
-
-        // Theoretical moments
-        tM = (a + b - 1.0) / 2.0;
-        tD = ((b - a) * (b - a) - 1.0) / 12.0;
-        tQ = (((b - a) * (b - a)) * ((1.0 / 80.0) * (b - a) * (b - a) - (1.0 / 24.0))) +
-             (7.0 / 240.0);
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Method>
-struct statistics_device<oneapi::mkl::rng::device::uniform<std::uint32_t, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<uint32_t, AllocType>& r,
-               const oneapi::mkl::rng::device::uniform<uint32_t, Method>& distr) {
-        double tM, tD, tQ;
-        double a = distr.a();
-        double b = distr.b();
-
-        // Theoretical moments
-        tM = (a + b - 1.0) / 2.0;
-        tD = ((b - a) * (b - a) - 1.0) / 12.0;
-        tQ = (((b - a) * (b - a)) * ((1.0 / 80.0) * (b - a) * (b - a) - (1.0 / 24.0))) +
-             (7.0 / 240.0);
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Fp, typename Method>
-struct statistics_device<oneapi::mkl::rng::device::gaussian<Fp, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::gaussian<Fp, Method>& distr) {
-        double tM, tD, tQ;
-        Fp a = distr.mean();
-        Fp sigma = distr.stddev();
-
-        // Theoretical moments
-        tM = a;
-        tD = sigma * sigma;
-        tQ = 720.0 * sigma * sigma * sigma * sigma;
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Fp, typename Method>
-struct statistics_device<oneapi::mkl::rng::device::lognormal<Fp, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::lognormal<Fp, Method>& distr) {
-        double tM, tD, tQ;
-        Fp a = distr.m();
-        Fp b = distr.displ();
-        Fp sigma = distr.s();
-        Fp beta = distr.scale();
-
-        // Theoretical moments
-        tM = b + beta * std::exp(a + sigma * sigma * 0.5);
-        tD = beta * beta * std::exp(2.0 * a + sigma * sigma) * (std::exp(sigma * sigma) - 1.0);
-        tQ = beta * beta * beta * beta * std::exp(4.0 * a + 2.0 * sigma * sigma) *
-             (std::exp(6.0 * sigma * sigma) - 4.0 * std::exp(3.0 * sigma * sigma) +
-              6.0 * std::exp(sigma * sigma) - 3.0);
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Fp, typename Method>
-struct statistics_device<oneapi::mkl::rng::device::exponential<Fp, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::exponential<Fp, Method>& distr) {
-        double tM, tD, tQ;
-        Fp a = distr.a();
-        Fp beta = distr.beta();
-
-        tM = a + beta;
-        tD = beta * beta;
-        tQ = 9.0 * beta * beta * beta * beta;
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Fp, typename Method>
-struct statistics_device<oneapi::mkl::rng::device::poisson<Fp, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::poisson<Fp, Method>& distr) {
-        double tM, tD, tQ;
-        double lambda = distr.lambda();
-
-        tM = lambda;
-        tD = lambda;
-        tQ = 4 * lambda * lambda + lambda;
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Fp, typename Method>
-struct statistics_device<oneapi::mkl::rng::device::bernoulli<Fp, Method>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::bernoulli<Fp, Method>& distr) {
-        double tM, tD, tQ;
-        double p = static_cast<double>(distr.p());
-
-        tM = p;
-        tD = p * (1.0 - p);
-        tQ = p * (1.0 - 4.0 * p + 6.0 * p * p - 3.0 * p * p * p);
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Fp>
-struct statistics_device<oneapi::mkl::rng::device::bits<Fp>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::bits<Fp>& distr) {
-        return true;
-    }
-};
-
-template <typename Fp>
-struct statistics_device<oneapi::mkl::rng::device::uniform_bits<Fp>> {
-    template <typename AllocType>
-    bool check(const std::vector<Fp, AllocType>& r,
-               const oneapi::mkl::rng::device::uniform_bits<Fp>& distr) {
-        return true;
-    }
-};
-
-template <typename Engine>
-struct is_mcg59 : std::false_type {};
-
-template <std::int32_t VecSize>
-struct is_mcg59<oneapi::mkl::rng::device::mcg59<VecSize>> : std::true_type {};
-
-#endif // _RNG_DEVICE_TEST_COMMON_HPP__
diff --git a/tests/unit_tests/rng/device/include/skip_ahead_test.hpp b/tests/unit_tests/rng/device/include/skip_ahead_test.hpp
deleted file mode 100644
index 0b3bcf8a7..000000000
--- a/tests/unit_tests/rng/device/include/skip_ahead_test.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-/*
-*
-*  Content:
-*       oneapi::mkl::rng::device:: engines skip_ahead and skip_ahead_ex tests
-*       (SYCL interface)
-*
-*******************************************************************************/
-
-#ifndef _RNG_DEVICE_SKIP_AHEAD_TEST_HPP__
-#define _RNG_DEVICE_SKIP_AHEAD_TEST_HPP__
-
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl/rng/device.hpp"
-
-#include "rng_device_test_common.hpp"
-
-template <typename Engine>
-class skip_ahead_test {
-public:
-    template <typename Queue>
-    void operator()(Queue queue) {
-        using UIntType = std::conditional_t<is_mcg59<Engine>::value, std::uint64_t, std::uint32_t>;
-
-        std::vector<UIntType> r(N_GEN);
-        std::vector<UIntType> r_ref(N_GEN);
-
-        try {
-            sycl::range<1> range(N_GEN / Engine::vec_size);
-
-            sycl::buffer<UIntType> buf(r);
-            auto event = queue.submit([&](sycl::handler& cgh) {
-                sycl::accessor acc(buf, cgh, sycl::write_only);
-                cgh.parallel_for(range, [=](sycl::item<1> item) {
-                    size_t id = item.get_id(0);
-                    Engine engine(SEED);
-                    oneapi::mkl::rng::device::skip_ahead(engine, id * Engine::vec_size);
-                    oneapi::mkl::rng::device::bits<UIntType> distr;
-                    auto res = oneapi::mkl::rng::device::generate(distr, engine);
-                    if constexpr (Engine::vec_size == 1) {
-                        acc[id] = res;
-                    }
-                    else {
-                        res.store(id, get_multi_ptr(acc));
-                    }
-                });
-            });
-            event.wait_and_throw();
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "SYCL exception during generation" << std::endl
-                      << e.what() << std::endl
-                      << "Error code: " << get_error_code(e) << std::endl;
-            status = test_failed;
-            return;
-        }
-
-        // validation
-        Engine engine(SEED);
-        oneapi::mkl::rng::device::bits<UIntType> distr;
-        for (int i = 0; i < N_GEN; i += Engine::vec_size) {
-            auto res = oneapi::mkl::rng::device::generate(distr, engine);
-            if constexpr (Engine::vec_size == 1) {
-                r_ref[i] = res;
-            }
-            else {
-                for (int j = 0; j < Engine::vec_size; ++j) {
-                    r_ref[i + j] = res[j];
-                }
-            }
-        }
-
-        status = check_equal_vector_device(r, r_ref);
-    }
-
-    int status = test_passed;
-};
-
-template <class Engine>
-class skip_ahead_ex_test {
-public:
-    template <typename Queue>
-    void operator()(Queue queue) {
-        std::vector<std::uint32_t> r(N_GEN);
-        std::vector<std::uint32_t> r_ref(N_GEN);
-
-        try {
-            sycl::range<1> range(N_GEN / Engine::vec_size);
-
-            sycl::buffer<std::uint32_t, 1> buf(r);
-            std::uint64_t skip_num = (std::uint64_t)pow(2, 12);
-            auto event = queue.submit([&](sycl::handler& cgh) {
-                sycl::accessor acc(buf, cgh, sycl::write_only);
-                cgh.parallel_for(range, [=](sycl::item<1> item) {
-                    size_t id = item.get_id(0);
-                    Engine engine(SEED);
-                    oneapi::mkl::rng::device::skip_ahead(engine,
-                                                         { id * Engine::vec_size, skip_num });
-                    oneapi::mkl::rng::device::bits<> distr;
-                    auto res = oneapi::mkl::rng::device::generate(distr, engine);
-                    if constexpr (Engine::vec_size == 1) {
-                        acc[id] = res;
-                    }
-                    else {
-                        res.store(id, get_multi_ptr(acc));
-                    }
-                });
-            });
-            event.wait_and_throw();
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "SYCL exception during generation" << std::endl
-                      << e.what() << std::endl
-                      << "Error code: " << get_error_code(e) << std::endl;
-            status = test_failed;
-            return;
-        }
-
-        // validation
-        Engine engine(SEED);
-        for (int j = 0; j < SKIP_TIMES; j++) {
-            oneapi::mkl::rng::device::skip_ahead(engine, N_SKIP);
-        }
-        oneapi::mkl::rng::device::bits<> distr;
-        for (int i = 0; i < N_GEN; i += Engine::vec_size) {
-            auto res = oneapi::mkl::rng::device::generate(distr, engine);
-            if constexpr (Engine::vec_size == 1) {
-                r_ref[i] = res;
-            }
-            else {
-                for (int j = 0; j < Engine::vec_size; ++j) {
-                    r_ref[i + j] = res[j];
-                }
-            }
-        }
-
-        status = check_equal_vector_device(r, r_ref);
-    }
-
-    int status = test_passed;
-};
-
-#endif // _RNG_DEVICE_SKIP_AHEAD_TEST_HPP__
diff --git a/tests/unit_tests/rng/device/moments/CMakeLists.txt b/tests/unit_tests/rng/device/moments/CMakeLists.txt
deleted file mode 100644
index 2da8033bf..000000000
--- a/tests/unit_tests/rng/device/moments/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(MOMENTS_DEVICE_TESTS_SOURCES "moments.cpp")
-
-add_library(rng_device_moments_ct OBJECT ${MOMENTS_DEVICE_TESTS_SOURCES})
-target_compile_options(rng_device_moments_ct PRIVATE -DNOMINMAX)
-target_include_directories(rng_device_moments_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET rng_device_moments_ct SOURCES ${MOMENTS_DEVICE_TESTS_SOURCES})
-else()
-  target_link_libraries(rng_device_moments_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
-if(NOT ${ONEMKL_SYCL_IMPLEMENTATION} STREQUAL "hipsycl")
-  target_link_options(rng_device_moments_ct PUBLIC -fsycl -fsycl-device-code-split=per_kernel)
-endif()
diff --git a/tests/unit_tests/rng/device/moments/moments.cpp b/tests/unit_tests/rng/device/moments/moments.cpp
deleted file mode 100644
index 36ce38ee8..000000000
--- a/tests/unit_tests/rng/device/moments/moments.cpp
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "moments.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class Philox4x32x10UniformStdDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Philox4x32x10UniformAccDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10UniformStdDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformStdDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformStdDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformStdDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformAccDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformAccDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformAccDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformAccDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10UniformStdDeviceMomentsTestsSuite,
-                         Philox4x32x10UniformStdDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10UniformAccDeviceMomentsTestsSuite,
-                         Philox4x32x10UniformAccDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-class Mrg32k3aUniformStdDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Mrg32k3aUniformAccDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mrg32k3aUniformStdDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aUniformStdDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aUniformStdDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aUniformStdDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aUniformAccDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aUniformAccDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aUniformAccDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aUniformAccDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mrg32k3a<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aUniformStdDeviceMomentsTestsSuite,
-                         Mrg32k3aUniformStdDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aUniformAccDeviceMomentsTestsSuite,
-                         Mrg32k3aUniformAccDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-class Mcg31m1UniformStdDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Mcg31m1UniformAccDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mcg31m1UniformStdDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg31m1UniformStdDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg31m1UniformStdDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg31m1UniformStdDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg31m1UniformAccDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg31m1UniformAccDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg31m1UniformAccDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg31m1UniformAccDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg31m1<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mcg31m1UniformStdDeviceMomentsTestsSuite,
-                         Mcg31m1UniformStdDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Mcg31m1UniformAccDeviceMomentsTestsSuite,
-                         Mcg31m1UniformAccDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-class Mcg59UniformStdDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Mcg59UniformAccDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mcg59UniformStdDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg59UniformStdDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg59UniformStdDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg59UniformStdDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::standard>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg59UniformAccDeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     float, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg59UniformAccDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                                 oneapi::mkl::rng::device::uniform<
-                                     double, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg59UniformAccDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::int32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mcg59UniformAccDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<1>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<4>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::mcg59<16>,
-                     oneapi::mkl::rng::device::uniform<
-                         std::uint32_t, oneapi::mkl::rng::device::uniform_method::accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mcg59UniformStdDeviceMomentsTestsSuite, Mcg59UniformStdDeviceMomentsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Mcg59UniformAccDeviceMomentsTestsSuite, Mcg59UniformAccDeviceMomentsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Philox4x32x10BitsDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10BitsDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::bits<uint32_t>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::bits<uint32_t>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::bits<uint32_t>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10BitsDeviceMomentsTestsSuite,
-                         Philox4x32x10BitsDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-class Philox4x32x10UniformBitsDeviceMomentsTests : public ::testing::TestWithParam<sycl::device*> {
-};
-
-TEST_P(Philox4x32x10UniformBitsDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::uniform_bits<uint32_t>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::uniform_bits<uint32_t>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::uniform_bits<uint32_t>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10UniformBitsDeviceMomentsTests, UnsignedLongIntegerPrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::uniform_bits<uint64_t>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::uniform_bits<uint64_t>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::uniform_bits<uint64_t>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10UniformBitsDeviceMomentsTestsSuite,
-                         Philox4x32x10UniformBitsDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-class Philox4x32x10GaussianBoxMuller2DeviceMomentsTests
-        : public ::testing::TestWithParam<sycl::device*> {};
-
-// implementation uses double precision for accuracy
-TEST_P(Philox4x32x10GaussianBoxMuller2DeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::gaussian<
-                         float, oneapi::mkl::rng::device::gaussian_method::box_muller2>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::gaussian<
-                         float, oneapi::mkl::rng::device::gaussian_method::box_muller2>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::gaussian<
-                         float, oneapi::mkl::rng::device::gaussian_method::box_muller2>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::gaussian<
-                         double, oneapi::mkl::rng::device::gaussian_method::box_muller2>>>
-        test4;
-    EXPECT_TRUEORSKIP((test4(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::gaussian<
-                         double, oneapi::mkl::rng::device::gaussian_method::box_muller2>>>
-        test5;
-    EXPECT_TRUEORSKIP((test5(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::gaussian<
-                         double, oneapi::mkl::rng::device::gaussian_method::box_muller2>>>
-        test6;
-    EXPECT_TRUEORSKIP((test6(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10GaussianBoxMuller2DeviceMomentsTestsSuite,
-                         Philox4x32x10GaussianBoxMuller2DeviceMomentsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Philox4x32x10LognormalBoxMuller2DeviceMomentsTests
-        : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10LognormalBoxMuller2DeviceMomentsTests, RealSinglePrecision) {
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::lognormal<
-                         float, oneapi::mkl::rng::device::lognormal_method::box_muller2>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::lognormal<
-                         float, oneapi::mkl::rng::device::lognormal_method::box_muller2>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::lognormal<
-                         float, oneapi::mkl::rng::device::lognormal_method::box_muller2>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10LognormalBoxMuller2DeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::lognormal<
-                         double, oneapi::mkl::rng::device::lognormal_method::box_muller2>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::lognormal<
-                         double, oneapi::mkl::rng::device::lognormal_method::box_muller2>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::lognormal<
-                         double, oneapi::mkl::rng::device::lognormal_method::box_muller2>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10LognormalBoxMuller2DeviceMomentsTestsSuite,
-                         Philox4x32x10LognormalBoxMuller2DeviceMomentsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Philox4x32x10ExponentialIcdfDeviceMomentsTests
-        : public ::testing::TestWithParam<sycl::device*> {};
-
-class Philox4x32x10ExponentialIcdfAccDeviceMomentsTests
-        : public ::testing::TestWithParam<sycl::device*> {};
-
-// implementation uses double precision for accuracy
-TEST_P(Philox4x32x10ExponentialIcdfDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::exponential<
-                                     float, oneapi::mkl::rng::device::exponential_method::icdf>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::exponential<
-                                     float, oneapi::mkl::rng::device::exponential_method::icdf>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::exponential<
-                                     float, oneapi::mkl::rng::device::exponential_method::icdf>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::exponential<
-                                     double, oneapi::mkl::rng::device::exponential_method::icdf>>>
-        test4;
-    EXPECT_TRUEORSKIP((test4(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::exponential<
-                                     double, oneapi::mkl::rng::device::exponential_method::icdf>>>
-        test5;
-    EXPECT_TRUEORSKIP((test5(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::exponential<
-                                     double, oneapi::mkl::rng::device::exponential_method::icdf>>>
-        test6;
-    EXPECT_TRUEORSKIP((test6(GetParam())));
-}
-
-// implementation uses double precision for accuracy
-TEST_P(Philox4x32x10ExponentialIcdfAccDeviceMomentsTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::exponential<
-                         float, oneapi::mkl::rng::device::exponential_method::icdf_accurate>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::exponential<
-                         float, oneapi::mkl::rng::device::exponential_method::icdf_accurate>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::exponential<
-                         float, oneapi::mkl::rng::device::exponential_method::icdf_accurate>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                     oneapi::mkl::rng::device::exponential<
-                         double, oneapi::mkl::rng::device::exponential_method::icdf_accurate>>>
-        test4;
-    EXPECT_TRUEORSKIP((test4(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                     oneapi::mkl::rng::device::exponential<
-                         double, oneapi::mkl::rng::device::exponential_method::icdf_accurate>>>
-        test5;
-    EXPECT_TRUEORSKIP((test5(GetParam())));
-    rng_device_test<
-        moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                     oneapi::mkl::rng::device::exponential<
-                         double, oneapi::mkl::rng::device::exponential_method::icdf_accurate>>>
-        test6;
-    EXPECT_TRUEORSKIP((test6(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10ExponentialIcdfDeviceMomentsTestsSuite,
-                         Philox4x32x10ExponentialIcdfDeviceMomentsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10ExponentialIcdfAccDeviceMomentsTestsSuite,
-                         Philox4x32x10ExponentialIcdfAccDeviceMomentsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Philox4x32x10PoissonDevroyeDeviceMomentsTests
-        : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10PoissonDevroyeDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::poisson<
-                                     int32_t, oneapi::mkl::rng::device::poisson_method::devroye>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::poisson<
-                                     int32_t, oneapi::mkl::rng::device::poisson_method::devroye>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::poisson<
-                                     int32_t, oneapi::mkl::rng::device::poisson_method::devroye>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10PoissonDevroyeDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::poisson<
-                                     uint32_t, oneapi::mkl::rng::device::poisson_method::devroye>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::poisson<
-                                     uint32_t, oneapi::mkl::rng::device::poisson_method::devroye>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::poisson<
-                                     uint32_t, oneapi::mkl::rng::device::poisson_method::devroye>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10PoissonDevroyeDeviceMomentsTestsSuite,
-                         Philox4x32x10PoissonDevroyeDeviceMomentsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Philox4x32x10BernoulliIcdfDeviceMomentsTests
-        : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10BernoulliIcdfDeviceMomentsTests, IntegerPrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::bernoulli<
-                                     int32_t, oneapi::mkl::rng::device::bernoulli_method::icdf>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::bernoulli<
-                                     int32_t, oneapi::mkl::rng::device::bernoulli_method::icdf>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::bernoulli<
-                                     int32_t, oneapi::mkl::rng::device::bernoulli_method::icdf>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10BernoulliIcdfDeviceMomentsTests, UnsignedIntegerPrecision) {
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<1>,
-                                 oneapi::mkl::rng::device::bernoulli<
-                                     uint32_t, oneapi::mkl::rng::device::bernoulli_method::icdf>>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<4>,
-                                 oneapi::mkl::rng::device::bernoulli<
-                                     uint32_t, oneapi::mkl::rng::device::bernoulli_method::icdf>>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<moments_test<oneapi::mkl::rng::device::philox4x32x10<16>,
-                                 oneapi::mkl::rng::device::bernoulli<
-                                     uint32_t, oneapi::mkl::rng::device::bernoulli_method::icdf>>>
-        test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10BernoulliIcdfDeviceMomentsTestsSuite,
-                         Philox4x32x10BernoulliIcdfDeviceMomentsTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/device/service/CMakeLists.txt b/tests/unit_tests/rng/device/service/CMakeLists.txt
deleted file mode 100644
index 03d960e1a..000000000
--- a/tests/unit_tests/rng/device/service/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(SERVICE_DEVICE_TESTS_SOURCES "skip_ahead.cpp")
-
-add_library(rng_device_service_ct OBJECT ${SERVICE_DEVICE_TESTS_SOURCES})
-target_compile_options(rng_device_service_ct PRIVATE -DNOMINMAX)
-target_include_directories(rng_device_service_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET rng_device_service_ct SOURCES ${SERVICE_DEVICE_TESTS_SOURCES})
-else()
-  target_link_libraries(rng_device_service_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
-
-if(NOT ${ONEMKL_SYCL_IMPLEMENTATION} STREQUAL "hipsycl")
-  target_link_options(rng_device_service_ct PUBLIC -fsycl -fsycl-device-code-split=per_kernel)
-endif()
diff --git a/tests/unit_tests/rng/device/service/skip_ahead.cpp b/tests/unit_tests/rng/device/service/skip_ahead.cpp
deleted file mode 100644
index a5dfe0da8..000000000
--- a/tests/unit_tests/rng/device/service/skip_ahead.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "skip_ahead_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class Philox4x32x10DeviceSkipAheadTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Philox4x32x10DeviceSkipAheadExTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10DeviceSkipAheadTests, BinaryPrecision) {
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::philox4x32x10<1>>> test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::philox4x32x10<4>>> test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::philox4x32x10<16>>> test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Philox4x32x10DeviceSkipAheadExTests, BinaryPrecision) {
-    rng_device_test<skip_ahead_ex_test<oneapi::mkl::rng::device::philox4x32x10<1>>> test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<skip_ahead_ex_test<oneapi::mkl::rng::device::philox4x32x10<4>>> test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<skip_ahead_ex_test<oneapi::mkl::rng::device::philox4x32x10<16>>> test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10DeviceSkipAheadTestsSuite, Philox4x32x10DeviceSkipAheadTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10DeviceSkipAheadExTestsSuite,
-                         Philox4x32x10DeviceSkipAheadExTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-class Mrg32k3aDeviceSkipAheadTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Mrg32k3aDeviceSkipAheadExTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mrg32k3aDeviceSkipAheadTests, BinaryPrecision) {
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mrg32k3a<1>>> test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mrg32k3a<4>>> test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mrg32k3a<16>>> test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-TEST_P(Mrg32k3aDeviceSkipAheadExTests, BinaryPrecision) {
-    rng_device_test<skip_ahead_ex_test<oneapi::mkl::rng::device::mrg32k3a<1>>> test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<skip_ahead_ex_test<oneapi::mkl::rng::device::mrg32k3a<4>>> test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<skip_ahead_ex_test<oneapi::mkl::rng::device::mrg32k3a<16>>> test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aDeviceSkipAheadTestsSuite, Mrg32k3aDeviceSkipAheadTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aDeviceSkipAheadExTestsSuite, Mrg32k3aDeviceSkipAheadExTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Mcg31m1DeviceSkipAheadTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mcg31m1DeviceSkipAheadTests, BinaryPrecision) {
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mcg31m1<1>>> test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mcg31m1<4>>> test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mcg31m1<16>>> test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mcg31m1DeviceSkipAheadTestsSuite, Mcg31m1DeviceSkipAheadTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Mcg59DeviceSkipAheadTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mcg59DeviceSkipAheadTests, BinaryPrecision) {
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mcg59<1>>> test1;
-    EXPECT_TRUEORSKIP((test1(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mcg59<4>>> test2;
-    EXPECT_TRUEORSKIP((test2(GetParam())));
-    rng_device_test<skip_ahead_test<oneapi::mkl::rng::device::mcg59<16>>> test3;
-    EXPECT_TRUEORSKIP((test3(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mcg59DeviceSkipAheadTestsSuite, Mcg59DeviceSkipAheadTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // namespace
diff --git a/tests/unit_tests/rng/include/engines_api_tests.hpp b/tests/unit_tests/rng/include/engines_api_tests.hpp
deleted file mode 100644
index 2469c3023..000000000
--- a/tests/unit_tests/rng/include/engines_api_tests.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_ENGINES_API_TESTS_HPP__
-#define _RNG_ENGINES_API_TESTS_HPP__
-
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-
-#include "rng_test_common.hpp"
-
-template <typename Engine>
-class engines_constructors_test {
-public:
-    template <typename Queue, typename... Args>
-    void operator()(Queue queue, Args... args) {
-        // Prepare arrays for random numbers
-        std::vector<std::uint32_t> r1(N_GEN);
-        std::vector<std::uint32_t> r2(N_GEN);
-        std::vector<std::uint32_t> r3(N_GEN);
-        std::vector<std::uint32_t> r4(N_GEN);
-
-        try {
-            // Initialize rng objects
-            Engine engine1(queue, SEED);
-            Engine engine2(queue, args...);
-            Engine engine3(engine1);
-            Engine engine4 = std::move(Engine(queue, SEED));
-
-            oneapi::mkl::rng::bits<std::uint32_t> distr;
-
-            sycl::buffer<std::uint32_t, 1> r1_buffer(r1.data(), r1.size());
-            sycl::buffer<std::uint32_t, 1> r2_buffer(r2.data(), r2.size());
-            sycl::buffer<std::uint32_t, 1> r3_buffer(r3.data(), r3.size());
-            sycl::buffer<std::uint32_t, 1> r4_buffer(r4.data(), r4.size());
-
-            oneapi::mkl::rng::generate(distr, engine1, N_GEN, r1_buffer);
-            oneapi::mkl::rng::generate(distr, engine2, N_GEN, r2_buffer);
-            oneapi::mkl::rng::generate(distr, engine3, N_GEN, r3_buffer);
-            oneapi::mkl::rng::generate(distr, engine4, N_GEN, r4_buffer);
-            QUEUE_WAIT(queue);
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "SYCL exception during generation" << std::endl << e.what() << std::endl;
-            print_error_code(e);
-            status = test_failed;
-            return;
-        }
-
-        // validation
-        status = (check_equal_vector(r1, r2) && check_equal_vector(r1, r3) &&
-                  check_equal_vector(r1, r4));
-    }
-
-    int status = test_passed;
-};
-
-template <typename Engine>
-class engines_copy_test {
-public:
-    template <typename Queue>
-    void operator()(Queue queue) {
-        // Prepare arrays for random numbers
-        std::vector<std::uint32_t> r1(N_GEN);
-        std::vector<std::uint32_t> r2(N_GEN);
-        std::vector<std::uint32_t> r3(N_GEN);
-
-        try {
-            // Initialize rng objects
-            Engine engine1(queue, SEED);
-            Engine engine2(engine1);
-
-            oneapi::mkl::rng::bits<std::uint32_t> distr;
-            {
-                sycl::buffer<std::uint32_t, 1> r1_buffer(r1.data(), r1.size());
-                sycl::buffer<std::uint32_t, 1> r2_buffer(r2.data(), r2.size());
-
-                oneapi::mkl::rng::generate(distr, engine1, N_GEN, r1_buffer);
-                oneapi::mkl::rng::generate(distr, engine2, N_GEN, r2_buffer);
-            }
-
-            Engine engine3 = engine1;
-            Engine engine4 = std::move(engine2);
-            {
-                sycl::buffer<std::uint32_t, 1> r1_buffer(r1.data(), r1.size());
-                sycl::buffer<std::uint32_t, 1> r2_buffer(r2.data(), r2.size());
-                sycl::buffer<std::uint32_t, 1> r3_buffer(r3.data(), r3.size());
-
-                oneapi::mkl::rng::generate(distr, engine1, N_GEN, r1_buffer);
-                oneapi::mkl::rng::generate(distr, engine3, N_GEN, r2_buffer);
-                oneapi::mkl::rng::generate(distr, engine4, N_GEN, r3_buffer);
-            }
-            QUEUE_WAIT(queue);
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "SYCL exception during generation" << std::endl << e.what() << std::endl;
-            print_error_code(e);
-            status = test_failed;
-            return;
-        }
-
-        // Validation
-        status = (check_equal_vector(r1, r2) && check_equal_vector(r1, r3));
-    }
-
-    int status = test_passed;
-};
-
-#endif // _RNG_ENGINES_API_TESTS_HPP__
diff --git a/tests/unit_tests/rng/include/rng_test_common.hpp b/tests/unit_tests/rng/include/rng_test_common.hpp
deleted file mode 100644
index d01b04cce..000000000
--- a/tests/unit_tests/rng/include/rng_test_common.hpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_TEST_COMMON_HPP__
-#define _RNG_TEST_COMMON_HPP__
-
-#include <iostream>
-#include <limits>
-
-#include "test_helper.hpp"
-
-#define SEED  777
-#define N_GEN 1000
-
-// Defines for skip_ahead and leapfrog tests
-#define N_ENGINES     5
-#define N_PORTION     100
-#define N_GEN_SERVICE (N_ENGINES * N_PORTION)
-
-// defines for skip_ahead_ex tests
-#define N_SKIP     ((std::uint64_t)pow(2, 62))
-#define SKIP_TIMES ((std::int32_t)pow(2, 14))
-#define NUM_TO_SKIP \
-    { 0, (std::uint64_t)pow(2, 12) }
-
-// Correctness checking.
-static inline bool check_equal(float x, float x_ref) {
-    float bound = std::numeric_limits<float>::epsilon();
-    float aerr = std::abs(x - x_ref);
-    return (aerr <= bound);
-}
-
-static inline bool check_equal(double x, double x_ref) {
-    double bound = std::numeric_limits<double>::epsilon();
-    double aerr = std::abs(x - x_ref);
-    return (aerr <= bound);
-}
-
-static inline bool check_equal(std::uint32_t x, std::uint32_t x_ref) {
-    return x == x_ref;
-}
-
-static inline bool check_equal(std::uint64_t x, std::uint64_t x_ref) {
-    return x == x_ref;
-}
-
-template <typename Fp, typename AllocType>
-static inline bool check_equal_vector(std::vector<Fp, AllocType>& r1,
-                                      std::vector<Fp, AllocType>& r2) {
-    bool good = true;
-    for (int i = 0; i < r1.size(); i++) {
-        if (!check_equal(r1[i], r2[i])) {
-            good = false;
-            break;
-        }
-    }
-    return good;
-}
-
-template <typename Test>
-class rng_test {
-public:
-    // method to call any tests, switch between rt and ct
-    template <typename... Args>
-    int operator()(sycl::device* dev, Args... args) {
-        auto exception_handler = [](sycl::exception_list exceptions) {
-            for (std::exception_ptr const& e : exceptions) {
-                try {
-                    std::rethrow_exception(e);
-                }
-                catch (sycl::exception const& e) {
-                    std::cout << "Caught asynchronous SYCL exception during ASUM:\n"
-                              << e.what() << std::endl;
-                    print_error_code(e);
-                }
-            }
-        };
-
-#ifdef ENABLE_CURAND_BACKEND // w/a for cuda backend hangs when there are several queues with different contexts
-        static sycl::device* previous_device = nullptr;
-        static sycl::context* context = nullptr;
-
-        if ((previous_device != dev)) {
-            previous_device = dev;
-            if (context != nullptr) {
-                delete context;
-            }
-            context = new sycl::context(*dev);
-        }
-
-        sycl::queue queue(*context, *dev, exception_handler);
-#else
-        sycl::queue queue(*dev, exception_handler);
-#endif
-
-#ifdef CALL_RT_API
-        test_(queue, args...);
-#else
-        TEST_RUN_RNG_CT_SELECT(queue, test_, args...);
-#endif
-
-        return test_.status;
-    }
-
-protected:
-    Test test_;
-};
-
-#ifdef CALL_RT_API
-#define QUEUE_WAIT(q) q.wait()
-#else
-#define QUEUE_WAIT(q) q.get_queue().wait()
-#endif
-
-#endif // _RNG_TEST_COMMON_HPP__
diff --git a/tests/unit_tests/rng/include/skip_ahead_test.hpp b/tests/unit_tests/rng/include/skip_ahead_test.hpp
deleted file mode 100644
index efec71dde..000000000
--- a/tests/unit_tests/rng/include/skip_ahead_test.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_TEST_SKIP_AHEAD_TEST_HPP__
-#define _RNG_TEST_SKIP_AHEAD_TEST_HPP__
-
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-
-#include "rng_test_common.hpp"
-
-template <typename Engine>
-class skip_ahead_test {
-public:
-    template <typename Queue>
-    void operator()(Queue queue) {
-        // Prepare arrays for random numbers
-        std::vector<std::uint32_t> r1(N_GEN_SERVICE);
-        std::vector<std::uint32_t> r2(N_GEN_SERVICE);
-
-        try {
-            // Initialize rng objects
-            Engine engine(queue);
-            std::vector<Engine*> engines;
-
-            oneapi::mkl::rng::bits<std::uint32_t> distr;
-
-            // Perform skip
-            for (int i = 0; i < N_ENGINES; i++) {
-                engines.push_back(new Engine(queue));
-                oneapi::mkl::rng::skip_ahead(*(engines[i]), i * N_PORTION);
-            }
-
-            sycl::buffer<std::uint32_t, 1> r_buffer(r1.data(), r1.size());
-            std::vector<sycl::buffer<std::uint32_t, 1>> r_buffers;
-            for (int i = 0; i < N_ENGINES; i++) {
-                r_buffers.push_back(
-                    sycl::buffer<std::uint32_t, 1>(r2.data() + i * N_PORTION, N_PORTION));
-            }
-
-            oneapi::mkl::rng::generate(distr, engine, N_GEN_SERVICE, r_buffer);
-            for (int i = 0; i < N_ENGINES; i++) {
-                oneapi::mkl::rng::generate(distr, *(engines[i]), N_PORTION, r_buffers[i]);
-            }
-            QUEUE_WAIT(queue);
-
-            // Clear memory
-            for (int i = 0; i < N_ENGINES; i++) {
-                delete engines[i];
-            }
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "SYCL exception during generation" << std::endl << e.what() << std::endl;
-            print_error_code(e);
-            status = test_failed;
-            return;
-        }
-
-        // Validation
-        status = check_equal_vector(r1, r2);
-    }
-
-    int status = test_passed;
-};
-
-template <typename Engine>
-class skip_ahead_ex_test {
-public:
-    template <typename Queue>
-    void operator()(Queue queue) {
-        // Prepare arrays for random numbers
-        std::vector<std::uint32_t> r1(N_GEN);
-        std::vector<std::uint32_t> r2(N_GEN);
-
-        try {
-            // Initialize rng objects
-            Engine engine1(queue);
-            Engine engine2(queue);
-
-            oneapi::mkl::rng::bits<std::uint32_t> distr;
-
-            // Perform skip
-            for (int j = 0; j < SKIP_TIMES; j++) {
-                oneapi::mkl::rng::skip_ahead(engine1, N_SKIP);
-            }
-            oneapi::mkl::rng::skip_ahead(engine2, NUM_TO_SKIP);
-
-            sycl::buffer<std::uint32_t, 1> r1_buffer(r1.data(), r1.size());
-            sycl::buffer<std::uint32_t, 1> r2_buffer(r2.data(), r2.size());
-
-            oneapi::mkl::rng::generate(distr, engine1, N_GEN, r1_buffer);
-            oneapi::mkl::rng::generate(distr, engine2, N_GEN, r2_buffer);
-            QUEUE_WAIT(queue);
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "SYCL exception during generation" << std::endl << e.what() << std::endl;
-            print_error_code(e);
-            status = test_failed;
-            return;
-        }
-
-        // validation
-        status = check_equal_vector(r1, r2);
-    }
-
-    int status = test_passed;
-};
-
-#endif // _RNG_TEST_SKIP_AHEAD_TEST_HPP__
diff --git a/tests/unit_tests/rng/include/statistics_check.hpp b/tests/unit_tests/rng/include/statistics_check.hpp
deleted file mode 100644
index 8a1d045f0..000000000
--- a/tests/unit_tests/rng/include/statistics_check.hpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_TEST_STATISTICS_CHECK_HPP__
-#define _RNG_TEST_STATISTICS_CHECK_HPP__
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-
-#include "rng_test_common.hpp"
-
-template <typename Type, typename AllocType>
-bool compare_moments(std::vector<Type, AllocType>& r, double tM, double tD, double tQ) {
-    double tD2;
-    double sM, sD;
-    double sum, sum2;
-    double n, s;
-    double DeltaM, DeltaD;
-
-    // Sample moments
-    sum = 0.0;
-    sum2 = 0.0;
-    for (int i = 0; i < r.size(); i++) {
-        sum += (double)r[i];
-        sum2 += (double)r[i] * (double)r[i];
-    }
-    sM = sum / ((double)r.size());
-    sD = sum2 / (double)r.size() - (sM * sM);
-
-    // Comparison of theoretical and sample moments
-    n = (double)r.size();
-    tD2 = tD * tD;
-    s = ((tQ - tD2) / n) - (2 * (tQ - 2 * tD2) / (n * n)) + ((tQ - 3 * tD2) / (n * n * n));
-
-    DeltaM = (tM - sM) / sqrt(tD / n);
-    DeltaD = (tD - sD) / sqrt(s);
-    if (fabs(DeltaM) > 3.0 || fabs(DeltaD) > 10.0) {
-        std::cout << "Error: sample moments (mean=" << sM << ", variance=" << sD
-                  << ") disagree with theory (mean=" << tM << ", variance=" << tD << ")"
-                  << " N_GEN = " << r.size() << std::endl;
-        return false;
-    }
-    return true;
-}
-
-template <typename Distribution>
-struct statistics {};
-
-template <typename Type, typename Method>
-struct statistics<oneapi::mkl::rng::uniform<Type, Method>> {
-    template <typename AllocType>
-    bool check(std::vector<Type, AllocType>& r,
-               const oneapi::mkl::rng::uniform<Type, Method>& distr) {
-        double tM, tD, tQ;
-        Type a = distr.a();
-        Type b = distr.b();
-
-        // Theoretical moments
-        tM = (b + a) / 2.0;
-        tD = ((b - a) * (b - a)) / 12.0;
-        tQ = ((b - a) * (b - a) * (b - a) * (b - a)) / 80.0;
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Method>
-struct statistics<oneapi::mkl::rng::uniform<std::int32_t, Method>> {
-    template <typename AllocType>
-    bool check(std::vector<int32_t, AllocType>& r,
-               const oneapi::mkl::rng::uniform<int32_t, Method>& distr) {
-        double tM, tD, tQ;
-        int32_t a = distr.a();
-        int32_t b = distr.b();
-
-        // Theoretical moments
-        tM = (a + b - 1.0) / 2.0;
-        tD = ((b - a) * (b - a) - 1.0) / 12.0;
-        tQ = (((b - a) * (b - a)) * ((1.0 / 80.0) * (b - a) * (b - a) - (1.0 / 24.0))) +
-             (7.0 / 240.0);
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Type, typename Method>
-struct statistics<oneapi::mkl::rng::gaussian<Type, Method>> {
-    template <typename AllocType>
-    bool check(std::vector<Type, AllocType>& r,
-               const oneapi::mkl::rng::gaussian<Type, Method>& distr) {
-        double tM, tD, tQ;
-        Type a = distr.mean();
-        Type sigma = distr.stddev();
-
-        // Theoretical moments
-        tM = a;
-        tD = sigma * sigma;
-        tQ = 720.0 * sigma * sigma * sigma * sigma;
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Type, typename Method>
-struct statistics<oneapi::mkl::rng::lognormal<Type, Method>> {
-    template <typename AllocType>
-    bool check(std::vector<Type, AllocType>& r,
-               const oneapi::mkl::rng::lognormal<Type, Method>& distr) {
-        double tM, tD, tQ;
-        Type a = distr.m();
-        Type b = distr.displ();
-        Type sigma = distr.s();
-        Type beta = distr.scale();
-
-        // Theoretical moments
-        tM = b + beta * exp(a + sigma * sigma * 0.5);
-        tD = beta * beta * exp(2.0 * a + sigma * sigma) * (exp(sigma * sigma) - 1.0);
-        tQ = beta * beta * beta * beta * exp(4.0 * a + 2.0 * sigma * sigma) *
-             (exp(6.0 * sigma * sigma) - 4.0 * exp(3.0 * sigma * sigma) + 6.0 * exp(sigma * sigma) -
-              3.0);
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Type, typename Method>
-struct statistics<oneapi::mkl::rng::bernoulli<Type, Method>> {
-    template <typename AllocType>
-    bool check(std::vector<Type, AllocType>& r,
-               const oneapi::mkl::rng::bernoulli<Type, Method>& distr) {
-        double tM, tD, tQ;
-        double p = distr.p();
-
-        tM = p;
-        tD = p * (1.0 - p);
-        tQ = p * (1.0 - 4.0 * p + 6.0 * p * p - 3.0 * p * p * p);
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-template <typename Type, typename Method>
-struct statistics<oneapi::mkl::rng::poisson<Type, Method>> {
-    template <typename AllocType>
-    bool check(std::vector<Type, AllocType>& r,
-               const oneapi::mkl::rng::poisson<Type, Method>& distr) {
-        double tM, tD, tQ;
-        double lambda = distr.lambda();
-
-        tM = lambda;
-        tD = lambda;
-        tQ = 4 * lambda * lambda + lambda;
-
-        return compare_moments(r, tM, tD, tQ);
-    }
-};
-
-#endif // _RNG_TEST_STATISTICS_CHECK_HPP__
diff --git a/tests/unit_tests/rng/include/statistics_check_test.hpp b/tests/unit_tests/rng/include/statistics_check_test.hpp
deleted file mode 100644
index 14a637d7a..000000000
--- a/tests/unit_tests/rng/include/statistics_check_test.hpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _RNG_TEST_STATISTICS_CHECK_TEST_HPP__
-#define _RNG_TEST_STATISTICS_CHECK_TEST_HPP__
-
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-
-#include "statistics_check.hpp"
-
-#define UNIFORM_ARGS_FLOAT  -1.0f, 5.0f
-#define UNIFORM_ARGS_DOUBLE -1.0, 5.0
-#define UNIFORM_ARGS_INT    -1, 5
-
-#define GAUSSIAN_ARGS_FLOAT  -1.0f, 5.0f
-#define GAUSSIAN_ARGS_DOUBLE -1.0, 5.0
-
-#define LOGNORMAL_ARGS_FLOAT  -1.0f, 5.0f, 1.0f, 2.0f
-#define LOGNORMAL_ARGS_DOUBLE -1.0, 5.0, 1.0, 2.0
-
-#define BERNOULLI_ARGS 0.5f
-
-#define POISSON_ARGS 0.5
-
-template <typename Distr, typename Engine>
-class statistics_test {
-public:
-    template <typename Queue, typename... Args>
-    void operator()(Queue queue, std::int64_t n_gen, Args... args) {
-        using Type = typename Distr::result_type;
-
-        std::vector<Type> r(n_gen);
-
-        try {
-            sycl::buffer<Type, 1> r_buffer(r.data(), r.size());
-
-            Engine engine(queue, SEED);
-            Distr distr(args...);
-            oneapi::mkl::rng::generate(distr, engine, n_gen, r_buffer);
-            QUEUE_WAIT(queue);
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "Caught synchronous SYCL exception during generation:\n"
-                      << e.what() << std::endl;
-            print_error_code(e);
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (const std::runtime_error& error) {
-            std::cout << "Error raised during execution:\n" << error.what() << std::endl;
-        }
-
-        status = statistics<Distr>{}.check(r, Distr{ args... });
-    }
-
-    int status = test_passed;
-};
-
-template <typename Distr, typename Engine>
-class statistics_usm_test {
-public:
-    template <typename Queue, typename... Args>
-    void operator()(Queue queue, std::int64_t n_gen, Args... args) {
-        using Type = typename Distr::result_type;
-
-#ifdef CALL_RT_API
-        auto ua = sycl::usm_allocator<Type, sycl::usm::alloc::shared, 64>(queue);
-#else
-        auto ua = sycl::usm_allocator<Type, sycl::usm::alloc::shared, 64>(queue.get_queue());
-#endif
-        std::vector<Type, decltype(ua)> r(n_gen, ua);
-
-        try {
-            Engine engine(queue, SEED);
-            Distr distr(args...);
-            auto event = oneapi::mkl::rng::generate(distr, engine, n_gen, r.data());
-            event.wait_and_throw();
-        }
-        catch (sycl::exception const& e) {
-            std::cout << "Caught synchronous SYCL exception during generation:\n"
-                      << e.what() << std::endl;
-            print_error_code(e);
-        }
-        catch (const oneapi::mkl::unimplemented& e) {
-            status = test_skipped;
-            return;
-        }
-        catch (const std::runtime_error& error) {
-            std::cout << "Error raised during execution:\n" << error.what() << std::endl;
-        }
-
-        status = statistics<Distr>{}.check(r, Distr{ args... });
-    }
-
-    int status = test_passed;
-};
-
-#endif // _RNG_TEST_STATISTICS_CHECK_TEST_HPP__
diff --git a/tests/unit_tests/rng/service/CMakeLists.txt b/tests/unit_tests/rng/service/CMakeLists.txt
deleted file mode 100644
index 8436ce9eb..000000000
--- a/tests/unit_tests/rng/service/CMakeLists.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(SERVICE_TESTS_SOURCES "skip_ahead.cpp" "engines_api_test.cpp")
-
-if(BUILD_SHARED_LIBS)
-  add_library(rng_service_rt OBJECT ${SERVICE_TESTS_SOURCES})
-  target_compile_options(rng_service_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(rng_service_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET rng_service_rt SOURCES ${SERVICE_TESTS_SOURCES})
-  else()
-    target_link_libraries(rng_service_rt PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-endif()
-
-add_library(rng_service_ct OBJECT ${SERVICE_TESTS_SOURCES})
-target_compile_options(rng_service_ct PRIVATE -DNOMINMAX)
-target_include_directories(rng_service_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET rng_service_ct SOURCES ${SERVICE_TESTS_SOURCES})
-else()
-  target_link_libraries(rng_service_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
diff --git a/tests/unit_tests/rng/service/engines_api_test.cpp b/tests/unit_tests/rng/service/engines_api_test.cpp
deleted file mode 100644
index 500231703..000000000
--- a/tests/unit_tests/rng/service/engines_api_test.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "engines_api_tests.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class Philox4x32x10ConstructorsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Philox4x32x10CopyTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10ConstructorsTests, BinaryPrecision) {
-    rng_test<engines_constructors_test<oneapi::mkl::rng::philox4x32x10>> test;
-    std::initializer_list<std::uint64_t> seed_ex = { SEED, 0, 0 };
-    EXPECT_TRUEORSKIP((test(GetParam(), seed_ex)));
-}
-
-TEST_P(Philox4x32x10CopyTests, BinaryPrecision) {
-    rng_test<engines_copy_test<oneapi::mkl::rng::philox4x32x10>> test;
-    EXPECT_TRUEORSKIP((test(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10ConstructorsTestsuite, Philox4x32x10ConstructorsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10CopyTestsuite, Philox4x32x10CopyTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Mrg32k3aConstructorsTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Mrg32k3aCopyTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mrg32k3aConstructorsTests, BinaryPrecision) {
-    rng_test<engines_constructors_test<oneapi::mkl::rng::mrg32k3a>> test;
-    std::initializer_list<std::uint32_t> seed_ex = { SEED, 1, 1, 1, 1, 1 };
-    EXPECT_TRUEORSKIP((test(GetParam(), seed_ex)));
-}
-
-TEST_P(Mrg32k3aCopyTests, BinaryPrecision) {
-    rng_test<engines_copy_test<oneapi::mkl::rng::mrg32k3a>> test;
-    EXPECT_TRUEORSKIP((test(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aConstructorsTestsuite, Mrg32k3aConstructorsTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aCopyTestsuite, Mrg32k3aCopyTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/service/skip_ahead.cpp b/tests/unit_tests/rng/service/skip_ahead.cpp
deleted file mode 100644
index 445b76abe..000000000
--- a/tests/unit_tests/rng/service/skip_ahead.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "skip_ahead_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class Philox4x32x10SkipAheadTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Philox4x32x10SkipAheadExTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Philox4x32x10SkipAheadTests, BinaryPrecision) {
-    rng_test<skip_ahead_test<oneapi::mkl::rng::philox4x32x10>> test;
-    EXPECT_TRUEORSKIP((test(GetParam())));
-}
-
-TEST_P(Philox4x32x10SkipAheadExTests, BinaryPrecision) {
-    rng_test<skip_ahead_ex_test<oneapi::mkl::rng::philox4x32x10>> test;
-    EXPECT_TRUEORSKIP((test(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10SkipAheadTestSuite, Philox4x32x10SkipAheadTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Philox4x32x10SkipAheadExTestSuite, Philox4x32x10SkipAheadExTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-class Mrg32k3aSkipAheadTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class Mrg32k3aSkipAheadExTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(Mrg32k3aSkipAheadTests, BinaryPrecision) {
-    rng_test<skip_ahead_test<oneapi::mkl::rng::mrg32k3a>> test;
-    EXPECT_TRUEORSKIP((test(GetParam())));
-}
-
-TEST_P(Mrg32k3aSkipAheadExTests, BinaryPrecision) {
-    rng_test<skip_ahead_ex_test<oneapi::mkl::rng::mrg32k3a>> test;
-    EXPECT_TRUEORSKIP((test(GetParam())));
-}
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aSkipAheadTestSuite, Mrg32k3aSkipAheadTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(Mrg32k3aSkipAheadExTestSuite, Mrg32k3aSkipAheadExTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/CMakeLists.txt b/tests/unit_tests/rng/statistics_check/CMakeLists.txt
deleted file mode 100644
index 244d33976..000000000
--- a/tests/unit_tests/rng/statistics_check/CMakeLists.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-#===============================================================================
-# Copyright 2020-2021 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-# Build object from all test sources
-set(STATS_CHECK_SOURCES "uniform.cpp" "uniform_usm.cpp" "gaussian_usm.cpp" "gaussian.cpp" "lognormal_usm.cpp" "lognormal.cpp" "bernoulli_usm.cpp" "bernoulli.cpp" "poisson_usm.cpp" "poisson.cpp")
-
-if(BUILD_SHARED_LIBS)
-  add_library(rng_statistics_rt OBJECT ${STATS_CHECK_SOURCES})
-  target_compile_options(rng_statistics_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-  target_include_directories(rng_statistics_rt
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-      PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-      PUBLIC ${PROJECT_SOURCE_DIR}/include
-      PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-      PUBLIC ${CMAKE_BINARY_DIR}/bin
-  )
-  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET rng_statistics_rt SOURCES ${STATS_CHECK_SOURCES})
-  else()
-    target_link_libraries(rng_statistics_rt PUBLIC ONEMKL::SYCL::SYCL)
-  endif()
-endif()
-
-add_library(rng_statistics_ct OBJECT ${STATS_CHECK_SOURCES})
-target_compile_options(rng_statistics_ct PRIVATE -DNOMINMAX)
-target_include_directories(rng_statistics_ct
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-    PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-    PUBLIC ${PROJECT_SOURCE_DIR}/include
-    PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-    PUBLIC ${CMAKE_BINARY_DIR}/bin
-)
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-  add_sycl_to_target(TARGET rng_statistics_ct SOURCES ${STATS_CHECK_SOURCES})
-else()
-  target_link_libraries(rng_statistics_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif()
diff --git a/tests/unit_tests/rng/statistics_check/bernoulli.cpp b/tests/unit_tests/rng/statistics_check/bernoulli.cpp
deleted file mode 100755
index b95d98118..000000000
--- a/tests/unit_tests/rng/statistics_check/bernoulli.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class BernoulliIcdfTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(BernoulliIcdfTests, IntegerPrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::bernoulli<std::int32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, BERNOULLI_ARGS)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::bernoulli<std::int32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, BERNOULLI_ARGS)));
-}
-
-TEST_P(BernoulliIcdfTests, UnsignedIntegerPrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::bernoulli<std::uint32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, BERNOULLI_ARGS)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::bernoulli<std::uint32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, BERNOULLI_ARGS)));
-}
-
-INSTANTIATE_TEST_SUITE_P(BernoulliIcdfTestSuite, BernoulliIcdfTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/bernoulli_usm.cpp b/tests/unit_tests/rng/statistics_check/bernoulli_usm.cpp
deleted file mode 100755
index 9c8c934dd..000000000
--- a/tests/unit_tests/rng/statistics_check/bernoulli_usm.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class BernoulliIcdfUsmTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(BernoulliIcdfUsmTests, IntegerPrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::bernoulli<std::int32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, BERNOULLI_ARGS)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::bernoulli<std::int32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, BERNOULLI_ARGS)));
-}
-
-TEST_P(BernoulliIcdfUsmTests, UnsignedIntegerPrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::bernoulli<std::uint32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, BERNOULLI_ARGS)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::bernoulli<std::int32_t, oneapi::mkl::rng::bernoulli_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, BERNOULLI_ARGS)));
-}
-
-INSTANTIATE_TEST_SUITE_P(BernoulliIcdfUsmTestSuite, BernoulliIcdfUsmTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/gaussian.cpp b/tests/unit_tests/rng/statistics_check/gaussian.cpp
deleted file mode 100644
index ed63f3221..000000000
--- a/tests/unit_tests/rng/statistics_check/gaussian.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class GaussianBoxmullerTest : public ::testing::TestWithParam<sycl::device*> {};
-
-class GaussianIcdfTest : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(GaussianIcdfTest, RealSinglePrecision) {
-    rng_test<
-        statistics_test<oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>,
-                        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-    rng_test<
-        statistics_test<oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>,
-                        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-}
-
-TEST_P(GaussianIcdfTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<
-        statistics_test<oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>,
-                        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-    rng_test<
-        statistics_test<oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>,
-                        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-}
-
-TEST_P(GaussianBoxmullerTest, RealSinglePrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-}
-
-TEST_P(GaussianBoxmullerTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_test<
-        oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GaussianIcdfTestSuite, GaussianIcdfTest, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(GaussianBoxmullerTestSuite, GaussianBoxmullerTest,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/gaussian_usm.cpp b/tests/unit_tests/rng/statistics_check/gaussian_usm.cpp
deleted file mode 100644
index a1d4d1b06..000000000
--- a/tests/unit_tests/rng/statistics_check/gaussian_usm.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class GaussianBoxmullerUsmTest : public ::testing::TestWithParam<sycl::device*> {};
-
-class GaussianIcdfUsmTest : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(GaussianIcdfUsmTest, RealSinglePrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-}
-
-TEST_P(GaussianIcdfUsmTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-}
-
-TEST_P(GaussianBoxmullerUsmTest, RealSinglePrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<float, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_FLOAT)));
-}
-
-TEST_P(GaussianBoxmullerUsmTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::gaussian<double, oneapi::mkl::rng::gaussian_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, GAUSSIAN_ARGS_DOUBLE)));
-}
-
-INSTANTIATE_TEST_SUITE_P(GaussianIcdfUsmTestSuite, GaussianIcdfUsmTest,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(GaussianBoxmullerUsmTestSuite, GaussianBoxmullerUsmTest,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/lognormal.cpp b/tests/unit_tests/rng/statistics_check/lognormal.cpp
deleted file mode 100755
index 5486202bb..000000000
--- a/tests/unit_tests/rng/statistics_check/lognormal.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class LognormalBoxmullerTest : public ::testing::TestWithParam<sycl::device*> {};
-
-class LognormalIcdfTest : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(LognormalIcdfTest, RealSinglePrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-}
-
-TEST_P(LognormalIcdfTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-}
-
-TEST_P(LognormalBoxmullerTest, RealSinglePrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-}
-
-TEST_P(LognormalBoxmullerTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-}
-
-INSTANTIATE_TEST_SUITE_P(LognormalIcdfTestSuite, LognormalIcdfTest, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(LognormalBoxmullerTestSuite, LognormalBoxmullerTest,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/lognormal_usm.cpp b/tests/unit_tests/rng/statistics_check/lognormal_usm.cpp
deleted file mode 100755
index d59d9458a..000000000
--- a/tests/unit_tests/rng/statistics_check/lognormal_usm.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class LognormalBoxmullerUsmTest : public ::testing::TestWithParam<sycl::device*> {};
-
-class LognormalIcdfUsmTest : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(LognormalIcdfUsmTest, RealSinglePrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-}
-
-TEST_P(LognormalIcdfUsmTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::icdf>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-}
-
-TEST_P(LognormalBoxmullerUsmTest, RealSinglePrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<float, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_FLOAT)));
-}
-
-TEST_P(LognormalBoxmullerUsmTest, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::lognormal<double, oneapi::mkl::rng::lognormal_method::box_muller2>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, LOGNORMAL_ARGS_DOUBLE)));
-}
-
-INSTANTIATE_TEST_SUITE_P(LognormalIcdfUsmTestSuite, LognormalIcdfUsmTest,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(LognormalBoxmullerUsmTestSuite, LognormalBoxmullerUsmTest,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/poisson.cpp b/tests/unit_tests/rng/statistics_check/poisson.cpp
deleted file mode 100755
index d39842e9f..000000000
--- a/tests/unit_tests/rng/statistics_check/poisson.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class PoissonIcdfTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(PoissonIcdfTests, IntegerPrecision) {
-    rng_test<
-        statistics_test<oneapi::mkl::rng::poisson<
-                            std::int32_t, oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-                        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, POISSON_ARGS)));
-    rng_test<
-        statistics_test<oneapi::mkl::rng::poisson<
-                            std::int32_t, oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-                        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, POISSON_ARGS)));
-}
-
-TEST_P(PoissonIcdfTests, UnsignedIntegerPrecision) {
-    rng_test<
-        statistics_test<oneapi::mkl::rng::poisson<
-                            std::uint32_t, oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-                        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, POISSON_ARGS)));
-    rng_test<
-        statistics_test<oneapi::mkl::rng::poisson<
-                            std::int32_t, oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-                        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, POISSON_ARGS)));
-}
-
-INSTANTIATE_TEST_SUITE_P(PoissonIcdfTestSuite, PoissonIcdfTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/poisson_usm.cpp b/tests/unit_tests/rng/statistics_check/poisson_usm.cpp
deleted file mode 100755
index 052eff5a3..000000000
--- a/tests/unit_tests/rng/statistics_check/poisson_usm.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class PoissonIcdfUsmTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(PoissonIcdfUsmTests, IntegerPrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::poisson<std::int32_t,
-                                  oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, POISSON_ARGS)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::poisson<std::int32_t,
-                                  oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, POISSON_ARGS)));
-}
-
-TEST_P(PoissonIcdfUsmTests, UnsignedIntegerPrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::poisson<std::uint32_t,
-                                  oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, POISSON_ARGS)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::poisson<std::int32_t,
-                                  oneapi::mkl::rng::poisson_method::gaussian_icdf_based>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, POISSON_ARGS)));
-}
-
-INSTANTIATE_TEST_SUITE_P(PoissonIcdfUsmTestSuite, PoissonIcdfUsmTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/uniform.cpp b/tests/unit_tests/rng/statistics_check/uniform.cpp
deleted file mode 100644
index eb11714e1..000000000
--- a/tests/unit_tests/rng/statistics_check/uniform.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class UniformStdTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class UniformAccurateTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(UniformStdTests, RealSinglePrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-}
-
-TEST_P(UniformStdTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-}
-
-TEST_P(UniformStdTests, IntegerPrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_INT)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_INT)));
-}
-
-TEST_P(UniformAccurateTests, RealSinglePrecision) {
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-}
-
-TEST_P(UniformAccurateTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-    rng_test<statistics_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-}
-
-INSTANTIATE_TEST_SUITE_P(UniformStdTestSuite, UniformStdTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(UniformAccurateTestSuite, UniformAccurateTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/rng/statistics_check/uniform_usm.cpp b/tests/unit_tests/rng/statistics_check/uniform_usm.cpp
deleted file mode 100644
index df4f7a764..000000000
--- a/tests/unit_tests/rng/statistics_check/uniform_usm.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include "statistics_check_test.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device*> devices;
-
-namespace {
-
-class UniformStdUsmTests : public ::testing::TestWithParam<sycl::device*> {};
-
-class UniformAccurateUsmTests : public ::testing::TestWithParam<sycl::device*> {};
-
-TEST_P(UniformStdUsmTests, RealSinglePrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-}
-
-TEST_P(UniformStdUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-}
-
-TEST_P(UniformStdUsmTests, IntegerPrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_INT)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<std::int32_t, oneapi::mkl::rng::uniform_method::standard>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_INT)));
-}
-
-TEST_P(UniformAccurateUsmTests, RealSinglePrecision) {
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<float, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_FLOAT)));
-}
-
-TEST_P(UniformAccurateUsmTests, RealDoublePrecision) {
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::philox4x32x10>>
-        test1;
-    EXPECT_TRUEORSKIP((test1(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-    rng_test<statistics_usm_test<
-        oneapi::mkl::rng::uniform<double, oneapi::mkl::rng::uniform_method::accurate>,
-        oneapi::mkl::rng::mrg32k3a>>
-        test2;
-    EXPECT_TRUEORSKIP((test2(GetParam(), N_GEN, UNIFORM_ARGS_DOUBLE)));
-}
-
-INSTANTIATE_TEST_SUITE_P(UniformStdUsmTestSuite, UniformStdUsmTests, ::testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-INSTANTIATE_TEST_SUITE_P(UniformAccurateUsmTestSuite, UniformAccurateUsmTests,
-                         ::testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/sparse_blas/CMakeLists.txt b/tests/unit_tests/sparse_blas/CMakeLists.txt
deleted file mode 100644
index 2c46cd38c..000000000
--- a/tests/unit_tests/sparse_blas/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-add_subdirectory(source)
diff --git a/tests/unit_tests/sparse_blas/include/sparse_reference.hpp b/tests/unit_tests/sparse_blas/include/sparse_reference.hpp
deleted file mode 100644
index ffb876f11..000000000
--- a/tests/unit_tests/sparse_blas/include/sparse_reference.hpp
+++ /dev/null
@@ -1,297 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _SPARSE_REFERENCE_HPP__
-#define _SPARSE_REFERENCE_HPP__
-
-#include <stdexcept>
-#include <string>
-#include <tuple>
-
-#include "oneapi/mkl.hpp"
-
-#include "test_common.hpp"
-
-template <typename T>
-inline T conjugate(T) {
-    static_assert(false, "Unsupported type");
-}
-template <>
-inline float conjugate(float t) {
-    return t;
-}
-template <>
-inline double conjugate(double t) {
-    return t;
-}
-template <>
-inline std::complex<float> conjugate(std::complex<float> t) {
-    return std::conj(t);
-}
-template <>
-inline std::complex<double> conjugate(std::complex<double> t) {
-    return std::conj(t);
-}
-
-template <typename T>
-inline T opVal(const T t, const bool isConj) {
-    return (isConj ? conjugate(t) : t);
-};
-
-template <typename fpType, typename intType, typename accIntType, typename accFpType>
-void do_csr_transpose(const oneapi::mkl::transpose opA, intType *ia_t, intType *ja_t, fpType *a_t,
-                      intType a_nrows, intType a_ncols, intType a_ind, accIntType &ia,
-                      accIntType &ja, accFpType &a, const bool structOnlyFlag = false) {
-    const bool isConj = (opA == oneapi::mkl::transpose::conjtrans);
-
-    // initialize ia_t to zero
-    for (intType i = 0; i < a_ncols + 1; ++i) {
-        ia_t[i] = 0;
-    }
-
-    // fill ia_t with counts of columns
-    for (intType i = 0; i < a_nrows; ++i) {
-        const intType st = ia[i] - a_ind;
-        const intType en = ia[i + 1] - a_ind;
-        for (intType j = st; j < en; ++j) {
-            const intType col = ja[j] - a_ind;
-            ia_t[col + 1]++;
-        }
-    }
-    // prefix sum to get official ia_t counts
-    ia_t[0] = a_ind;
-    for (intType i = 0; i < a_ncols; ++i) {
-        ia_t[i + 1] += ia_t[i];
-    }
-
-    // second pass through data to fill transpose structure
-    for (intType i = 0; i < a_nrows; ++i) {
-        const intType st = ia[i] - a_ind;
-        const intType en = ia[i + 1] - a_ind;
-        for (intType j = st; j < en; ++j) {
-            const intType col = ja[j] - a_ind;
-            const intType j_in_a_t = ia_t[col] - a_ind;
-            ia_t[col]++;
-            ja_t[j_in_a_t] = i + a_ind;
-            if (!structOnlyFlag) {
-                const fpType val = a[j];
-                a_t[j_in_a_t] = opVal(val, isConj);
-            }
-        }
-    }
-
-    // adjust ia_t back to original state after filling structure
-    for (intType i = a_ncols; i > 0; --i) {
-        ia_t[i] = ia_t[i - 1];
-    }
-    ia_t[0] = a_ind;
-}
-
-// Transpose the given sparse matrix if needed
-template <typename fpType, typename intType>
-auto sparse_transpose_if_needed(const intType *ia, const intType *ja, const fpType *a,
-                                intType a_nrows, intType a_ncols, std::size_t nnz, intType a_ind,
-                                oneapi::mkl::transpose transpose_val) {
-    std::vector<intType> iopa;
-    std::vector<intType> jopa;
-    std::vector<fpType> opa;
-    if (transpose_val == oneapi::mkl::transpose::nontrans) {
-        iopa.assign(ia, ia + a_nrows + 1);
-        jopa.assign(ja, ja + nnz);
-        opa.assign(a, a + nnz);
-    }
-    else if (transpose_val == oneapi::mkl::transpose::trans ||
-             transpose_val == oneapi::mkl::transpose::conjtrans) {
-        iopa.resize(static_cast<std::size_t>(a_ncols + 1));
-        jopa.resize(nnz);
-        opa.resize(nnz);
-        do_csr_transpose(transpose_val, iopa.data(), jopa.data(), opa.data(), a_nrows, a_ncols,
-                         a_ind, ia, ja, a);
-    }
-    else {
-        throw std::runtime_error("unsupported transpose_val=" +
-                                 std::to_string(static_cast<char>(transpose_val)));
-    }
-    return std::make_tuple(iopa, jopa, opa);
-}
-
-template <typename fpType>
-auto dense_transpose_if_needed(const fpType *x, std::size_t outer_size, std::size_t inner_size,
-                               std::size_t ld, oneapi::mkl::transpose transpose_val) {
-    std::vector<fpType> opx;
-    if (transpose_val == oneapi::mkl::transpose::nontrans) {
-        opx.assign(x, x + outer_size * ld);
-    }
-    else {
-        opx.resize(outer_size * ld);
-        for (std::size_t i = 0; i < outer_size; ++i) {
-            for (std::size_t j = 0; j < inner_size; ++j) {
-                opx[i + j * ld] = x[i * ld + j];
-            }
-        }
-    }
-    return opx;
-}
-
-/// Return the dense matrix A in row major layout.
-/// Diagonal values are overwritten with 1s if diag_val is unit.
-template <typename fpType, typename intType>
-std::vector<fpType> sparse_to_dense(const intType *ia, const intType *ja, const fpType *a,
-                                    std::size_t a_nrows, std::size_t a_ncols, intType a_ind,
-                                    oneapi::mkl::transpose transpose_val,
-                                    oneapi::mkl::diag diag_val) {
-    std::vector<fpType> dense_a(a_nrows * a_ncols, fpType(0));
-    for (std::size_t row = 0; row < a_nrows; row++) {
-        for (intType i = ia[row] - a_ind; i < ia[row + 1] - a_ind; i++) {
-            std::size_t iu = static_cast<std::size_t>(i);
-            std::size_t col = static_cast<std::size_t>(ja[iu] - a_ind);
-            std::size_t dense_a_idx = transpose_val != oneapi::mkl::transpose::nontrans
-                                          ? col * a_nrows + row
-                                          : row * a_ncols + col;
-            fpType val = a[iu];
-            if constexpr (complex_info<fpType>::is_complex) {
-                if (transpose_val == oneapi::mkl::transpose::conjtrans) {
-                    val = std::conj(val);
-                }
-            }
-            dense_a[dense_a_idx] = val;
-        }
-    }
-    if (diag_val == oneapi::mkl::diag::unit) {
-        for (std::size_t i = 0; i < a_nrows; ++i) {
-            dense_a[i * a_ncols + i] = set_fp_value<fpType>()(1.f, 0.f);
-        }
-    }
-    return dense_a;
-}
-
-template <typename fpType, typename intType>
-void prepare_reference_gemv_data(const intType *ia, const intType *ja, const fpType *a,
-                                 intType a_nrows, intType a_ncols, intType a_nnz, intType a_ind,
-                                 oneapi::mkl::transpose opA, fpType alpha, fpType beta,
-                                 const fpType *x, fpType *y_ref) {
-    std::size_t opa_nrows =
-        static_cast<std::size_t>((opA == oneapi::mkl::transpose::nontrans) ? a_nrows : a_ncols);
-    const std::size_t nnz = static_cast<std::size_t>(a_nnz);
-    auto [iopa, jopa, opa] =
-        sparse_transpose_if_needed(ia, ja, a, a_nrows, a_ncols, nnz, a_ind, opA);
-
-    //
-    // do GEMV operation
-    //
-    //  y_ref <- alpha * op(A) * x + beta * y_ref
-    //
-    for (std::size_t row = 0; row < opa_nrows; row++) {
-        fpType tmp = 0;
-        for (intType i = iopa[row] - a_ind; i < iopa[row + 1] - a_ind; i++) {
-            std::size_t iu = static_cast<std::size_t>(i);
-            std::size_t x_ind = static_cast<std::size_t>(jopa[iu] - a_ind);
-            tmp += opa[iu] * x[x_ind];
-        }
-
-        y_ref[row] = alpha * tmp + beta * y_ref[row];
-    }
-}
-
-template <typename fpType, typename intType>
-void prepare_reference_gemm_data(const intType *ia, const intType *ja, const fpType *a,
-                                 intType a_nrows, intType a_ncols, intType c_ncols, intType a_nnz,
-                                 intType a_ind, oneapi::mkl::layout dense_matrix_layout,
-                                 oneapi::mkl::transpose opA, oneapi::mkl::transpose opB,
-                                 fpType alpha, fpType beta, intType ldb, intType ldc,
-                                 const fpType *b, fpType *c_ref) {
-    std::size_t opa_nrows =
-        static_cast<std::size_t>((opA == oneapi::mkl::transpose::nontrans) ? a_nrows : a_ncols);
-    std::size_t opa_ncols =
-        static_cast<std::size_t>((opA == oneapi::mkl::transpose::nontrans) ? a_ncols : a_nrows);
-    const std::size_t nnz = static_cast<std::size_t>(a_nnz);
-    const std::size_t ldb_u = static_cast<std::size_t>(ldb);
-    const std::size_t ldc_u = static_cast<std::size_t>(ldc);
-    auto [iopa, jopa, opa] =
-        sparse_transpose_if_needed(ia, ja, a, a_nrows, a_ncols, nnz, a_ind, opA);
-
-    std::size_t b_outer_size = static_cast<std::size_t>(opa_ncols);
-    std::size_t b_inner_size = static_cast<std::size_t>(c_ncols);
-    if (dense_matrix_layout == oneapi::mkl::layout::col_major) {
-        std::swap(b_outer_size, b_inner_size);
-    }
-    auto opb = dense_transpose_if_needed(b, b_outer_size, b_inner_size, ldb_u, opB);
-
-    //
-    // do GEMM operation
-    //
-    //  C <- alpha * opA(A) * opB(B) + beta * C
-    //
-    if (dense_matrix_layout == oneapi::mkl::layout::row_major) {
-        for (std::size_t row = 0; row < opa_nrows; row++) {
-            for (std::size_t col = 0; col < static_cast<std::size_t>(c_ncols); col++) {
-                fpType tmp = 0;
-                for (std::size_t i = static_cast<std::size_t>(iopa[row] - a_ind);
-                     i < static_cast<std::size_t>(iopa[row + 1] - a_ind); i++) {
-                    tmp += opa[i] * opb[static_cast<std::size_t>(jopa[i] - a_ind) * ldb_u + col];
-                }
-                fpType &c = c_ref[row * ldc_u + col];
-                c = alpha * tmp + beta * c;
-            }
-        }
-    }
-    else {
-        for (std::size_t col = 0; col < static_cast<std::size_t>(c_ncols); col++) {
-            for (std::size_t row = 0; row < opa_nrows; row++) {
-                fpType tmp = 0;
-                for (std::size_t i = static_cast<std::size_t>(iopa[row] - a_ind);
-                     i < static_cast<std::size_t>(iopa[row + 1] - a_ind); i++) {
-                    tmp += opa[i] * opb[static_cast<std::size_t>(jopa[i] - a_ind) + col * ldb_u];
-                }
-                fpType &c = c_ref[row + col * ldc_u];
-                c = alpha * tmp + beta * c;
-            }
-        }
-    }
-}
-
-template <typename fpType, typename intType>
-void prepare_reference_trsv_data(const intType *ia, const intType *ja, const fpType *a, intType m,
-                                 intType a_ind, oneapi::mkl::uplo uplo_val,
-                                 oneapi::mkl::transpose opA, oneapi::mkl::diag diag_val,
-                                 const fpType *x, fpType *y_ref) {
-    std::size_t mu = static_cast<std::size_t>(m);
-    auto dense_a = sparse_to_dense(ia, ja, a, mu, mu, a_ind, opA, diag_val);
-
-    //
-    // do TRSV operation
-    //
-    //  y_ref <- op(A)^-1 * x
-    //
-    // Compute each element of the reference one after the other starting from 0 (resp. the end) for a lower (resp. upper) triangular matrix.
-    // A matrix is considered lowered if it is lower and not transposed or upper and transposed.
-    const bool is_lower =
-        (uplo_val == oneapi::mkl::uplo::lower) == (opA == oneapi::mkl::transpose::nontrans);
-    for (std::size_t row = 0; row < mu; row++) {
-        std::size_t uplo_row = is_lower ? row : (mu - 1 - row);
-        fpType rhs = x[uplo_row];
-        for (std::size_t col = 0; col < row; col++) {
-            std::size_t uplo_col = is_lower ? col : (mu - 1 - col);
-            rhs -= dense_a[uplo_row * mu + uplo_col] * y_ref[uplo_col];
-        }
-        y_ref[uplo_row] = rhs / dense_a[uplo_row * mu + uplo_row];
-    }
-}
-
-#endif // _SPARSE_REFERENCE_HPP__
diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp
deleted file mode 100644
index fd1e91a47..000000000
--- a/tests/unit_tests/sparse_blas/include/test_common.hpp
+++ /dev/null
@@ -1,286 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#ifndef _TEST_COMMON_HPP__
-#define _TEST_COMMON_HPP__
-
-#include <complex>
-#include <iostream>
-#include <memory>
-#include <limits>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "test_helper.hpp"
-
-// Sparse BLAS domain needs to call more functions per test so we use this macro helper to select between runtime and compile dispatch for each function
-#ifdef CALL_RT_API
-#define CALL_RT_OR_CT(FUNC, QUEUE, ...) FUNC(QUEUE, __VA_ARGS__)
-#else
-#define CALL_RT_OR_CT(FUNC, QUEUE, ...) TEST_RUN_CT_SELECT(QUEUE, FUNC, __VA_ARGS__);
-#endif
-
-template <typename T>
-struct complex_info {
-    using real_type = T;
-    static const bool is_complex = false;
-};
-
-template <typename T>
-struct complex_info<std::complex<T>> {
-    using real_type = T;
-    static const bool is_complex = true;
-};
-
-void print_error_code(sycl::exception const &e);
-
-// Catch asynchronous exceptions.
-struct exception_handler_t {
-    void operator()(sycl::exception_list exceptions) {
-        for (std::exception_ptr const &e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e) {
-                std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl;
-                print_error_code(e);
-            }
-        }
-    }
-};
-
-// Use a unique_ptr to automatically free device memory on unique_ptr destruction.
-template <class T>
-auto malloc_device_uptr(sycl::queue q, std::size_t num_elts) {
-    struct Deleter {
-        sycl::queue q;
-        Deleter(sycl::queue _q) : q(_q) {}
-        void operator()(T *ptr) {
-            sycl::free(ptr, q);
-        }
-    };
-    return std::unique_ptr<T, Deleter>(sycl::malloc_device<T>(num_elts, q), Deleter(q));
-}
-
-// SYCL buffer creation helper.
-template <typename vec>
-sycl::buffer<typename vec::value_type, 1> make_buffer(const vec &v) {
-    sycl::buffer<typename vec::value_type, 1> buf(v.data(), sycl::range<1>(v.size()));
-    return buf;
-}
-
-template <typename fpType>
-struct set_fp_value {
-    inline fpType operator()(fpType real, fpType /*imag*/) {
-        return real;
-    }
-};
-
-template <typename scalarType>
-struct set_fp_value<std::complex<scalarType>> {
-    inline auto operator()(scalarType real, scalarType imag) {
-        return std::complex<scalarType>(real, imag);
-    }
-};
-
-template <typename fpType>
-struct rand_scalar {
-    inline fpType operator()(double min, double max) {
-        return (fpType(std::rand()) / fpType(RAND_MAX)) * fpType(max - min) + fpType(min);
-    }
-};
-
-template <typename fpType>
-struct rand_scalar<std::complex<fpType>> {
-    inline std::complex<fpType> operator()(double min, double max) {
-        rand_scalar<fpType> rand;
-        return std::complex<fpType>(rand(min, max), rand(min, max));
-    }
-};
-
-template <typename fpType>
-void rand_vector(std::vector<fpType> &v, std::size_t n) {
-    using fpRealType = typename complex_info<fpType>::real_type;
-    v.resize(n);
-    rand_scalar<fpType> rand;
-    for (std::size_t i = 0; i < n; i++) {
-        v[i] = rand(fpRealType(-0.5), fpRealType(0.5));
-    }
-}
-
-template <typename fpType>
-void rand_matrix(std::vector<fpType> &m, oneapi::mkl::layout layout_val, std::size_t nrows,
-                 std::size_t ncols, std::size_t ld) {
-    using fpRealType = typename complex_info<fpType>::real_type;
-    std::size_t outer_size = nrows;
-    std::size_t inner_size = ncols;
-    if (layout_val == oneapi::mkl::layout::col_major) {
-        std::swap(outer_size, inner_size);
-    }
-    m.resize(outer_size * ld);
-    rand_scalar<fpType> rand;
-    for (std::size_t i = 0; i < outer_size; ++i) {
-        std::size_t j = 0;
-        for (; j < inner_size; ++j) {
-            m[i * ld + j] = rand(fpRealType(-0.5), fpRealType(0.5));
-        }
-        for (; j < ld; ++j) {
-            m[i * ld + j] = set_fp_value<fpType>()(-1.f, 0.f);
-        }
-    }
-}
-
-// Creating the 3arrays CSR representation (ia, ja, values)
-// of general random sparse matrix
-// with density (0 < density <= 1.0)
-// -0.5 <= value < 0.5
-// require_diagonal means all diagonal entries guaranteed to be nonzero
-template <typename fpType, typename intType>
-intType generate_random_matrix(const intType nrows, const intType ncols, const double density_val,
-                               intType indexing, std::vector<intType> &ia, std::vector<intType> &ja,
-                               std::vector<fpType> &a, bool require_diagonal = false) {
-    intType nnz = 0;
-    rand_scalar<double> rand_density;
-    rand_scalar<fpType> rand_data;
-
-    ia.push_back(indexing); // starting index of row0.
-    for (intType i = 0; i < nrows; i++) {
-        ia.push_back(nnz + indexing); // ending index of row_i.
-        for (intType j = 0; j < ncols; j++) {
-            const bool is_diag = require_diagonal && i == j;
-            if (is_diag || (rand_density(0.0, 1.0) <= density_val)) {
-                fpType val;
-                if (is_diag) {
-                    // Guarantee an amplitude >= 0.1
-                    fpType sign = (std::rand() % 2) * 2 - 1;
-                    val = rand_data(0.1, 0.5) * sign;
-                }
-                else {
-                    val = rand_data(-0.5, 0.5);
-                }
-                a.push_back(val);
-                ja.push_back(j + indexing);
-                nnz++;
-            }
-        }
-        ia[static_cast<std::size_t>(i) + 1] = nnz + indexing;
-    }
-    return nnz;
-}
-
-// Shuffle the 3arrays CSR representation (ia, ja, values)
-// of any sparse matrix and set values serially from 0..nnz.
-// Intended for use with sorting.
-template <typename fpType, typename intType>
-void shuffle_data(const intType *ia, intType *ja, fpType *a, const std::size_t nrows) {
-    //
-    // shuffle indices according to random seed
-    //
-    intType indexing = ia[0];
-    for (std::size_t i = 0; i < nrows; ++i) {
-        intType nnz_row = ia[i + 1] - ia[i];
-        for (intType j = ia[i] - indexing; j < ia[i + 1] - indexing; ++j) {
-            intType q = ia[i] - indexing + std::rand() % (nnz_row);
-            // swap element i and q
-            std::swap(ja[q], ja[j]);
-            std::swap(a[q], a[j]);
-        }
-    }
-}
-
-inline void wait_and_free(sycl::queue &main_queue, oneapi::mkl::sparse::matrix_handle_t *p_handle) {
-    main_queue.wait();
-    sycl::event ev_release;
-    CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, p_handle);
-    ev_release.wait();
-}
-
-template <typename fpType>
-bool check_equal(fpType x, fpType x_ref, double abs_error_margin, double rel_error_margin,
-                 std::ostream &out) {
-    using fpRealType = typename complex_info<fpType>::real_type;
-    static_assert(std::is_floating_point_v<fpRealType>,
-                  "Expected floating-point real or complex type.");
-
-    const fpRealType epsilon = std::numeric_limits<fpRealType>::epsilon();
-    const auto abs_bound = static_cast<fpRealType>(abs_error_margin) * epsilon;
-    const auto rel_bound = static_cast<fpRealType>(rel_error_margin) * epsilon;
-
-    const auto aerr = std::abs(x - x_ref);
-    const auto rerr = aerr / std::abs(x_ref);
-    const bool valid = (rerr <= rel_bound) || (aerr <= abs_bound);
-    if (!valid) {
-        out << "Mismatching results: actual = " << x << " vs. reference = " << x_ref << "\n";
-        out << " relative error = " << rerr << " absolute error = " << aerr
-            << " relative bound = " << rel_bound << " absolute bound = " << abs_bound << "\n";
-    }
-    return valid;
-}
-
-template <typename vecType1, typename vecType2>
-bool check_equal_vector(const vecType1 &v, const vecType2 &v_ref, double abs_error_factor = 10.0,
-                        double rel_error_factor = 200.0, std::ostream &out = std::cout) {
-    using T = typename vecType2::value_type;
-    std::size_t n = v.size();
-    if (n != v_ref.size()) {
-        out << "Mismatching size got " << n << " expected " << v_ref.size() << "\n";
-        return false;
-    }
-    if (n == 0) {
-        return true;
-    }
-
-    auto max_norm_ref =
-        *std::max_element(std::begin(v_ref), std::end(v_ref),
-                          [](const T &a, const T &b) { return std::abs(a) < std::abs(b); });
-    // Heuristic for the average-case error margins
-    double abs_error_margin =
-        abs_error_factor * std::abs(max_norm_ref) * std::log2(static_cast<double>(n));
-    double rel_error_margin = rel_error_factor * std::log2(static_cast<double>(n));
-
-    constexpr int max_print = 20;
-    int count = 0;
-    bool valid = true;
-
-    for (std::size_t i = 0; i < n; ++i) {
-        // Allow to convert the unsigned index `i` to a signed one to keep this function generic and allow for `v` and `v_ref` to be a vector, a pointer or a random access iterator.
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wsign-conversion"
-        auto res = v[i];
-        auto ref = v_ref[i];
-#pragma clang diagnostic pop
-        if (!check_equal(res, ref, abs_error_margin, rel_error_margin, out)) {
-            out << " at index i =" << i << "\n";
-            valid = false;
-            ++count;
-            if (count > max_print) {
-                return valid;
-            }
-        }
-    }
-
-    return valid;
-}
-
-#endif // _TEST_COMMON_HPP__
diff --git a/tests/unit_tests/sparse_blas/source/CMakeLists.txt b/tests/unit_tests/sparse_blas/source/CMakeLists.txt
deleted file mode 100644
index 3a1fcb288..000000000
--- a/tests/unit_tests/sparse_blas/source/CMakeLists.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-#===============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-#
-# SPDX-License-Identifier: Apache-2.0
-#===============================================================================
-
-set(SPBLAS_SOURCES
-  "sparse_gemm_buffer.cpp"
-  "sparse_gemm_usm.cpp"
-  "sparse_gemv_buffer.cpp"
-  "sparse_gemv_usm.cpp"
-  "sparse_trsv_buffer.cpp"
-  "sparse_trsv_usm.cpp"
-)
-
-include(WarningsUtils)
-
-if (BUILD_SHARED_LIBS)
-    add_library(spblas_source_rt OBJECT ${SPBLAS_SOURCES})
-    target_compile_options(spblas_source_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
-    target_include_directories(spblas_source_rt
-            PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-            PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-            PUBLIC ${PROJECT_SOURCE_DIR}/include
-            PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-            PUBLIC ${CMAKE_BINARY_DIR}/bin
-            )
-    if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-        add_sycl_to_target(TARGET spblas_source_rt SOURCES ${SPBLAS_SOURCES})
-    else ()
-        target_link_libraries(spblas_source_rt PUBLIC ONEMKL::SYCL::SYCL)
-    endif ()
-    target_link_libraries(spblas_source_rt PRIVATE onemkl_warnings)
-endif ()
-
-add_library(spblas_source_ct OBJECT ${SPBLAS_SOURCES})
-target_compile_options(spblas_source_ct PRIVATE -DNOMINMAX)
-target_include_directories(spblas_source_ct
-        PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
-        PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
-        PUBLIC ${PROJECT_SOURCE_DIR}/include
-        PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
-        PUBLIC ${CMAKE_BINARY_DIR}/bin
-        )
-if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
-    add_sycl_to_target(TARGET spblas_source_ct SOURCES ${SPBLAS_SOURCES})
-else ()
-    target_link_libraries(spblas_source_ct PUBLIC ONEMKL::SYCL::SYCL)
-endif ()
-target_link_libraries(spblas_source_ct PRIVATE onemkl_warnings)
diff --git a/tests/unit_tests/sparse_blas/source/sparse_gemm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_gemm_buffer.cpp
deleted file mode 100644
index 1c9549fcc..000000000
--- a/tests/unit_tests/sparse_blas/source/sparse_gemm_buffer.cpp
+++ /dev/null
@@ -1,302 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "sparse_reference.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fpType, typename intType>
-int test(sycl::device *dev, intType nrows_A, intType ncols_A, intType ncols_C,
-         double density_A_matrix, oneapi::mkl::index_base index,
-         oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A,
-         oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb, intType ldc,
-         bool opt_1_input, bool opt_2_inputs) {
-    sycl::queue main_queue(*dev, exception_handler_t());
-
-    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    std::size_t opa_nrows = static_cast<std::size_t>(
-        transpose_A == oneapi::mkl::transpose::nontrans ? nrows_A : ncols_A);
-    std::size_t opa_ncols = static_cast<std::size_t>(
-        transpose_A == oneapi::mkl::transpose::nontrans ? ncols_A : nrows_A);
-
-    // Input matrix in CSR format
-    std::vector<intType> ia_host, ja_host;
-    std::vector<fpType> a_host;
-    intType nnz = generate_random_matrix<fpType, intType>(nrows_A, ncols_A, density_A_matrix,
-                                                          int_index, ia_host, ja_host, a_host);
-
-    // Input and output dense vectors
-    std::vector<fpType> b_host, c_host;
-    rand_matrix(b_host, dense_matrix_layout, opa_ncols, static_cast<std::size_t>(ncols_C),
-                static_cast<std::size_t>(ldb));
-    rand_matrix(c_host, dense_matrix_layout, opa_nrows, static_cast<std::size_t>(ncols_C),
-                static_cast<std::size_t>(ldc));
-    std::vector<fpType> c_ref_host(c_host);
-
-    // Shuffle ordering of column indices/values to test sortedness
-    shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), static_cast<std::size_t>(nrows_A));
-
-    auto ia_buf = make_buffer(ia_host);
-    auto ja_buf = make_buffer(ja_host);
-    auto a_buf = make_buffer(a_host);
-    auto b_buf = make_buffer(b_host);
-    auto c_buf = make_buffer(c_host);
-
-    sycl::event ev_release;
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-    try {
-        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
-
-        CALL_RT_OR_CT(oneapi::mkl::sparse::set_csr_data, main_queue, handle, nrows_A, ncols_A, nnz,
-                      index, ia_buf, ja_buf, a_buf);
-
-        if (opt_1_input) {
-            CALL_RT_OR_CT(oneapi::mkl::sparse::optimize_gemm, main_queue, transpose_A, handle);
-        }
-
-        if (opt_2_inputs) {
-            CALL_RT_OR_CT(oneapi::mkl::sparse::optimize_gemm, main_queue, transpose_A, transpose_B,
-                          dense_matrix_layout, static_cast<std::int64_t>(ncols_C), handle);
-        }
-
-        CALL_RT_OR_CT(oneapi::mkl::sparse::gemm, main_queue, dense_matrix_layout, transpose_A,
-                      transpose_B, alpha, handle, b_buf, ncols_C, ldb, beta, c_buf, ldc);
-
-        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle);
-    }
-    catch (const sycl::exception &e) {
-        std::cout << "Caught synchronous SYCL exception during sparse GEMV:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-        return 0;
-    }
-    catch (const oneapi::mkl::unimplemented &e) {
-        wait_and_free(main_queue, &handle);
-        return test_skipped;
-    }
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of sparse GEMV:\n" << error.what() << std::endl;
-        return 0;
-    }
-
-    // Compute reference.
-    prepare_reference_gemm_data(ia_host.data(), ja_host.data(), a_host.data(), nrows_A, ncols_A,
-                                ncols_C, nnz, int_index, dense_matrix_layout, transpose_A,
-                                transpose_B, alpha, beta, ldb, ldc, b_host.data(),
-                                c_ref_host.data());
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto c_acc = c_buf.get_host_access(sycl::read_only);
-    bool valid = check_equal_vector(c_acc, c_ref_host);
-
-    ev_release.wait_and_throw();
-    return static_cast<int>(valid);
-}
-
-class SparseGemmBufferTests : public ::testing::TestWithParam<sycl::device *> {};
-
-/**
- * Helper function to run tests in different configuration.
- * 
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param transpose_A Transpose value for the A matrix
- * @param transpose_B Transpose value for the B matrix
- * @param num_passed Increase the number of configurations passed
- * @param num_skipped Increase the number of configurations skipped
- */
-template <typename fpType>
-void test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_A,
-                 oneapi::mkl::transpose transpose_B, int &num_passed, int &num_skipped) {
-    double density_A_matrix = 0.8;
-    fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
-    fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
-    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
-    oneapi::mkl::layout col_major = oneapi::mkl::layout::col_major;
-    int nrows_A = 4, ncols_A = 6, ncols_C = 5;
-    int ldb = transpose_A == oneapi::mkl::transpose::nontrans ? ncols_A : nrows_A;
-    int ldc = transpose_A == oneapi::mkl::transpose::nontrans ? nrows_A : ncols_A;
-    bool no_opt_1_input = false;
-    bool opt_2_inputs = true;
-
-    // Basic test
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test index_base 1
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, oneapi::mkl::index_base::one,
-             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, no_opt_1_input,
-             opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, set_fp_value<fpType>()(2.f, 1.5f), fp_zero, ldb, ldc, no_opt_1_input,
-             opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, set_fp_value<fpType>()(3.2f, 1.f), ldb, ldc, no_opt_1_input,
-             opt_2_inputs),
-        num_passed, num_skipped);
-    // Test 0 alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_zero, fp_one, ldb, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test 0 alpha and beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_zero, fp_zero, ldb, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default ldb
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb + 5, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default ldc
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc + 6, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test row major layout
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-             oneapi::mkl::layout::row_major, transpose_A, transpose_B, fp_one, fp_zero, ncols_C,
-             ncols_C, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test int64 indices
-    long long_nrows_A = 27, long_ncols_A = 13, long_ncols_C = 6;
-    long long_ldb = transpose_A == oneapi::mkl::transpose::nontrans ? long_ncols_A : long_nrows_A;
-    long long_ldc = transpose_A == oneapi::mkl::transpose::nontrans ? long_nrows_A : long_ncols_A;
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, long_nrows_A, long_ncols_A, long_ncols_C, density_A_matrix,
-                                    index_zero, col_major, transpose_A, transpose_B, fp_one,
-                                    fp_zero, long_ldb, long_ldc, no_opt_1_input, opt_2_inputs),
-                               num_passed, num_skipped);
-    // Use optimize_gemm with only the sparse gemm input
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, true, false),
-        num_passed, num_skipped);
-    // Use the 2 optimize_gemm versions
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, true, true),
-        num_passed, num_skipped);
-    // Do not use optimize_gemm
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, false, false),
-        num_passed, num_skipped);
-}
-
-/**
- * Helper function to test combination of transpose vals.
- * Only test \p conjtrans if \p fpType is complex.
- * 
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param num_passed Increase the number of configurations passed
- * @param num_skipped Increase the number of configurations skipped
- */
-template <typename fpType>
-void test_helper_transpose(sycl::device *dev, int &num_passed, int &num_skipped) {
-    std::vector<oneapi::mkl::transpose> transpose_vals{ oneapi::mkl::transpose::nontrans,
-                                                        oneapi::mkl::transpose::trans };
-    if (complex_info<fpType>::is_complex) {
-        transpose_vals.push_back(oneapi::mkl::transpose::conjtrans);
-    }
-    for (auto transpose_A : transpose_vals) {
-        for (auto transpose_B : transpose_vals) {
-            test_helper<fpType>(dev, transpose_A, transpose_B, num_passed, num_skipped);
-        }
-    }
-}
-
-TEST_P(SparseGemmBufferTests, RealSinglePrecision) {
-    using fpType = float;
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemmBufferTests, RealDoublePrecision) {
-    using fpType = double;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemmBufferTests, ComplexSinglePrecision) {
-    using fpType = std::complex<float>;
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemmBufferTests, ComplexDoublePrecision) {
-    using fpType = std::complex<double>;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-INSTANTIATE_TEST_SUITE_P(SparseGemmBufferTestSuite, SparseGemmBufferTests,
-                         testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/sparse_blas/source/sparse_gemm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_gemm_usm.cpp
deleted file mode 100644
index 3850f3b99..000000000
--- a/tests/unit_tests/sparse_blas/source/sparse_gemm_usm.cpp
+++ /dev/null
@@ -1,330 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "sparse_reference.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fpType, typename intType>
-int test(sycl::device *dev, intType nrows_A, intType ncols_A, intType ncols_C,
-         double density_A_matrix, oneapi::mkl::index_base index,
-         oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A,
-         oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb, intType ldc,
-         bool opt_1_input, bool opt_2_inputs) {
-    sycl::queue main_queue(*dev, exception_handler_t());
-
-    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    std::size_t opa_nrows = static_cast<std::size_t>(
-        transpose_A == oneapi::mkl::transpose::nontrans ? nrows_A : ncols_A);
-    std::size_t opa_ncols = static_cast<std::size_t>(
-        transpose_A == oneapi::mkl::transpose::nontrans ? ncols_A : nrows_A);
-
-    // Input matrix in CSR format
-    std::vector<intType> ia_host, ja_host;
-    std::vector<fpType> a_host;
-    intType nnz = generate_random_matrix<fpType, intType>(nrows_A, ncols_A, density_A_matrix,
-                                                          int_index, ia_host, ja_host, a_host);
-
-    // Input and output dense vectors
-    std::vector<fpType> b_host, c_host;
-    rand_matrix(b_host, dense_matrix_layout, opa_ncols, static_cast<std::size_t>(ncols_C),
-                static_cast<std::size_t>(ldb));
-    rand_matrix(c_host, dense_matrix_layout, opa_nrows, static_cast<std::size_t>(ncols_C),
-                static_cast<std::size_t>(ldc));
-    std::vector<fpType> c_ref_host(c_host);
-
-    // Shuffle ordering of column indices/values to test sortedness
-    shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), static_cast<std::size_t>(nrows_A));
-
-    auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
-    auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
-    auto a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size());
-    auto b_usm_uptr = malloc_device_uptr<fpType>(main_queue, b_host.size());
-    auto c_usm_uptr = malloc_device_uptr<fpType>(main_queue, c_host.size());
-
-    intType *ia_usm = ia_usm_uptr.get();
-    intType *ja_usm = ja_usm_uptr.get();
-    fpType *a_usm = a_usm_uptr.get();
-    fpType *b_usm = b_usm_uptr.get();
-    fpType *c_usm = c_usm_uptr.get();
-
-    std::vector<sycl::event> mat_dependencies;
-    std::vector<sycl::event> gemm_dependencies;
-    // Copy host to device
-    mat_dependencies.push_back(
-        main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
-    gemm_dependencies.push_back(
-        main_queue.memcpy(b_usm, b_host.data(), b_host.size() * sizeof(fpType)));
-    gemm_dependencies.push_back(
-        main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType)));
-
-    sycl::event ev_copy, ev_release;
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-    try {
-        sycl::event event;
-        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
-
-        CALL_RT_OR_CT(event = oneapi::mkl::sparse::set_csr_data, main_queue, handle, nrows_A,
-                      ncols_A, nnz, index, ia_usm, ja_usm, a_usm, mat_dependencies);
-
-        if (opt_1_input) {
-            CALL_RT_OR_CT(event = oneapi::mkl::sparse::optimize_gemm, main_queue, transpose_A,
-                          handle, { event });
-        }
-
-        if (opt_2_inputs) {
-            CALL_RT_OR_CT(event = oneapi::mkl::sparse::optimize_gemm, main_queue, transpose_A,
-                          transpose_B, dense_matrix_layout, static_cast<std::int64_t>(ncols_C),
-                          handle, { event });
-        }
-
-        gemm_dependencies.push_back(event);
-        CALL_RT_OR_CT(event = oneapi::mkl::sparse::gemm, main_queue, dense_matrix_layout,
-                      transpose_A, transpose_B, alpha, handle, b_usm, ncols_C, ldb, beta, c_usm,
-                      ldc, gemm_dependencies);
-
-        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle,
-                      { event });
-
-        ev_copy = main_queue.memcpy(c_host.data(), c_usm, c_host.size() * sizeof(fpType), event);
-    }
-    catch (const sycl::exception &e) {
-        std::cout << "Caught synchronous SYCL exception during sparse GEMV:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-        return 0;
-    }
-    catch (const oneapi::mkl::unimplemented &e) {
-        wait_and_free(main_queue, &handle);
-        return test_skipped;
-    }
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of sparse GEMV:\n" << error.what() << std::endl;
-        return 0;
-    }
-
-    // Compute reference.
-    prepare_reference_gemm_data(ia_host.data(), ja_host.data(), a_host.data(), nrows_A, ncols_A,
-                                ncols_C, nnz, int_index, dense_matrix_layout, transpose_A,
-                                transpose_B, alpha, beta, ldb, ldc, b_host.data(),
-                                c_ref_host.data());
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    ev_copy.wait_and_throw();
-    bool valid = check_equal_vector(c_host, c_ref_host);
-
-    ev_release.wait_and_throw();
-    return static_cast<int>(valid);
-}
-
-class SparseGemmUsmTests : public ::testing::TestWithParam<sycl::device *> {};
-
-/**
- * Helper function to run tests in different configuration.
- * 
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param transpose_A Transpose value for the A matrix
- * @param transpose_B Transpose value for the B matrix
- * @param num_passed Increase the number of configurations passed
- * @param num_skipped Increase the number of configurations skipped
- */
-template <typename fpType>
-void test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_A,
-                 oneapi::mkl::transpose transpose_B, int &num_passed, int &num_skipped) {
-    double density_A_matrix = 0.8;
-    fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
-    fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
-    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
-    oneapi::mkl::layout col_major = oneapi::mkl::layout::col_major;
-    int nrows_A = 4, ncols_A = 6, ncols_C = 5;
-    int ldb = transpose_A == oneapi::mkl::transpose::nontrans ? ncols_A : nrows_A;
-    int ldc = transpose_A == oneapi::mkl::transpose::nontrans ? nrows_A : ncols_A;
-    bool no_opt_1_input = false;
-    bool opt_2_inputs = true;
-
-    // Basic test
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test index_base 1
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, oneapi::mkl::index_base::one,
-             col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, no_opt_1_input,
-             opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, set_fp_value<fpType>()(2.f, 1.5f), fp_zero, ldb, ldc, no_opt_1_input,
-             opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, set_fp_value<fpType>()(3.2f, 1.f), ldb, ldc, no_opt_1_input,
-             opt_2_inputs),
-        num_passed, num_skipped);
-    // Test 0 alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_zero, fp_one, ldb, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test 0 alpha and beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_zero, fp_zero, ldb, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default ldb
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb + 5, ldc, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test non-default ldc
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc + 6, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test row major layout
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero,
-             oneapi::mkl::layout::row_major, transpose_A, transpose_B, fp_one, fp_zero, ncols_C,
-             ncols_C, no_opt_1_input, opt_2_inputs),
-        num_passed, num_skipped);
-    // Test int64 indices
-    long long_nrows_A = 27, long_ncols_A = 13, long_ncols_C = 6;
-    long long_ldb = transpose_A == oneapi::mkl::transpose::nontrans ? long_ncols_A : long_nrows_A;
-    long long_ldc = transpose_A == oneapi::mkl::transpose::nontrans ? long_nrows_A : long_ncols_A;
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, long_nrows_A, long_ncols_A, long_ncols_C, density_A_matrix,
-                                    index_zero, col_major, transpose_A, transpose_B, fp_one,
-                                    fp_zero, long_ldb, long_ldc, no_opt_1_input, opt_2_inputs),
-                               num_passed, num_skipped);
-    // Use optimize_gemm with only the sparse gemm input
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, true, false),
-        num_passed, num_skipped);
-    // Use the 2 optimize_gemm versions
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, true, true),
-        num_passed, num_skipped);
-    // Do not use optimize_gemm
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A,
-             transpose_B, fp_one, fp_zero, ldb, ldc, false, false),
-        num_passed, num_skipped);
-}
-
-/**
- * Helper function to test combination of transpose vals.
- * Only test \p conjtrans if \p fpType is complex.
- * 
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param num_passed Increase the number of configurations passed
- * @param num_skipped Increase the number of configurations skipped
- */
-template <typename fpType>
-auto test_helper_transpose(sycl::device *dev, int &num_passed, int &num_skipped) {
-    std::vector<oneapi::mkl::transpose> transpose_vals{ oneapi::mkl::transpose::nontrans,
-                                                        oneapi::mkl::transpose::trans };
-    if (complex_info<fpType>::is_complex) {
-        transpose_vals.push_back(oneapi::mkl::transpose::conjtrans);
-    }
-    for (auto transpose_A : transpose_vals) {
-        for (auto transpose_B : transpose_vals) {
-            test_helper<fpType>(dev, transpose_A, transpose_B, num_passed, num_skipped);
-        }
-    }
-}
-
-TEST_P(SparseGemmUsmTests, RealSinglePrecision) {
-    using fpType = float;
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemmUsmTests, RealDoublePrecision) {
-    using fpType = double;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemmUsmTests, ComplexSinglePrecision) {
-    using fpType = std::complex<float>;
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemmUsmTests, ComplexDoublePrecision) {
-    using fpType = std::complex<double>;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper_transpose<fpType>(GetParam(), num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-INSTANTIATE_TEST_SUITE_P(SparseGemmUsmTestSuite, SparseGemmUsmTests, testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp
deleted file mode 100644
index b95636831..000000000
--- a/tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "sparse_reference.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fpType, typename intType>
-int test(sycl::device *dev, intType nrows, intType ncols, double density_A_matrix,
-         oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha,
-         fpType beta, bool use_optimize) {
-    sycl::queue main_queue(*dev, exception_handler_t());
-
-    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    std::size_t opa_nrows =
-        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? nrows : ncols);
-    std::size_t opa_ncols =
-        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? ncols : nrows);
-
-    // Input matrix in CSR format
-    std::vector<intType> ia_host, ja_host;
-    std::vector<fpType> a_host;
-    intType nnz = generate_random_matrix<fpType, intType>(nrows, ncols, density_A_matrix, int_index,
-                                                          ia_host, ja_host, a_host);
-
-    // Input and output dense vectors
-    // The input `x` and the input-output `y` are both initialized to random values on host and device.
-    std::vector<fpType> x_host, y_host;
-    rand_vector(x_host, opa_ncols);
-    rand_vector(y_host, opa_nrows);
-    std::vector<fpType> y_ref_host(y_host);
-
-    // Shuffle ordering of column indices/values to test sortedness
-    shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), static_cast<std::size_t>(nrows));
-
-    auto ia_buf = make_buffer(ia_host);
-    auto ja_buf = make_buffer(ja_host);
-    auto a_buf = make_buffer(a_host);
-    auto x_buf = make_buffer(x_host);
-    auto y_buf = make_buffer(y_host);
-
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-    sycl::event ev_release;
-    try {
-        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
-
-        CALL_RT_OR_CT(oneapi::mkl::sparse::set_csr_data, main_queue, handle, nrows, ncols, nnz,
-                      index, ia_buf, ja_buf, a_buf);
-
-        if (use_optimize) {
-            CALL_RT_OR_CT(oneapi::mkl::sparse::optimize_gemv, main_queue, transpose_val, handle);
-        }
-
-        CALL_RT_OR_CT(oneapi::mkl::sparse::gemv, main_queue, transpose_val, alpha, handle, x_buf,
-                      beta, y_buf);
-
-        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle);
-    }
-    catch (const sycl::exception &e) {
-        std::cout << "Caught synchronous SYCL exception during sparse GEMV:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-        return 0;
-    }
-    catch (const oneapi::mkl::unimplemented &e) {
-        wait_and_free(main_queue, &handle);
-        return test_skipped;
-    }
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of sparse GEMV:\n" << error.what() << std::endl;
-        return 0;
-    }
-
-    // Compute reference.
-    prepare_reference_gemv_data(ia_host.data(), ja_host.data(), a_host.data(), nrows, ncols, nnz,
-                                int_index, transpose_val, alpha, beta, x_host.data(),
-                                y_ref_host.data());
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_acc = y_buf.get_host_access(sycl::read_only);
-    bool valid = check_equal_vector(y_acc, y_ref_host);
-
-    ev_release.wait_and_throw();
-    return static_cast<int>(valid);
-}
-
-class SparseGemvBufferTests : public ::testing::TestWithParam<sycl::device *> {};
-
-/**
- * Helper function to run tests in different configuration.
- *
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param transpose_val Transpose value for the input matrix
- * @param num_passed Increase the number of configurations passed
- * @param num_skipped Increase the number of configurations skipped
- */
-template <typename fpType>
-void test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_val, int &num_passed,
-                 int &num_skipped) {
-    double density_A_matrix = 0.8;
-    fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
-    fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
-    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
-    bool use_optimize = true;
-
-    // Basic test
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, use_optimize),
-        num_passed, num_skipped);
-    // Test index_base 1
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, oneapi::mkl::index_base::one,
-                                    transpose_val, fp_one, fp_zero, use_optimize),
-                               num_passed, num_skipped);
-    // Test non-default alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val,
-                                    set_fp_value<fpType>()(2.f, 1.5f), fp_zero, use_optimize),
-                               num_passed, num_skipped);
-    // Test non-default beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one,
-                                    set_fp_value<fpType>()(3.2f, 1.f), use_optimize),
-                               num_passed, num_skipped);
-    // Test 0 alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero, fp_one, use_optimize),
-        num_passed, num_skipped);
-    // Test 0 alpha and beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero,
-                                    fp_zero, use_optimize),
-                               num_passed, num_skipped);
-    // Test int64 indices
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 27L, 13L, density_A_matrix, index_zero, transpose_val,
-                                    fp_one, fp_one, use_optimize),
-                               num_passed, num_skipped);
-    // Test without optimize_gemv
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, false),
-        num_passed, num_skipped);
-}
-
-TEST_P(SparseGemvBufferTests, RealSinglePrecision) {
-    using fpType = float;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemvBufferTests, RealDoublePrecision) {
-    using fpType = double;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemvBufferTests, ComplexSinglePrecision) {
-    using fpType = std::complex<float>;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemvBufferTests, ComplexDoublePrecision) {
-    using fpType = std::complex<double>;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-INSTANTIATE_TEST_SUITE_P(SparseGemvBufferTestSuite, SparseGemvBufferTests,
-                         testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp
deleted file mode 100644
index 582e0c6f4..000000000
--- a/tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "sparse_reference.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fpType, typename intType>
-int test(sycl::device *dev, intType nrows, intType ncols, double density_A_matrix,
-         oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha,
-         fpType beta, bool use_optimize) {
-    sycl::queue main_queue(*dev, exception_handler_t());
-
-    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    std::size_t opa_nrows =
-        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? nrows : ncols);
-    std::size_t opa_ncols =
-        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? ncols : nrows);
-
-    // Input matrix in CSR format
-    std::vector<intType> ia_host, ja_host;
-    std::vector<fpType> a_host;
-    intType nnz = generate_random_matrix<fpType, intType>(nrows, ncols, density_A_matrix, int_index,
-                                                          ia_host, ja_host, a_host);
-
-    // Input and output dense vectors
-    // The input `x` and the input-output `y` are both initialized to random values on host and device.
-    std::vector<fpType> x_host, y_host;
-    rand_vector(x_host, opa_ncols);
-    rand_vector(y_host, opa_nrows);
-    std::vector<fpType> y_ref_host(y_host);
-
-    // Shuffle ordering of column indices/values to test sortedness
-    shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), static_cast<std::size_t>(nrows));
-
-    auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
-    auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
-    auto a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size());
-    auto x_usm_uptr = malloc_device_uptr<fpType>(main_queue, x_host.size());
-    auto y_usm_uptr = malloc_device_uptr<fpType>(main_queue, y_host.size());
-
-    intType *ia_usm = ia_usm_uptr.get();
-    intType *ja_usm = ja_usm_uptr.get();
-    fpType *a_usm = a_usm_uptr.get();
-    fpType *x_usm = x_usm_uptr.get();
-    fpType *y_usm = y_usm_uptr.get();
-
-    std::vector<sycl::event> mat_dependencies;
-    std::vector<sycl::event> gemv_dependencies;
-    // Copy host to device
-    mat_dependencies.push_back(
-        main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
-    gemv_dependencies.push_back(
-        main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType)));
-    gemv_dependencies.push_back(
-        main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType)));
-
-    sycl::event ev_copy, ev_release;
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-    try {
-        sycl::event event;
-        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
-
-        CALL_RT_OR_CT(event = oneapi::mkl::sparse::set_csr_data, main_queue, handle, nrows, ncols,
-                      nnz, index, ia_usm, ja_usm, a_usm, mat_dependencies);
-
-        if (use_optimize) {
-            CALL_RT_OR_CT(event = oneapi::mkl::sparse::optimize_gemv, main_queue, transpose_val,
-                          handle, { event });
-        }
-
-        gemv_dependencies.push_back(event);
-        CALL_RT_OR_CT(event = oneapi::mkl::sparse::gemv, main_queue, transpose_val, alpha, handle,
-                      x_usm, beta, y_usm, gemv_dependencies);
-
-        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle,
-                      { event });
-
-        ev_copy = main_queue.memcpy(y_host.data(), y_usm, y_host.size() * sizeof(fpType), event);
-    }
-    catch (const sycl::exception &e) {
-        std::cout << "Caught synchronous SYCL exception during sparse GEMV:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-        return 0;
-    }
-    catch (const oneapi::mkl::unimplemented &e) {
-        wait_and_free(main_queue, &handle);
-        return test_skipped;
-    }
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of sparse GEMV:\n" << error.what() << std::endl;
-        return 0;
-    }
-
-    // Compute reference.
-    prepare_reference_gemv_data(ia_host.data(), ja_host.data(), a_host.data(), nrows, ncols, nnz,
-                                int_index, transpose_val, alpha, beta, x_host.data(),
-                                y_ref_host.data());
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    ev_copy.wait_and_throw();
-    bool valid = check_equal_vector(y_host, y_ref_host);
-
-    ev_release.wait_and_throw();
-    return static_cast<int>(valid);
-}
-
-class SparseGemvUsmTests : public ::testing::TestWithParam<sycl::device *> {};
-
-/**
- * Helper function to run tests in different configuration.
- *
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param transpose_val Transpose value for the input matrix
- * @param num_passed Increase the number of configurations passed
- * @param num_skipped Increase the number of configurations skipped
- */
-template <typename fpType>
-void test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_val, int &num_passed,
-                 int &num_skipped) {
-    double density_A_matrix = 0.8;
-    fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
-    fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
-    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
-    bool use_optimize = true;
-
-    // Basic test
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, use_optimize),
-        num_passed, num_skipped);
-    // Test index_base 1
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, oneapi::mkl::index_base::one,
-                                    transpose_val, fp_one, fp_zero, use_optimize),
-                               num_passed, num_skipped);
-    // Test non-default alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val,
-                                    set_fp_value<fpType>()(2.f, 1.5f), fp_zero, use_optimize),
-                               num_passed, num_skipped);
-    // Test non-default beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one,
-                                    set_fp_value<fpType>()(3.2f, 1.f), use_optimize),
-                               num_passed, num_skipped);
-    // Test 0 alpha
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero, fp_one, use_optimize),
-        num_passed, num_skipped);
-    // Test 0 alpha and beta
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero,
-                                    fp_zero, use_optimize),
-                               num_passed, num_skipped);
-    // Test int64 indices
-    EXPECT_TRUE_OR_FUTURE_SKIP(test(dev, 27L, 13L, density_A_matrix, index_zero, transpose_val,
-                                    fp_one, fp_one, use_optimize),
-                               num_passed, num_skipped);
-    // Test without optimize_gemv
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, false),
-        num_passed, num_skipped);
-}
-
-TEST_P(SparseGemvUsmTests, RealSinglePrecision) {
-    using fpType = float;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemvUsmTests, RealDoublePrecision) {
-    using fpType = double;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemvUsmTests, ComplexSinglePrecision) {
-    using fpType = std::complex<float>;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseGemvUsmTests, ComplexDoublePrecision) {
-    using fpType = std::complex<double>;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-INSTANTIATE_TEST_SUITE_P(SparseGemvUsmTestSuite, SparseGemvUsmTests, testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/sparse_blas/source/sparse_trsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_trsv_buffer.cpp
deleted file mode 100644
index 4e82ae1f0..000000000
--- a/tests/unit_tests/sparse_blas/source/sparse_trsv_buffer.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "sparse_reference.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fpType, typename intType>
-int test(sycl::device *dev, intType m, double density_A_matrix, oneapi::mkl::index_base index,
-         oneapi::mkl::uplo uplo_val, oneapi::mkl::transpose transpose_val,
-         oneapi::mkl::diag diag_val, bool use_optimize) {
-    sycl::queue main_queue(*dev, exception_handler_t());
-
-    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    const std::size_t mu = static_cast<std::size_t>(m);
-
-    // Input matrix in CSR format
-    std::vector<intType> ia_host, ja_host;
-    std::vector<fpType> a_host;
-    // Always require values to be present in the diagonal of the sparse matrix.
-    // The values set in the matrix don't need to be 1s even if diag_val is unit.
-    const bool require_diagonal = true;
-    intType nnz = generate_random_matrix<fpType, intType>(
-        m, m, density_A_matrix, int_index, ia_host, ja_host, a_host, require_diagonal);
-
-    // Input dense vector.
-    // The input `x` is initialized to random values on host and device.
-    std::vector<fpType> x_host;
-    rand_vector(x_host, mu);
-
-    // Output and reference dense vectors.
-    // They are both initialized with a dummy value to catch more errors.
-    std::vector<fpType> y_host(mu, -2.0f);
-    std::vector<fpType> y_ref_host(y_host);
-
-    // Intel oneMKL does not support unsorted data if
-    // `sparse::optimize_trsv()` is not called first.
-    if (use_optimize) {
-        // Shuffle ordering of column indices/values to test sortedness
-        shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), mu);
-    }
-
-    auto ia_buf = make_buffer(ia_host);
-    auto ja_buf = make_buffer(ja_host);
-    auto a_buf = make_buffer(a_host);
-    auto x_buf = make_buffer(x_host);
-    auto y_buf = make_buffer(y_host);
-
-    sycl::event ev_release;
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-    try {
-        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
-
-        CALL_RT_OR_CT(oneapi::mkl::sparse::set_csr_data, main_queue, handle, m, m, nnz, index,
-                      ia_buf, ja_buf, a_buf);
-
-        if (use_optimize) {
-            CALL_RT_OR_CT(oneapi::mkl::sparse::optimize_trsv, main_queue, uplo_val, transpose_val,
-                          diag_val, handle);
-        }
-
-        CALL_RT_OR_CT(oneapi::mkl::sparse::trsv, main_queue, uplo_val, transpose_val, diag_val,
-                      handle, x_buf, y_buf);
-
-        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle);
-    }
-    catch (const sycl::exception &e) {
-        std::cout << "Caught synchronous SYCL exception during sparse TRSV:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-        return 0;
-    }
-    catch (const oneapi::mkl::unimplemented &e) {
-        wait_and_free(main_queue, &handle);
-        return test_skipped;
-    }
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of sparse TRSV:\n" << error.what() << std::endl;
-        return 0;
-    }
-
-    // Compute reference.
-    prepare_reference_trsv_data(ia_host.data(), ja_host.data(), a_host.data(), m, int_index,
-                                uplo_val, transpose_val, diag_val, x_host.data(),
-                                y_ref_host.data());
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    auto y_acc = y_buf.get_host_access(sycl::read_only);
-    bool valid = check_equal_vector(y_acc, y_ref_host);
-
-    ev_release.wait_and_throw();
-    return static_cast<int>(valid);
-}
-
-class SparseTrsvBufferTests : public ::testing::TestWithParam<sycl::device *> {};
-
-/**
- * Helper function to run tests in different configuration.
- *
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param transpose_val Transpose value for the input matrix
- * @param num_passed Increase the number of configurations passed
- * @param num_skipped Increase the number of configurations skipped
- */
-template <typename fpType>
-auto test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_val, int &num_passed,
-                 int &num_skipped) {
-    double density_A_matrix = 0.144;
-    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
-    oneapi::mkl::uplo lower = oneapi::mkl::uplo::lower;
-    oneapi::mkl::diag nonunit = oneapi::mkl::diag::nonunit;
-    int m = 277;
-    bool use_optimize = true;
-
-    // Basic test
-    EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, m, density_A_matrix, index_zero, lower,
-                                            transpose_val, nonunit, use_optimize),
-                               num_passed, num_skipped);
-    // Test index_base 1
-    EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, m, density_A_matrix, oneapi::mkl::index_base::one,
-                                            lower, transpose_val, nonunit, use_optimize),
-                               num_passed, num_skipped);
-    // Test upper triangular matrix
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test<fpType>(dev, m, density_A_matrix, index_zero, oneapi::mkl::uplo::upper, transpose_val,
-                     nonunit, use_optimize),
-        num_passed, num_skipped);
-    // Test unit diagonal matrix
-    EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, m, density_A_matrix, index_zero, lower,
-                                            transpose_val, oneapi::mkl::diag::unit, use_optimize),
-                               num_passed, num_skipped);
-    // Temporarily disable trsv using long indices on GPU
-    if (!dev->is_gpu()) {
-        // Test int64 indices
-        EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, 15L, density_A_matrix, index_zero, lower,
-                                                transpose_val, nonunit, use_optimize),
-                                   num_passed, num_skipped);
-    }
-    // Test lower without optimize_trsv
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test<fpType>(dev, m, density_A_matrix, index_zero, lower, transpose_val, nonunit, false),
-        num_passed, num_skipped);
-    // Test upper without optimize_trsv
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test<fpType>(dev, m, density_A_matrix, index_zero, oneapi::mkl::uplo::upper, transpose_val,
-                     nonunit, false),
-        num_passed, num_skipped);
-}
-
-TEST_P(SparseTrsvBufferTests, RealSinglePrecision) {
-    using fpType = float;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseTrsvBufferTests, RealDoublePrecision) {
-    using fpType = double;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseTrsvBufferTests, ComplexSinglePrecision) {
-    using fpType = std::complex<float>;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseTrsvBufferTests, ComplexDoublePrecision) {
-    using fpType = std::complex<double>;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-INSTANTIATE_TEST_SUITE_P(SparseTrsvBufferTestSuite, SparseTrsvBufferTests,
-                         testing::ValuesIn(devices), ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/tests/unit_tests/sparse_blas/source/sparse_trsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_trsv_usm.cpp
deleted file mode 100644
index 8292395fb..000000000
--- a/tests/unit_tests/sparse_blas/source/sparse_trsv_usm.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-#include <complex>
-#include <iostream>
-#include <vector>
-
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/mkl.hpp"
-#include "oneapi/mkl/detail/config.hpp"
-#include "sparse_reference.hpp"
-#include "test_common.hpp"
-#include "test_helper.hpp"
-
-#include <gtest/gtest.h>
-
-extern std::vector<sycl::device *> devices;
-
-namespace {
-
-template <typename fpType, typename intType>
-int test(sycl::device *dev, intType m, double density_A_matrix, oneapi::mkl::index_base index,
-         oneapi::mkl::uplo uplo_val, oneapi::mkl::transpose transpose_val,
-         oneapi::mkl::diag diag_val, bool use_optimize) {
-    sycl::queue main_queue(*dev, exception_handler_t());
-
-    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
-    const std::size_t mu = static_cast<std::size_t>(m);
-
-    // Input matrix in CSR format
-    std::vector<intType> ia_host, ja_host;
-    std::vector<fpType> a_host;
-    const bool require_diagonal = diag_val == oneapi::mkl::diag::nonunit;
-    intType nnz = generate_random_matrix<fpType, intType>(
-        m, m, density_A_matrix, int_index, ia_host, ja_host, a_host, require_diagonal);
-
-    // Input dense vector.
-    // The input `x` is initialized to random values on host and device.
-    std::vector<fpType> x_host;
-    rand_vector(x_host, mu);
-
-    // Output and reference dense vectors.
-    // They are both initialized with a dummy value to catch more errors.
-    std::vector<fpType> y_host(mu, -2.0f);
-    std::vector<fpType> y_ref_host(y_host);
-
-    // Intel oneMKL does not support unsorted data if
-    // `sparse::optimize_trsv()` is not called first.
-    if (use_optimize) {
-        // Shuffle ordering of column indices/values to test sortedness
-        shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), mu);
-    }
-
-    auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
-    auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
-    auto a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size());
-    auto x_usm_uptr = malloc_device_uptr<fpType>(main_queue, x_host.size());
-    auto y_usm_uptr = malloc_device_uptr<fpType>(main_queue, y_host.size());
-
-    intType *ia_usm = ia_usm_uptr.get();
-    intType *ja_usm = ja_usm_uptr.get();
-    fpType *a_usm = a_usm_uptr.get();
-    fpType *x_usm = x_usm_uptr.get();
-    fpType *y_usm = y_usm_uptr.get();
-
-    std::vector<sycl::event> mat_dependencies;
-    std::vector<sycl::event> trsv_dependencies;
-    // Copy host to device
-    mat_dependencies.push_back(
-        main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType)));
-    mat_dependencies.push_back(
-        main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
-    trsv_dependencies.push_back(
-        main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType)));
-    trsv_dependencies.push_back(
-        main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType)));
-
-    sycl::event ev_copy, ev_release;
-    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
-    try {
-        sycl::event event;
-        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
-
-        CALL_RT_OR_CT(event = oneapi::mkl::sparse::set_csr_data, main_queue, handle, m, m, nnz,
-                      index, ia_usm, ja_usm, a_usm, mat_dependencies);
-
-        if (use_optimize) {
-            CALL_RT_OR_CT(event = oneapi::mkl::sparse::optimize_trsv, main_queue, uplo_val,
-                          transpose_val, diag_val, handle, { event });
-        }
-
-        trsv_dependencies.push_back(event);
-        CALL_RT_OR_CT(event = oneapi::mkl::sparse::trsv, main_queue, uplo_val, transpose_val,
-                      diag_val, handle, x_usm, y_usm, trsv_dependencies);
-
-        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle,
-                      { event });
-
-        ev_copy = main_queue.memcpy(y_host.data(), y_usm, y_host.size() * sizeof(fpType), event);
-    }
-    catch (const sycl::exception &e) {
-        std::cout << "Caught synchronous SYCL exception during sparse TRSV:\n"
-                  << e.what() << std::endl;
-        print_error_code(e);
-        return 0;
-    }
-    catch (const oneapi::mkl::unimplemented &e) {
-        wait_and_free(main_queue, &handle);
-        return test_skipped;
-    }
-    catch (const std::runtime_error &error) {
-        std::cout << "Error raised during execution of sparse TRSV:\n" << error.what() << std::endl;
-        return 0;
-    }
-
-    // Compute reference.
-    prepare_reference_trsv_data(ia_host.data(), ja_host.data(), a_host.data(), m, int_index,
-                                uplo_val, transpose_val, diag_val, x_host.data(),
-                                y_ref_host.data());
-
-    // Compare the results of reference implementation and DPC++ implementation.
-    ev_copy.wait_and_throw();
-    bool valid = check_equal_vector(y_host, y_ref_host);
-
-    ev_release.wait_and_throw();
-    return static_cast<int>(valid);
-}
-
-class SparseTrsvUsmTests : public ::testing::TestWithParam<sycl::device *> {};
-
-/**
- * Helper function to run tests in different configuration.
- *
- * @tparam fpType Complex or scalar, single or double precision type
- * @param dev Device to test
- * @param transpose_val Transpose value for the input matrix
- */
-template <typename fpType>
-void test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_val, int &num_passed,
-                 int &num_skipped) {
-    double density_A_matrix = 0.144;
-    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
-    oneapi::mkl::uplo lower = oneapi::mkl::uplo::lower;
-    oneapi::mkl::diag nonunit = oneapi::mkl::diag::nonunit;
-    int m = 277;
-    bool use_optimize = true;
-
-    // Basic test
-    EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, m, density_A_matrix, index_zero, lower,
-                                            transpose_val, nonunit, use_optimize),
-                               num_passed, num_skipped);
-    // Test index_base 1
-    EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, m, density_A_matrix, oneapi::mkl::index_base::one,
-                                            lower, transpose_val, nonunit, use_optimize),
-                               num_passed, num_skipped);
-    // Test upper triangular matrix
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test<fpType>(dev, m, density_A_matrix, index_zero, oneapi::mkl::uplo::upper, transpose_val,
-                     nonunit, use_optimize),
-        num_passed, num_skipped);
-    // Test unit diagonal matrix
-    EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, m, density_A_matrix, index_zero, lower,
-                                            transpose_val, oneapi::mkl::diag::unit, use_optimize),
-                               num_passed, num_skipped);
-    // Temporarily disable trsv using long indices on GPU
-    if (!dev->is_gpu()) {
-        // Test int64 indices
-        EXPECT_TRUE_OR_FUTURE_SKIP(test<fpType>(dev, 15L, density_A_matrix, index_zero, lower,
-                                                transpose_val, nonunit, use_optimize),
-                                   num_passed, num_skipped);
-    }
-    // Test lower without optimize_trsv
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test<fpType>(dev, m, density_A_matrix, index_zero, lower, transpose_val, nonunit, false),
-        num_passed, num_skipped);
-    // Test upper without optimize_trsv
-    EXPECT_TRUE_OR_FUTURE_SKIP(
-        test<fpType>(dev, m, density_A_matrix, index_zero, oneapi::mkl::uplo::upper, transpose_val,
-                     nonunit, false),
-        num_passed, num_skipped);
-}
-
-TEST_P(SparseTrsvUsmTests, RealSinglePrecision) {
-    using fpType = float;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseTrsvUsmTests, RealDoublePrecision) {
-    using fpType = double;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseTrsvUsmTests, ComplexSinglePrecision) {
-    using fpType = std::complex<float>;
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-TEST_P(SparseTrsvUsmTests, ComplexDoublePrecision) {
-    using fpType = std::complex<double>;
-    CHECK_DOUBLE_ON_DEVICE(GetParam());
-    int num_passed = 0, num_skipped = 0;
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans, num_passed, num_skipped);
-    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans, num_passed, num_skipped);
-    if (num_skipped > 0) {
-        // Mark that some tests were skipped
-        GTEST_SKIP() << "Passed: " << num_passed << ", Skipped: " << num_skipped
-                     << " configurations." << std::endl;
-    }
-}
-
-INSTANTIATE_TEST_SUITE_P(SparseTrsvUsmTestSuite, SparseTrsvUsmTests, testing::ValuesIn(devices),
-                         ::DeviceNamePrint());
-
-} // anonymous namespace
diff --git a/third-party-programs/THIRD-PARTY-PROGRAMS b/third-party-programs/THIRD-PARTY-PROGRAMS
deleted file mode 100644
index fd462fa83..000000000
--- a/third-party-programs/THIRD-PARTY-PROGRAMS
+++ /dev/null
@@ -1,115 +0,0 @@
-Intel® oneAPI Math Kernel Library (oneMKL) interfaces
-
-This file contains the list of third party software (“third party programs”)
-contained in the Intel software and their required notices and/or license terms.
-This third party software, even if included with the distribution of the Intel
-software, may be governed by separate license terms, including without limitation,
-third party license terms, other Intel software license terms, and open source
-software license terms. These separate license terms govern your use of the third
-party programs as set forth in the “third-party-programs-binary.txt” or other similarly-named text file.
-
-
-Third party programs and their corresponding required notices and/or license terms are listed below.
-
---------------------------------------------------------------
-1. rocRAND backend files
-   Copyright (C) 2022 Heidelberg University, Engineering Mathematics and Computing Lab (EMCL) 
-
-   cuRAND backend files
-   cuRAND back-end Copyright (c) 2021, The Regents of the University of
-   California, through Lawrence Berkeley National Laboratory (subject to receipt
-   of any required approvals from the U.S. Dept. of Energy). All rights
-   reserved.
-
-
-* Redistribution and use in source and binary forms, with or without
-* modification, are permitted provided that the following conditions are met:
-*
-* (1) Redistributions of source code must retain the above copyright notice,
-* this list of conditions and the following disclaimer.
-*
-* (2) Redistributions in binary form must reproduce the above copyright
-* notice, this list of conditions and the following disclaimer in the
-* documentation and/or other materials provided with the distribution.
-*
-* (3) Neither the name of the University of California, Lawrence Berkeley
-* National Laboratory, U.S. Dept. of Energy nor the names of its contributors
-* may be used to endorse or promote products derived from this software
-* without specific prior written permission.
-*
-*
-* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-* POSSIBILITY OF SUCH DAMAGE.
-*
-* You are under no obligation whatsoever to provide any bug fixes, patches,
-* or upgrades to the features, functionality or performance of the source
-* code ("Enhancements") to anyone; however, if you choose to make your
-* Enhancements available either publicly, or directly to Lawrence Berkeley
-* National Laboratory, without imposing a separate written license agreement
-* for such Enhancements, then you hereby grant the following license: a
-* non-exclusive, royalty-free perpetual license to install, use, modify,
-* prepare derivative works, incorporate into other computer software,
-* distribute, and sublicense such enhancements or derivative works thereof,
-* in binary and source code form.
-*
-* If you have questions about your rights to use or distribute this software,
-* please contact Berkeley Lab's Intellectual Property Office at
-* IPO@lbl.gov.
-*
-* NOTICE. This Software was developed under funding from the U.S. Department
-* of Energy and the U.S. Government consequently retains certain rights. As
-* such, the U.S. Government has been granted for itself and others acting on
-* its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
-* Software to reproduce, distribute copies to the public, prepare derivative
-* works, and perform publicly and display publicly, and to permit others to do
-* so.
-
---------------------------------------------------------------
-2. Google C++ Testing Framework
-   Copyright 2008, Google Inc.
-   All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-    * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-    * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
---------------------------------------------------------------
-3. Math.NET Numerics
-   Copyright (c) 2002-2022 Math.NET
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

From 72163239ef9774040f26873c28eb747ed0044c46 Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Thu, 8 Aug 2024 09:48:01 -0700
Subject: [PATCH 02/10] workflows: remove rfcs branch from pr.yml

---
 .github/workflows/pr.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 0da2153c6..14f452d54 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -6,6 +6,9 @@ on:
   push:
     branches: develop
   pull_request:
+    branches: 
+      -'*'
+      -'!rfcs'
   workflow_dispatch:
 
 env:

From 88fad5a5431dd2c649cf4b6e050897fd16cdea64 Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Thu, 8 Aug 2024 09:55:16 -0700
Subject: [PATCH 03/10] readme: add RFC label requirement for PR to the RFC
 process

---
 README.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 38bd4e2c6..583d0022c 100644
--- a/README.md
+++ b/README.md
@@ -26,12 +26,14 @@ e.g. link for this README in your fork will be
 ```
 https://github.com/<USERNAME>/oneMKL/blob/rfcs/README.md
 ```
-2. Assign all affected [teams](https://github.com/oneapi-src/oneMKL/blob/develop/README.md#contributing) and individual contributors as reviewers to the PR
-3. Organize offline review or an architecture meeting in order to collect feedback
+2. Assign all affected [teams](https://github.com/oneapi-src/oneMKL/blob/develop/README.md#contributing) and individual
+contributors as reviewers to the PR.
+3. Add `RFC` label to the PR to trigger slack notification in [#onemkl](https://uxlfoundation.slack.com/archives/onemkl) channel.
+4. Organize offline review or an architecture meeting in order to collect feedback.
     * It's recommended to keep all feedback as part of PR review, so it also
 will be documented in one place
-4. If changes affect API defined by [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/) the design document must be reviewed by [UXL Foundation Math SIG](https://github.com/uxlfoundation/foundation/tree/main/math) and contributed to [oneAPI specification](https://github.com/uxlfoundation/oneAPI-spec) and only after it the proposed changes can be implemented in this project.
-5. Merge PR when it has all required approvals
+5. If changes affect API defined by [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/) the design document must be reviewed by [UXL Foundation Math SIG](https://github.com/uxlfoundation/foundation/tree/main/math) and contributed to [oneAPI specification](https://github.com/uxlfoundation/oneAPI-spec) and only after it the proposed changes can be implemented in this project.
+6. Merge PR when it has all required approvals
     * It's recommended to add PR number to the commit message, so it will be easy
 to find the design discussion
     * It's recommended to update the preview document link in the PR to the merged

From 2ab11f999c2e67e38bd14941707f41e8dd3ca6f8 Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Thu, 8 Aug 2024 09:57:58 -0700
Subject: [PATCH 04/10] workflows: remove rfcs branch from documentation.yml

---
 .github/workflows/documentation.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 000d737aa..0b245cbd5 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -8,6 +8,9 @@ on:
     paths:
       - 'docs/**'
   pull_request:
+    branches:
+      - '*'
+      - '!rfcs'
     paths:
       - 'docs/**'
   workflow_dispatch:

From ec125bc1b352ec5d5c8d55a23300ab9ea8046c20 Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Fri, 9 Aug 2024 08:58:23 -0700
Subject: [PATCH 05/10] readme: indenting the links

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 583d0022c..215dd4e78 100644
--- a/README.md
+++ b/README.md
@@ -23,9 +23,9 @@ long lines make it hard to read the document in the raw format
 1. Add new design document as a PR to this repository
     * Please add a link to preview document in the PR description,
 e.g. link for this README in your fork will be
-```
-https://github.com/<USERNAME>/oneMKL/blob/rfcs/README.md
-```
+        ```
+        https://github.com/<USERNAME>/oneMKL/blob/rfcs/README.md
+        ```
 2. Assign all affected [teams](https://github.com/oneapi-src/oneMKL/blob/develop/README.md#contributing) and individual
 contributors as reviewers to the PR.
 3. Add `RFC` label to the PR to trigger slack notification in [#onemkl](https://uxlfoundation.slack.com/archives/onemkl) channel.
@@ -39,7 +39,7 @@ to find the design discussion
     * It's recommended to update the preview document link in the PR to the merged
 one because initial link to the local fork/branch will stop working after local branch removal,
 e.g. link for this README will be 
-```
-https://github.com/oneapi-src/oneMKL/blob/rfcs/README.md
-```
+        ```
+        https://github.com/oneapi-src/oneMKL/blob/rfcs/README.md
+        ```
 

From e514f93938ba64b2eec9fcdb6d8a0def91185bf9 Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Fri, 9 Aug 2024 11:22:57 -0700
Subject: [PATCH 06/10] readme: add specific forums for RFC online review

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 215dd4e78..4a151fa43 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ e.g. link for this README in your fork will be
 2. Assign all affected [teams](https://github.com/oneapi-src/oneMKL/blob/develop/README.md#contributing) and individual
 contributors as reviewers to the PR.
 3. Add `RFC` label to the PR to trigger slack notification in [#onemkl](https://uxlfoundation.slack.com/archives/onemkl) channel.
-4. Organize offline review or an architecture meeting in order to collect feedback.
+4. Organize offline review or/and bring the RFC to [Math SIG forum](https://lists.uxlfoundation.org/g/Math-SIG), [UXL Foundation Open Source Working Group](https://lists.uxlfoundation.org/g/open-source-wg), or any other related forums in order to collect feedback.
     * It's recommended to keep all feedback as part of PR review, so it also
 will be documented in one place
 5. If changes affect API defined by [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/) the design document must be reviewed by [UXL Foundation Math SIG](https://github.com/uxlfoundation/foundation/tree/main/math) and contributed to [oneAPI specification](https://github.com/uxlfoundation/oneAPI-spec) and only after it the proposed changes can be implemented in this project.

From 550e345041508c81b86aad1b655751046adb826c Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Fri, 9 Aug 2024 11:31:28 -0700
Subject: [PATCH 07/10] template: add a note about revision format

---
 rfcs/template.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/rfcs/template.md b/rfcs/template.md
index 12340e6dc..854b86160 100644
--- a/rfcs/template.md
+++ b/rfcs/template.md
@@ -3,12 +3,14 @@
 ### Revision
 
 
-|Date       |Revision| Comments                                                                 |
-|-----------|--------|--------------------------------------------------------------------------|
-|  YYYYMMDD |  1.0   | Initial version                                                          |
-|  YYYYMMDD |  X.Y   |                                                                          |
-
-
+|Date       |Revision | Comments                                                                 |
+|-----------|---------|--------------------------------------------------------------------------|
+|  YYYYMMDD |  1.0    | Initial version                                                          |
+|  YYYYMMDD |  X.Y*   |                                                                          |
+
+\* *where **X** is incremented when a revision breaks backward compatibility with the previous revision
+in numerous significant ways, and **Y** is incremented when a revision extends or improves the design document
+with minimal impact, if any, on backward compatibility.*
 
 ## Motivation
 

From 966e54c765a6a5a9248e6f580affa6b106540194 Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Fri, 9 Aug 2024 16:45:56 -0700
Subject: [PATCH 08/10] workflows: extend branch pattern for documentation

---
 .github/workflows/documentation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 0b245cbd5..1b2e4327c 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -9,7 +9,7 @@ on:
       - 'docs/**'
   pull_request:
     branches:
-      - '*'
+      - '**'
       - '!rfcs'
     paths:
       - 'docs/**'

From 6f33e54102ba9e6981a693668163612d653cd94f Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Fri, 9 Aug 2024 16:46:24 -0700
Subject: [PATCH 09/10] workflow: extend branch pattern for pr

---
 .github/workflows/pr.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 14f452d54..b2512b48b 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -7,7 +7,7 @@ on:
     branches: develop
   pull_request:
     branches: 
-      -'*'
-      -'!rfcs'
+      - '**'
+      - '!rfcs'
   workflow_dispatch:
 

From 1fb1e68a3477bd6a0d64547e139e7164db1f69e0 Mon Sep 17 00:00:00 2001
From: Maria Kraynyuk <maria.kraynyuk@intel.com>
Date: Fri, 9 Aug 2024 16:50:40 -0700
Subject: [PATCH 10/10] readme: align links to math sig forum

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4a151fa43..427c8f90e 100644
--- a/README.md
+++ b/README.md
@@ -29,10 +29,10 @@ e.g. link for this README in your fork will be
 2. Assign all affected [teams](https://github.com/oneapi-src/oneMKL/blob/develop/README.md#contributing) and individual
 contributors as reviewers to the PR.
 3. Add `RFC` label to the PR to trigger slack notification in [#onemkl](https://uxlfoundation.slack.com/archives/onemkl) channel.
-4. Organize offline review or/and bring the RFC to [Math SIG forum](https://lists.uxlfoundation.org/g/Math-SIG), [UXL Foundation Open Source Working Group](https://lists.uxlfoundation.org/g/open-source-wg), or any other related forums in order to collect feedback.
+4. Organize offline review and/or bring the RFC to [UXL Foundation Math SIG forum](https://lists.uxlfoundation.org/g/Math-SIG), [UXL Foundation Open Source Working Group](https://lists.uxlfoundation.org/g/open-source-wg), or any other related forums in order to collect feedback.
     * It's recommended to keep all feedback as part of PR review, so it also
 will be documented in one place
-5. If changes affect API defined by [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/) the design document must be reviewed by [UXL Foundation Math SIG](https://github.com/uxlfoundation/foundation/tree/main/math) and contributed to [oneAPI specification](https://github.com/uxlfoundation/oneAPI-spec) and only after it the proposed changes can be implemented in this project.
+5. If changes affect the API defined by [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/), the design document must be reviewed by [UXL Foundation Math SIG forum](https://lists.uxlfoundation.org/g/Math-SIG) and contributed to [oneAPI specification](https://github.com/uxlfoundation/oneAPI-spec); only after that can the proposed changes be implemented in this project.
 6. Merge PR when it has all required approvals
     * It's recommended to add PR number to the commit message, so it will be easy
 to find the design discussion